Merge "Fixing constant value used to calculate frame pts and duration."
diff --git a/README b/README
index ce9c1c6..6d7d5ec 100644
--- a/README
+++ b/README
@@ -12,22 +12,20 @@
 
     * All x86 targets require the Yasm[1] assembler be installed.
     * All Windows builds require that Cygwin[2] be installed.
-    * Building the documentation requires PHP[3] and Doxygen[4]. If you do not
-      have these packages, you must pass --disable-install-docs to the
-      configure script.
-    * Downloading the data for the unit tests requires curl[5] and sha1sum.
+    * Building the documentation requires Doxygen[3]. If you do not
+      have this package, the install-docs option will be disabled.
+    * Downloading the data for the unit tests requires curl[4] and sha1sum.
       sha1sum is provided via the GNU coreutils, installed by default on
       many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
       available, a compatible version of sha1sum can be built from
-      source[6]. These requirements are optional if not running the unit
+      source[5]. These requirements are optional if not running the unit
       tests.
 
     [1]: http://www.tortall.net/projects/yasm
     [2]: http://www.cygwin.com
-    [3]: http://php.net
-    [4]: http://www.doxygen.org
-    [5]: http://curl.haxx.se
-    [6]: http://www.microbrew.org/tools/md5sha1sum/
+    [3]: http://www.doxygen.org
+    [4]: http://curl.haxx.se
+    [5]: http://www.microbrew.org/tools/md5sha1sum/
 
   2. Out-of-tree builds
   Out of tree builds are a supported method of building the application. For
diff --git a/build/make/Android.mk b/build/make/Android.mk
index 48a0dd7..826ff2f 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -53,12 +53,23 @@
 ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas
 ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL)
 
-# Makefiles created by the libvpx configure process
-# This will need to be fixed to handle x86.
+# Use the makefiles generated by upstream configure to determine which files to
+# build. Also set any architecture-specific flags.
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
   include $(CONFIG_DIR)libs-armv7-android-gcc.mk
-else
+  LOCAL_ARM_MODE := arm
+else ifeq  ($(TARGET_ARCH_ABI),armeabi)
   include $(CONFIG_DIR)libs-armv5te-android-gcc.mk
+  LOCAL_ARM_MODE := arm
+else ifeq  ($(TARGET_ARCH_ABI),arm64-v8a)
+  include $(CONFIG_DIR)libs-armv8-android-gcc.mk
+  LOCAL_ARM_MODE := arm
+else ifeq ($(TARGET_ARCH_ABI),x86)
+  include $(CONFIG_DIR)libs-x86-android-gcc.mk
+else ifeq ($(TARGET_ARCH_ABI),mips)
+  include $(CONFIG_DIR)libs-mips-android-gcc.mk
+else
+  $(error Not a supported TARGET_ARCH_ABI: $(TARGET_ARCH_ABI))
 endif
 
 # Rule that is normally in Makefile created by libvpx
@@ -72,10 +83,13 @@
 # Include the list of files to be built
 include $(LIBVPX_PATH)/libs.mk
 
-# Want arm, not thumb, optimized
-LOCAL_ARM_MODE := arm
+# Optimise the code. May want to revisit this setting in the future.
 LOCAL_CFLAGS := -O3
 
+# For x86, include the source code in the search path so it will find files
+# like x86inc.asm and x86_abi_support.asm
+LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)
+
 # -----------------------------------------------------------------------------
 # Template  : asm_offsets_template
 # Arguments : 1: assembly offsets file to be created
@@ -109,13 +123,13 @@
 	@grep $(OFFSET_PATTERN) $$< | tr -d '\#' | $(CONFIG_DIR)$(ASM_CONVERSION) > $$@
 endef
 
-# Use ads2gas script to convert from RVCT format to GAS format.  This passes
+# Use ads2gas script to convert from RVCT format to GAS format.  This
 #  puts the processed file under $(ASM_CNV_PATH).  Local clean rule
 #  to handle removing these
 ifeq ($(CONFIG_VP8_ENCODER), yes)
   ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm
 endif
-ifeq ($(HAVE_NEON), yes)
+ifeq ($(HAVE_NEON_ASM), yes)
   ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vpx_scale_asm_offsets.asm
 endif
 
@@ -142,22 +156,34 @@
 LOCAL_CODEC_SRCS_C = $(filter-out vpx_config.c %_neon.c, $(CODEC_SRCS_C))
 
 LOCAL_SRC_FILES += $(foreach file, $(LOCAL_CODEC_SRCS_C), libvpx/$(file))
-LOCAL_SRC_FILES += $(foreach file, $(LOCAL_NEON_SRCS_C), libvpx/$(file).neon)
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+  LOCAL_SRC_FILES += $(foreach file, $(LOCAL_NEON_SRCS_C), libvpx/$(file).neon)
+else # If there are neon sources then we are building for arm64 and do not need to specify .neon
+  LOCAL_SRC_FILES += $(foreach file, $(LOCAL_NEON_SRCS_C), libvpx/$(file))
+endif
 
 # Pull out assembly files, splitting NEON from the rest.  This is
 # done to specify that the NEON assembly files use NEON assembler flags.
-CODEC_SRCS_ASM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
-CODEC_SRCS_ASM = $(foreach v, \
-                 $(CODEC_SRCS_ASM_ALL), \
-                 $(if $(findstring neon,$(v)),,$(v)))
+# x86 assembly matches %.asm, arm matches %.asm.s
+
+# x86:
+
+CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE))
+LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file))
+
+# arm:
+CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
+CODEC_SRCS_ASM_ARM = $(foreach v, \
+                     $(CODEC_SRCS_ASM_ARM_ALL), \
+                     $(if $(findstring neon,$(v)),,$(v)))
 CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \
                          $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
-                         $(CODEC_SRCS_ASM))
+                         $(CODEC_SRCS_ASM_ARM))
 LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)
 
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
   CODEC_SRCS_ASM_NEON = $(foreach v, \
-                        $(CODEC_SRCS_ASM_ALL),\
+                        $(CODEC_SRCS_ASM_ARM_ALL),\
                         $(if $(findstring neon,$(v)),$(v),))
   CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \
                                 $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
@@ -189,6 +215,10 @@
 endif
 $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_scale_rtcd.h
 
+ifeq ($(TARGET_ARCH_ABI),x86)
+$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_config.asm
+endif
+
 .PHONY: clean
 clean:
 	@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
diff --git a/build/make/Makefile b/build/make/Makefile
index 03dacce..c4d53f1 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -411,6 +411,7 @@
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_proj.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_vcxproj.sh
+    DIST-SRCS-$(CONFIG_MSVS)  += build/make/msvs_common.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/obj_int_extract.bat
     DIST-SRCS-$(CONFIG_MSVS)  += build/arm-msvs/obj_int_extract.bat
     DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
diff --git a/build/make/configure.sh b/build/make/configure.sh
index a65d395..a2a64c2 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -792,8 +792,12 @@
     arm*)
         # on arm, isa versions are supersets
         case ${tgt_isa} in
+        armv8)
+            soft_enable neon
+            ;;
         armv7)
             soft_enable neon
+            soft_enable neon_asm
             soft_enable media
             soft_enable edsp
             soft_enable fast_unaligned
@@ -859,6 +863,13 @@
             msvs_arch_dir=arm-msvs
             disable_feature multithread
             disable_feature unit_tests
+            vs_version=${tgt_cc##vs}
+            if [ $vs_version -ge 12 ]; then
+                # MSVC 2013 doesn't allow doing plain .exe projects for ARM,
+                # only "AppContainerApplication" which requires an AppxManifest.
+                # Therefore disable the examples, just build the library.
+                disable_feature examples
+            fi
             ;;
         rvct)
             CC=armcc
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index d0cbf3e..4e803b8 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -9,17 +9,11 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
-
 self=$0
 self_basename=${self##*/}
 self_dirname=$(dirname "$0")
-EOL=$'\n'
-if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \
-   && cygpath --help >/dev/null 2>&1; then
-    FIXPATH='cygpath -m'
-else
-    FIXPATH='echo'
-fi
+
+. "$self_dirname/msvs_common.sh"|| exit 127
 
 show_help() {
     cat <<EOF
@@ -49,86 +43,6 @@
     exit 1
 }
 
-die() {
-    echo "${self_basename}: $@" >&2
-    exit 1
-}
-
-die_unknown(){
-    echo "Unknown option \"$1\"." >&2
-    echo "See ${self_basename} --help for available options." >&2
-    exit 1
-}
-
-fix_path() {
-    $FIXPATH "$1"
-}
-
-generate_uuid() {
-    local hex="0123456789ABCDEF"
-    local i
-    local uuid=""
-    local j
-    #93995380-89BD-4b04-88EB-625FBE52EBFB
-    for ((i=0; i<32; i++)); do
-        (( j = $RANDOM % 16 ))
-        uuid="${uuid}${hex:$j:1}"
-    done
-    echo "${uuid:0:8}-${uuid:8:4}-${uuid:12:4}-${uuid:16:4}-${uuid:20:12}"
-}
-
-indent1="    "
-indent=""
-indent_push() {
-    indent="${indent}${indent1}"
-}
-indent_pop() {
-    indent="${indent%${indent1}}"
-}
-
-tag_attributes() {
-    for opt in "$@"; do
-        optval="${opt#*=}"
-        [ -n "${optval}" ] ||
-            die "Missing attribute value in '$opt' while generating $tag tag"
-        echo "${indent}${opt%%=*}=\"${optval}\""
-    done
-}
-
-open_tag() {
-    local tag=$1
-    shift
-    if [ $# -ne 0 ]; then
-        echo "${indent}<${tag}"
-        indent_push
-        tag_attributes "$@"
-        echo "${indent}>"
-    else
-        echo "${indent}<${tag}>"
-        indent_push
-    fi
-}
-
-close_tag() {
-    local tag=$1
-    indent_pop
-    echo "${indent}</${tag}>"
-}
-
-tag() {
-    local tag=$1
-    shift
-    if [ $# -ne 0 ]; then
-        echo "${indent}<${tag}"
-        indent_push
-        tag_attributes "$@"
-        indent_pop
-        echo "${indent}/>"
-    else
-        echo "${indent}<${tag}/>"
-    fi
-}
-
 generate_filter() {
     local var=$1
     local name=$2
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index a64e129..8529eed 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -9,17 +9,11 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
-
 self=$0
 self_basename=${self##*/}
 self_dirname=$(dirname "$0")
-EOL=$'\n'
-if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \
-   && cygpath --help >/dev/null 2>&1; then
-    FIXPATH='cygpath -m'
-else
-    FIXPATH='echo'
-fi
+
+. "$self_dirname/msvs_common.sh"|| exit 127
 
 show_help() {
     cat <<EOF
@@ -50,86 +44,6 @@
     exit 1
 }
 
-die() {
-    echo "${self_basename}: $@" >&2
-    exit 1
-}
-
-die_unknown(){
-    echo "Unknown option \"$1\"." >&2
-    echo "See ${self_basename} --help for available options." >&2
-    exit 1
-}
-
-fix_path() {
-    $FIXPATH "$1"
-}
-
-generate_uuid() {
-    local hex="0123456789ABCDEF"
-    local i
-    local uuid=""
-    local j
-    #93995380-89BD-4b04-88EB-625FBE52EBFB
-    for ((i=0; i<32; i++)); do
-        (( j = $RANDOM % 16 ))
-        uuid="${uuid}${hex:$j:1}"
-    done
-    echo "${uuid:0:8}-${uuid:8:4}-${uuid:12:4}-${uuid:16:4}-${uuid:20:12}"
-}
-
-indent1="    "
-indent=""
-indent_push() {
-    indent="${indent}${indent1}"
-}
-indent_pop() {
-    indent="${indent%${indent1}}"
-}
-
-tag_attributes() {
-    for opt in "$@"; do
-        optval="${opt#*=}"
-        [ -n "${optval}" ] ||
-            die "Missing attribute value in '$opt' while generating $tag tag"
-        echo "${indent}${opt%%=*}=\"${optval}\""
-    done
-}
-
-open_tag() {
-    local tag=$1
-    shift
-    if [ $# -ne 0 ]; then
-        echo "${indent}<${tag}"
-        indent_push
-        tag_attributes "$@"
-        echo "${indent}>"
-    else
-        echo "${indent}<${tag}>"
-        indent_push
-    fi
-}
-
-close_tag() {
-    local tag=$1
-    indent_pop
-    echo "${indent}</${tag}>"
-}
-
-tag() {
-    local tag=$1
-    shift
-    if [ $# -ne 0 ]; then
-        echo "${indent}<${tag}"
-        indent_push
-        tag_attributes "$@"
-        indent_pop
-        echo "${indent}/>"
-    else
-        echo "${indent}<${tag}/>"
-    fi
-}
-
 tag_content() {
     local tag=$1
     local content=$2
@@ -378,6 +292,18 @@
         tag_content ProjectGuid "{${guid}}"
         tag_content RootNamespace ${name}
         tag_content Keyword ManagedCProj
+        if [ $vs_ver -ge 12 ] && [ "${platforms[0]}" = "ARM" ]; then
+            tag_content AppContainerApplication true
+            # The application type can be one of "Windows Store",
+            # "Windows Phone" or "Windows Phone Silverlight". The
+            # actual value doesn't matter from the libvpx point of view,
+            # since a static library built for one works on the others.
+            # The PlatformToolset field needs to be set in sync with this;
+            # for Windows Store and Windows Phone Silverlight it should be
+            # v120 while it should be v120_wp81 if the type is Windows Phone.
+            tag_content ApplicationType "Windows Store"
+            tag_content ApplicationTypeRevision 8.1
+        fi
     close_tag PropertyGroup
 
     tag Import \
@@ -410,18 +336,10 @@
                 fi
             fi
             if [ "$vs_ver" = "12" ]; then
-                if [ "$plat" = "ARM" ]; then
-                    # Setting the wp80 toolchain automatically sets the
-                    # WINAPI_FAMILY define, which is required for building
-                    # code for arm with the windows headers. Alternatively,
-                    # one could add AppContainerApplication=true in the Globals
-                    # section and add PrecompiledHeader=NotUsing and
-                    # CompileAsWinRT=false in ClCompile and SubSystem=Console
-                    # in Link.
-                    tag_content PlatformToolset v120_wp80
-                else
-                    tag_content PlatformToolset v120
-                fi
+                # Setting a PlatformToolset indicating windows phone isn't
+                # enough to build code for arm with MSVC 2013, one strictly
+                # has to enable AppContainerApplication as well.
+                tag_content PlatformToolset v120
             fi
             tag_content CharacterSet Unicode
             if [ "$config" = "Release" ]; then
@@ -513,15 +431,25 @@
             if ${werror:-false}; then
                 tag_content TreatWarningAsError true
             fi
+            if [ $vs_ver -ge 11 ]; then
+                # We need to override the defaults for these settings
+                # if AppContainerApplication is set.
+                tag_content CompileAsWinRT false
+                tag_content PrecompiledHeader NotUsing
+                tag_content SDLCheck false
+            fi
             close_tag ClCompile
             case "$proj_kind" in
             exe)
                 open_tag Link
                 if [ "$name" != "obj_int_extract" ]; then
-                    tag_content AdditionalDependencies "$curlibs"
+                    tag_content AdditionalDependencies "$curlibs;%(AdditionalDependencies)"
                     tag_content AdditionalLibraryDirectories "$libdirs;%(AdditionalLibraryDirectories)"
                 fi
                 tag_content GenerateDebugInformation true
+                # Console is the default normally, but if
+                # AppContainerApplication is set, we need to override it.
+                tag_content SubSystem Console
                 close_tag Link
                 ;;
             dll)
diff --git a/build/make/msvs_common.sh b/build/make/msvs_common.sh
new file mode 100644
index 0000000..eb2eb7b
--- /dev/null
+++ b/build/make/msvs_common.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \
+   && cygpath --help >/dev/null 2>&1; then
+    FIXPATH='cygpath -m'
+else
+    FIXPATH='echo'
+fi
+
+die() {
+    echo "${self_basename}: $@" >&2
+    exit 1
+}
+
+die_unknown(){
+    echo "Unknown option \"$1\"." >&2
+    echo "See ${self_basename} --help for available options." >&2
+    exit 1
+}
+
+fix_path() {
+    $FIXPATH "$1"
+}
+
+generate_uuid() {
+    local hex="0123456789ABCDEF"
+    local i
+    local uuid=""
+    local j
+    #93995380-89BD-4b04-88EB-625FBE52EBFB
+    for ((i=0; i<32; i++)); do
+        (( j = $RANDOM % 16 ))
+        uuid="${uuid}${hex:$j:1}"
+    done
+    echo "${uuid:0:8}-${uuid:8:4}-${uuid:12:4}-${uuid:16:4}-${uuid:20:12}"
+}
+
+indent1="    "
+indent=""
+indent_push() {
+    indent="${indent}${indent1}"
+}
+indent_pop() {
+    indent="${indent%${indent1}}"
+}
+
+tag_attributes() {
+    for opt in "$@"; do
+        optval="${opt#*=}"
+        [ -n "${optval}" ] ||
+            die "Missing attribute value in '$opt' while generating $tag tag"
+        echo "${indent}${opt%%=*}=\"${optval}\""
+    done
+}
+
+open_tag() {
+    local tag=$1
+    shift
+    if [ $# -ne 0 ]; then
+        echo "${indent}<${tag}"
+        indent_push
+        tag_attributes "$@"
+        echo "${indent}>"
+    else
+        echo "${indent}<${tag}>"
+        indent_push
+    fi
+}
+
+close_tag() {
+    local tag=$1
+    indent_pop
+    echo "${indent}</${tag}>"
+}
+
+tag() {
+    local tag=$1
+    shift
+    if [ $# -ne 0 ]; then
+        echo "${indent}<${tag}"
+        indent_push
+        tag_attributes "$@"
+        indent_pop
+        echo "${indent}/>"
+    else
+        echo "${indent}<${tag}/>"
+    fi
+}
+
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index 18ee80d..f5f59b1 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -272,6 +272,9 @@
   # Assign the helper variable for each enabled extension
   foreach my $opt (@ALL_ARCHS) {
     my $opt_uc = uc $opt;
+    # Enable neon assembly based on HAVE_NEON logic instead of adding new
+    # HAVE_NEON_ASM logic
+    if ($opt eq 'neon_asm') { $opt_uc = 'NEON' }
     eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
   }
 
@@ -381,7 +384,10 @@
   @ALL_ARCHS = filter(qw/edsp media/);
   arm;
 } elsif ($opts{arch} eq 'armv7') {
-  @ALL_ARCHS = filter(qw/edsp media neon/);
+  @ALL_ARCHS = filter(qw/edsp media neon_asm neon/);
+  arm;
+} elsif ($opts{arch} eq 'armv8') {
+  @ALL_ARCHS = filter(qw/neon/);
   arm;
 } else {
   unoptimized;
diff --git a/configure b/configure
index 29683cd..bd95056 100755
--- a/configure
+++ b/configure
@@ -189,7 +189,7 @@
 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
 # case.
-enabled doxygen && php -v >/dev/null 2>&1 && enable_feature install_docs
+enabled doxygen && enable_feature install_docs
 enable_feature install_bins
 enable_feature install_libs
 
@@ -239,6 +239,7 @@
     edsp
     media
     neon
+    neon_asm
 
     mips32
     dspr2
@@ -709,6 +710,7 @@
             soft_enable webm_io
         ;;
         *-android-*)
+            soft_enable webm_io
             # GTestLog must be modified to use Android logging utilities.
         ;;
         *-darwin-*)
diff --git a/docs.mk b/docs.mk
index 797b466..889d182 100644
--- a/docs.mk
+++ b/docs.mk
@@ -23,12 +23,6 @@
 # Other doxy files sourced in Markdown
 TXT_DOX = $(call enabled,TXT_DOX)
 
-%.dox: %.txt
-	@echo "    [DOXY] $@"
-	@$(SRC_PATH_BARE)/examples/gen_example_doxy.php \
-             $(@:.dox=)  "$($@.DESC)" > $@ < $<
-
-
 EXAMPLE_PATH += $(SRC_PATH_BARE) #for CHANGELOG, README, etc
 EXAMPLE_PATH += $(SRC_PATH_BARE)/examples
 
diff --git a/examples.mk b/examples.mk
index 91b9801..c36159f 100644
--- a/examples.mk
+++ b/examples.mk
@@ -25,6 +25,11 @@
                       third_party/libwebm/mkvwriter.hpp \
                       third_party/libwebm/webmids.hpp
 
+LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser.cpp \
+                      third_party/libwebm/mkvreader.cpp \
+                      third_party/libwebm/mkvparser.hpp \
+                      third_party/libwebm/mkvreader.hpp
+
 # List of examples to build. UTILS are tools meant for distribution
 # while EXAMPLES demonstrate specific portions of the API.
 UTILS-$(CONFIG_DECODERS)    += vpxdec.c
@@ -39,14 +44,8 @@
 vpxdec.SRCS                 += y4menc.c y4menc.h
 vpxdec.SRCS                 += $(LIBYUV_SRCS)
 ifeq ($(CONFIG_WEBM_IO),yes)
-  vpxdec.SRCS                 += third_party/nestegg/halloc/halloc.h
-  vpxdec.SRCS                 += third_party/nestegg/halloc/src/align.h
-  vpxdec.SRCS                 += third_party/nestegg/halloc/src/halloc.c
-  vpxdec.SRCS                 += third_party/nestegg/halloc/src/hlist.h
-  vpxdec.SRCS                 += third_party/nestegg/halloc/src/macros.h
-  vpxdec.SRCS                 += third_party/nestegg/include/nestegg/nestegg.h
-  vpxdec.SRCS                 += third_party/nestegg/src/nestegg.c
-  vpxdec.SRCS                 += webmdec.c webmdec.h
+  vpxdec.SRCS                 += $(LIBWEBM_PARSER_SRCS)
+  vpxdec.SRCS                 += webmdec.cc webmdec.h
 endif
 vpxdec.GUID                  = BA5FE66F-38DD-E034-F542-B1578C5FB950
 vpxdec.DESCRIPTION           = Full featured decoder
@@ -82,11 +81,6 @@
 EXAMPLES-$(CONFIG_VP9_ENCODER)    += resize_util.c
 endif
 
-# XMA example disabled for now, not used in VP8
-#UTILS-$(CONFIG_DECODERS)    += example_xma.c
-#example_xma.GUID             = A955FC4A-73F1-44F7-135E-30D84D32F022
-#example_xma.DESCRIPTION      = External Memory Allocation mode usage
-
 EXAMPLES-$(CONFIG_ENCODERS)         += vpx_temporal_scalable_patterns.c
 vpx_temporal_scalable_patterns.SRCS += ivfenc.c ivfenc.h
 vpx_temporal_scalable_patterns.SRCS += tools_common.c tools_common.h
@@ -147,11 +141,6 @@
 endif
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
-ifeq ($(CONFIG_VP8_DECODER),yes)
-EXAMPLES-$(CONFIG_ERROR_CONCEALMENT)    += decode_with_partial_drops.c
-endif
-decode_with_partial_drops.GUID           = 61C2D026-5754-46AC-916F-1343ECC5537E
-decode_with_partial_drops.DESCRIPTION    = Drops parts of frames while decoding
 EXAMPLES-$(CONFIG_ENCODERS)        += set_maps.c
 set_maps.SRCS                      += ivfenc.h ivfenc.c
 set_maps.SRCS                      += tools_common.h tools_common.c
diff --git a/examples/decode_with_partial_drops.c b/examples/decode_with_partial_drops.c
deleted file mode 100644
index d7132de..0000000
--- a/examples/decode_with_partial_drops.c
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-// Decode With Partial Drops Example
-// =========================
-//
-// This is an example utility which drops a series of frames (or parts of
-// frames), as specified on the command line. This is useful for observing the
-// error recovery features of the codec.
-//
-// Usage
-// -----
-// This example adds a single argument to the `simple_decoder` example,
-// which specifies the range or pattern of frames to drop. The parameter is
-// parsed as follows.
-//
-// Dropping A Range Of Frames
-// --------------------------
-// To drop a range of frames, specify the starting frame and the ending
-// frame to drop, separated by a dash. The following command will drop
-// frames 5 through 10 (base 1).
-//
-//  $ ./decode_with_partial_drops in.ivf out.i420 5-10
-//
-//
-// Dropping A Pattern Of Frames
-// ----------------------------
-// To drop a pattern of frames, specify the number of frames to drop and
-// the number of frames after which to repeat the pattern, separated by
-// a forward-slash. The following command will drop 3 of 7 frames.
-// Specifically, it will decode 4 frames, then drop 3 frames, and then
-// repeat.
-//
-//  $ ./decode_with_partial_drops in.ivf out.i420 3/7
-//
-// Dropping Random Parts Of Frames
-// -------------------------------
-// A third argument tuple is available to split the frame into 1500 bytes pieces
-// and randomly drop pieces rather than frames. The frame will be split at
-// partition boundaries where possible. The following example will seed the RNG
-// with the seed 123 and drop approximately 5% of the pieces. Pieces which
-// are depending on an already dropped piece will also be dropped.
-//
-//  $ ./decode_with_partial_drops in.ivf out.i420 5,123
-//
-// Extra Variables
-// ---------------
-// This example maintains the pattern passed on the command line in the
-// `n`, `m`, and `is_range` variables:
-//
-// Making The Drop Decision
-// ------------------------
-// The example decides whether to drop the frame based on the current
-// frame number, immediately before decoding the frame.
-
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "./vpx_config.h"
-#include "vpx/vp8dx.h"
-#include "vpx/vpx_decoder.h"
-#define interface (vpx_codec_vp8_dx())
-#include <time.h>
-
-
-#define IVF_FILE_HDR_SZ  (32)
-#define IVF_FRAME_HDR_SZ (12)
-
-static unsigned int mem_get_le32(const unsigned char *mem) {
-    return (mem[3] << 24)|(mem[2] << 16)|(mem[1] << 8)|(mem[0]);
-}
-
-static void die(const char *fmt, ...) {
-    va_list ap;
-
-    va_start(ap, fmt);
-    vprintf(fmt, ap);
-    if(fmt[strlen(fmt)-1] != '\n')
-        printf("\n");
-    exit(EXIT_FAILURE);
-}
-
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
-    const char *detail = vpx_codec_error_detail(ctx);
-
-    printf("%s: %s\n", s, vpx_codec_error(ctx));
-    if(detail)
-        printf("    %s\n",detail);
-    exit(EXIT_FAILURE);
-}
-
-struct parsed_header
-{
-    char key_frame;
-    int version;
-    char show_frame;
-    int first_part_size;
-};
-
-int next_packet(struct parsed_header* hdr, int pos, int length, int mtu)
-{
-    int size = 0;
-    int remaining = length - pos;
-    /* Uncompressed part is 3 bytes for P frames and 10 bytes for I frames */
-    int uncomp_part_size = (hdr->key_frame ? 10 : 3);
-    /* number of bytes yet to send from header and the first partition */
-    int remainFirst = uncomp_part_size + hdr->first_part_size - pos;
-    if (remainFirst > 0)
-    {
-        if (remainFirst <= mtu)
-        {
-            size = remainFirst;
-        }
-        else
-        {
-            size = mtu;
-        }
-
-        return size;
-    }
-
-    /* second partition; just slot it up according to MTU */
-    if (remaining <= mtu)
-    {
-        size = remaining;
-        return size;
-    }
-    return mtu;
-}
-
-void throw_packets(unsigned char* frame, int* size, int loss_rate,
-                   int* thrown, int* kept)
-{
-    unsigned char loss_frame[256*1024];
-    int pkg_size = 1;
-    int pos = 0;
-    int loss_pos = 0;
-    struct parsed_header hdr;
-    unsigned int tmp;
-    int mtu = 1500;
-
-    if (*size < 3)
-    {
-        return;
-    }
-    putc('|', stdout);
-    /* parse uncompressed 3 bytes */
-    tmp = (frame[2] << 16) | (frame[1] << 8) | frame[0];
-    hdr.key_frame = !(tmp & 0x1); /* inverse logic */
-    hdr.version = (tmp >> 1) & 0x7;
-    hdr.show_frame = (tmp >> 4) & 0x1;
-    hdr.first_part_size = (tmp >> 5) & 0x7FFFF;
-
-    /* don't drop key frames */
-    if (hdr.key_frame)
-    {
-        int i;
-        *kept = *size/mtu + ((*size % mtu > 0) ? 1 : 0); /* approximate */
-        for (i=0; i < *kept; i++)
-            putc('.', stdout);
-        return;
-    }
-
-    while ((pkg_size = next_packet(&hdr, pos, *size, mtu)) > 0)
-    {
-        int loss_event = ((rand() + 1.0)/(RAND_MAX + 1.0) < loss_rate/100.0);
-        if (*thrown == 0 && !loss_event)
-        {
-            memcpy(loss_frame + loss_pos, frame + pos, pkg_size);
-            loss_pos += pkg_size;
-            (*kept)++;
-            putc('.', stdout);
-        }
-        else
-        {
-            (*thrown)++;
-            putc('X', stdout);
-        }
-        pos += pkg_size;
-    }
-    memcpy(frame, loss_frame, loss_pos);
-    memset(frame + loss_pos, 0, *size - loss_pos);
-    *size = loss_pos;
-}
-
-int main(int argc, char **argv) {
-    FILE            *infile, *outfile;
-    vpx_codec_ctx_t  codec;
-    int              flags = 0, frame_cnt = 0;
-    unsigned char    file_hdr[IVF_FILE_HDR_SZ];
-    unsigned char    frame_hdr[IVF_FRAME_HDR_SZ];
-    unsigned char    frame[256*1024];
-    vpx_codec_err_t  res;
-    int              n, m, mode;
-    unsigned int     seed;
-    int              thrown=0, kept=0;
-    int              thrown_frame=0, kept_frame=0;
-    vpx_codec_dec_cfg_t  dec_cfg = {0};
-
-    (void)res;
-    /* Open files */
-    if(argc < 4 || argc > 6)
-        die("Usage: %s <infile> <outfile> [-t <num threads>] <N-M|N/M|L,S>\n",
-            argv[0]);
-    {
-        char *nptr;
-        int arg_num = 3;
-        if (argc == 6 && strncmp(argv[arg_num++], "-t", 2) == 0)
-            dec_cfg.threads = strtol(argv[arg_num++], NULL, 0);
-        n = strtol(argv[arg_num], &nptr, 0);
-        mode = (*nptr == '\0' || *nptr == ',') ? 2 : (*nptr == '-') ? 1 : 0;
-
-        m = strtol(nptr+1, NULL, 0);
-        if((!n && !m) || (*nptr != '-' && *nptr != '/' &&
-            *nptr != '\0' && *nptr != ','))
-            die("Couldn't parse pattern %s\n", argv[3]);
-    }
-    seed = (m > 0) ? m : (unsigned int)time(NULL);
-    srand(seed);thrown_frame = 0;
-    printf("Seed: %u\n", seed);
-    printf("Threads: %d\n", dec_cfg.threads);
-    if(!(infile = fopen(argv[1], "rb")))
-        die("Failed to open %s for reading", argv[1]);
-    if(!(outfile = fopen(argv[2], "wb")))
-        die("Failed to open %s for writing", argv[2]);
-
-    /* Read file header */
-    if(!(fread(file_hdr, 1, IVF_FILE_HDR_SZ, infile) == IVF_FILE_HDR_SZ
-         && file_hdr[0]=='D' && file_hdr[1]=='K' && file_hdr[2]=='I'
-         && file_hdr[3]=='F'))
-        die("%s is not an IVF file.", argv[1]);
-
-    printf("Using %s\n",vpx_codec_iface_name(interface));
-    /* Initialize codec */
-    flags = VPX_CODEC_USE_ERROR_CONCEALMENT;
-    res = vpx_codec_dec_init(&codec, interface, &dec_cfg, flags);
-    if(res)
-        die_codec(&codec, "Failed to initialize decoder");
-
-
-    /* Read each frame */
-    while(fread(frame_hdr, 1, IVF_FRAME_HDR_SZ, infile) == IVF_FRAME_HDR_SZ) {
-        int               frame_sz = mem_get_le32(frame_hdr);
-        vpx_codec_iter_t  iter = NULL;
-        vpx_image_t      *img;
-
-
-        frame_cnt++;
-        if(frame_sz > sizeof(frame))
-            die("Frame %d data too big for example code buffer", frame_sz);
-        if(fread(frame, 1, frame_sz, infile) != frame_sz)
-            die("Frame %d failed to read complete frame", frame_cnt);
-
-        /* Decide whether to throw parts of the frame or the whole frame
-           depending on the drop mode */
-        thrown_frame = 0;
-        kept_frame = 0;
-        switch (mode)
-        {
-        case 0:
-            if (m - (frame_cnt-1)%m <= n)
-            {
-                frame_sz = 0;
-            }
-            break;
-        case 1:
-            if (frame_cnt >= n && frame_cnt <= m)
-            {
-                frame_sz = 0;
-            }
-            break;
-        case 2:
-            throw_packets(frame, &frame_sz, n, &thrown_frame, &kept_frame);
-            break;
-        default: break;
-        }
-        if (mode < 2)
-        {
-            if (frame_sz == 0)
-            {
-                putc('X', stdout);
-                thrown_frame++;
-            }
-            else
-            {
-                putc('.', stdout);
-                kept_frame++;
-            }
-        }
-        thrown += thrown_frame;
-        kept += kept_frame;
-        fflush(stdout);
-        /* Decode the frame */
-        if(vpx_codec_decode(&codec, frame, frame_sz, NULL, 0))
-            die_codec(&codec, "Failed to decode frame");
-
-        /* Write decoded data to disk */
-        while((img = vpx_codec_get_frame(&codec, &iter))) {
-            unsigned int plane, y;
-
-            for(plane=0; plane < 3; plane++) {
-                unsigned char *buf =img->planes[plane];
-            
-                for(y=0; y < (plane ? (img->d_h + 1) >> 1 : img->d_h); y++) {
-                    (void) fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w),
-                                  outfile);
-                    buf += img->stride[plane];
-                }
-            }
-        }
-    }
-    printf("Processed %d frames.\n",frame_cnt);
-    if(vpx_codec_destroy(&codec))
-        die_codec(&codec, "Failed to destroy codec");
-
-    fclose(outfile);
-    fclose(infile);
-    return EXIT_SUCCESS;
-}
diff --git a/examples/example_xma.c b/examples/example_xma.c
deleted file mode 100644
index c960c28..0000000
--- a/examples/example_xma.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This is a simple program showing how to initialize the decoder in XMA mode */
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx_config.h"
-#include "vpx/vpx_decoder.h"
-#include "vpx/vpx_integer.h"
-#if CONFIG_VP9_DECODER
-#include "vpx/vp8dx.h"
-#endif
-
-static char *exec_name;
-static int   verbose = 0;
-
-static const struct {
-  const char *name;
-  vpx_codec_iface_t *iface;
-} ifaces[] = {
-#if CONFIG_VP9_DECODER
-  {"vp9",  &vpx_codec_vp8_dx_algo},
-#endif
-};
-
-static void usage_exit(void) {
-  int i;
-
-  printf("Usage: %s <options>\n\n"
-         "Options:\n"
-         "\t--codec <name>\tCodec to use (default=%s)\n"
-         "\t-h <height>\tHeight of the simulated video frame, in pixels\n"
-         "\t-w <width> \tWidth of the simulated video frame, in pixels\n"
-         "\t-v         \tVerbose mode (show individual segment sizes)\n"
-         "\t--help     \tShow this message\n"
-         "\n"
-         "Included decoders:\n"
-         "\n",
-         exec_name,
-         ifaces[0].name);
-
-  for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
-    printf("    %-6s - %s\n",
-           ifaces[i].name,
-           vpx_codec_iface_name(ifaces[i].iface));
-
-  exit(EXIT_FAILURE);
-}
-
-static void usage_error(const char *fmt, ...) {
-  va_list ap;
-  va_start(ap, fmt);
-  vprintf(fmt, ap);
-  printf("\n");
-  usage_exit();
-}
-
-void my_mem_dtor(vpx_codec_mmap_t *mmap) {
-  if (verbose)
-    printf("freeing segment %d\n", mmap->id);
-
-  free(mmap->priv);
-}
-
-int main(int argc, char **argv) {
-  vpx_codec_ctx_t           decoder;
-  vpx_codec_iface_t        *iface = ifaces[0].iface;
-  vpx_codec_iter_t          iter;
-  vpx_codec_dec_cfg_t       cfg;
-  vpx_codec_err_t           res = VPX_CODEC_OK;
-  unsigned int            alloc_sz = 0;
-  unsigned int            w = 352;
-  unsigned int            h = 288;
-  int                     i;
-
-  exec_name = argv[0];
-
-  for (i = 1; i < argc; i++) {
-    if (!strcmp(argv[i], "--codec")) {
-      if (i + 1 < argc) {
-        int j, k = -1;
-
-        i++;
-
-        for (j = 0; j < sizeof(ifaces) / sizeof(ifaces[0]); j++)
-          if (!strcmp(ifaces[j].name, argv[i]))
-            k = j;
-
-        if (k >= 0)
-          iface = ifaces[k].iface;
-        else
-          usage_error("Error: Unrecognized argument (%s) to --codec\n",
-                      argv[i]);
-      } else
-        usage_error("Error: Option --codec requires argument.\n");
-    } else if (!strcmp(argv[i], "-v"))
-      verbose = 1;
-    else if (!strcmp(argv[i], "-h"))
-      if (i + 1 < argc) {
-        h = atoi(argv[++i]);
-      } else
-        usage_error("Error: Option -h requires argument.\n");
-    else if (!strcmp(argv[i], "-w"))
-      if (i + 1 < argc) {
-        w = atoi(argv[++i]);
-      } else
-        usage_error("Error: Option -w requires argument.\n");
-    else if (!strcmp(argv[i], "--help"))
-      usage_exit();
-    else
-      usage_error("Error: Unrecognized option %s\n\n", argv[i]);
-  }
-
-  if (argc == 1)
-    printf("Using built-in defaults. For options, rerun with --help\n\n");
-
-  /* XMA mode is not supported on all decoders! */
-  if (!(vpx_codec_get_caps(iface) & VPX_CODEC_CAP_XMA)) {
-    printf("%s does not support XMA mode!\n", vpx_codec_iface_name(iface));
-    return EXIT_FAILURE;
-  }
-
-  /* The codec knows how much memory to allocate based on the size of the
-   * encoded frames. This data can be parsed from the bitstream with
-   * vpx_codec_peek_stream_info() if a bitstream is available. Otherwise,
-   * a fixed size can be used that will be the upper limit on the frame
-   * size the decoder can decode.
-   */
-  cfg.w = w;
-  cfg.h = h;
-
-  /* Initialize the decoder in XMA mode. */
-  if (vpx_codec_dec_init(&decoder, iface, &cfg, VPX_CODEC_USE_XMA)) {
-    printf("Failed to initialize decoder in XMA mode: %s\n",
-           vpx_codec_error(&decoder));
-    return EXIT_FAILURE;
-  }
-
-  /* Iterate through the list of memory maps, allocating them with the
-   * requested alignment.
-   */
-  iter = NULL;
-
-  do {
-    vpx_codec_mmap_t  mmap;
-    unsigned int    align;
-
-    res = vpx_codec_get_mem_map(&decoder, &mmap, &iter);
-    align = mmap.align ? mmap.align - 1 : 0;
-
-    if (!res) {
-      if (verbose)
-        printf("Allocating segment %u, size %lu, align %u %s\n",
-               mmap.id, mmap.sz, mmap.align,
-               mmap.flags & VPX_CODEC_MEM_ZERO ? "(ZEROED)" : "");
-
-      if (mmap.flags & VPX_CODEC_MEM_ZERO)
-        mmap.priv = calloc(1, mmap.sz + align);
-      else
-        mmap.priv = malloc(mmap.sz + align);
-
-      mmap.base = (void *)((((uintptr_t)mmap.priv) + align) &
-                  ~(uintptr_t)align);
-      mmap.dtor = my_mem_dtor;
-      alloc_sz += mmap.sz + align;
-
-      if (vpx_codec_set_mem_map(&decoder, &mmap, 1)) {
-        printf("Failed to set mmap: %s\n", vpx_codec_error(&decoder));
-        return EXIT_FAILURE;
-      }
-    } else if (res != VPX_CODEC_LIST_END) {
-      printf("Failed to get mmap: %s\n", vpx_codec_error(&decoder));
-      return EXIT_FAILURE;
-    }
-  } while (res != VPX_CODEC_LIST_END);
-
-  printf("%s\n    %d bytes external memory required for %dx%d.\n",
-         decoder.name, alloc_sz, cfg.w, cfg.h);
-  vpx_codec_destroy(&decoder);
-  return EXIT_SUCCESS;
-
-}
diff --git a/examples/vp9_spatial_scalable_encoder.c b/examples/vp9_spatial_scalable_encoder.c
index 64e62ef..983f52d 100644
--- a/examples/vp9_spatial_scalable_encoder.c
+++ b/examples/vp9_spatial_scalable_encoder.c
@@ -344,7 +344,7 @@
     }
 
     res = vpx_svc_encode(&svc_ctx, &codec, (end_of_stream ? NULL : &raw),
-                         pts, frame_duration, VPX_DL_REALTIME);
+                         pts, frame_duration, VPX_DL_GOOD_QUALITY);
     printf("%s", vpx_svc_get_message(&svc_ctx));
     if (res != VPX_CODEC_OK) {
       die_codec(&codec, "Failed to encode frame");
diff --git a/test/android/Android.mk b/test/android/Android.mk
index 13af601..4e750b2 100644
--- a/test/android/Android.mk
+++ b/test/android/Android.mk
@@ -14,8 +14,14 @@
 BINDINGS_DIR := $(CUR_WD)/../../..
 LOCAL_PATH := $(CUR_WD)/../../..
 
+#libwebm
+include $(CLEAR_VARS)
+include $(BINDINGS_DIR)/libvpx/third_party/libwebm/Android.mk
+LOCAL_PATH := $(CUR_WD)/../../..
+
 #libvpx
 include $(CLEAR_VARS)
+LOCAL_STATIC_LIBRARIES := libwebm
 include $(BINDINGS_DIR)/libvpx/build/make/Android.mk
 LOCAL_PATH := $(CUR_WD)/../..
 
@@ -33,7 +39,7 @@
 include $(CLEAR_VARS)
 LOCAL_ARM_MODE := arm
 LOCAL_MODULE := libvpx_test
-LOCAL_STATIC_LIBRARIES := gtest
+LOCAL_STATIC_LIBRARIES := gtest libwebm
 LOCAL_SHARED_LIBRARIES := vpx
 include $(LOCAL_PATH)/test/test.mk
 LOCAL_C_INCLUDES := $(BINDINGS_DIR)
diff --git a/test/android/README b/test/android/README
index 6840d91..4a1adcf 100644
--- a/test/android/README
+++ b/test/android/README
@@ -3,7 +3,7 @@
 ./libvpx/configure --target=armv7-android-gcc --enable-external-build \
   --enable-postproc --disable-install-srcs --enable-multi-res-encoding \
   --enable-temporal-denoising --disable-unit-tests --disable-install-docs \
-  --disable-examples --disable-runtime-cpu-detect --sdk=$NDK
+  --disable-examples --disable-runtime-cpu-detect --sdk-path=$NDK
 
 2) From the parent directory, invoke ndk-build:
 NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \
diff --git a/test/borders_test.cc b/test/borders_test.cc
index a2f5a1b..b30be45 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -35,6 +35,7 @@
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
       encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
     }
   }
 
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 37ee0ef..cbb4036 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -634,7 +634,7 @@
     make_tuple(64, 64, &convolve8_ssse3)));
 #endif
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 const ConvolveFunctions convolve8_neon(
     vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
     vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index ca201bb..be651b4 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -37,6 +37,7 @@
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
       encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
     }
   }
 
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 2b4aa3a..80be05e 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -522,10 +522,14 @@
     cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
-      ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.85)
+      // TODO(yaowu): Work out more stable rc control strategy and
+      //              Adjust the thresholds to be tighter than .75.
+      ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.75)
           << " The datarate for the file is lower than target by too much, "
               "for layer: " << j;
-      ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.15)
+      // TODO(yaowu): Work out more stable rc control strategy and
+      //              Adjust the thresholds to be tighter than 1.25.
+      ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.25)
           << " The datarate for the file is greater than target by too much, "
               "for layer: " << j;
     }
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index cb5562e..143a267 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -512,7 +512,7 @@
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans16x16DCT,
     ::testing::Values(
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 013f451..72c0bd6 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -248,7 +248,7 @@
         make_tuple(&vp9_fdct32x32_c, &vp9_idct32x32_1024_add_c, 0),
         make_tuple(&vp9_fdct32x32_rd_c, &vp9_idct32x32_1024_add_c, 1)));
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans32x32Test,
     ::testing::Values(
diff --git a/test/decode_to_md5.sh b/test/decode_to_md5.sh
index da1a870..f64acc8 100755
--- a/test/decode_to_md5.sh
+++ b/test/decode_to_md5.sh
@@ -24,22 +24,25 @@
   fi
 }
 
-# Runs decode_to_md5 on $1 and echoes the MD5 sum for the final frame. $2 is
-# interpreted as codec name and used solely to name the output file.
+# Runs decode_to_md5 on $1 and captures the md5 sum for the final frame. $2 is
+# interpreted as codec name and used solely to name the output file. $3 is the
+# expected md5 sum: It must match that of the final frame.
 decode_to_md5() {
   local decoder="${LIBVPX_BIN_PATH}/decode_to_md5${VPX_TEST_EXE_SUFFIX}"
   local input_file="$1"
   local codec="$2"
+  local expected_md5="$3"
   local output_file="${VPX_TEST_OUTPUT_DIR}/decode_to_md5_${codec}"
 
   [ -x "${decoder}" ] || return 1
 
-  "${decoder}" "${input_file}" "${output_file}" > /dev/null 2>&1
+  eval "${decoder}" "${input_file}" "${output_file}" ${devnull}
 
   [ -e "${output_file}" ] || return 1
 
   local md5_last_frame=$(tail -n1 "${output_file}")
-  echo "${md5_last_frame% *}" | tr -d [:space:]
+  local actual_md5=$(echo "${md5_last_frame% *}" | tr -d [:space:])
+  [ "${actual_md5}" = "${expected_md5}" ] || return 1
 }
 
 decode_to_md5_vp8() {
@@ -47,8 +50,7 @@
   local expected_md5="56794d911b02190212bca92f88ad60c6"
 
   if [ "$(vp8_decode_available)" = "yes" ]; then
-    local actual_md5="$(decode_to_md5 "${VP8_IVF_FILE}" vp8)" || return 1
-    [ "${actual_md5}" = "${expected_md5}" ] || return 1
+    decode_to_md5 "${VP8_IVF_FILE}" "vp8" "${expected_md5}"
   fi
 }
 
@@ -57,8 +59,7 @@
   local expected_md5="2952c0eae93f3dadd1aa84c50d3fd6d2"
 
   if [ "$(vp9_decode_available)" = "yes" ]; then
-    local actual_md5="$(decode_to_md5 "${VP9_IVF_FILE}" vp9)" || return 1
-    [ "${actual_md5}" = "${expected_md5}" ] || return 1
+    decode_to_md5 "${VP9_IVF_FILE}" "vp9" "${expected_md5}"
   fi
 }
 
diff --git a/test/decode_with_drops.sh b/test/decode_with_drops.sh
new file mode 100755
index 0000000..82e934d
--- /dev/null
+++ b/test/decode_with_drops.sh
@@ -0,0 +1,75 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx decode_with_drops example. To add new tests to
+##  this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to decode_with_drops_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: Make sure input is available:
+#   $VP8_IVF_FILE and $VP9_IVF_FILE are required.
+decode_with_drops_verify_environment() {
+  if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_IVF_FILE}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Runs decode_with_drops on $1, $2 is interpreted as codec name and used solely
+# to name the output file. $3 is the drop mode, and is passed directly to
+# decode_with_drops.
+decode_with_drops() {
+  local decoder="${LIBVPX_BIN_PATH}/decode_with_drops${VPX_TEST_EXE_SUFFIX}"
+  local input_file="$1"
+  local codec="$2"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/decode_with_drops_${codec}"
+  local drop_mode="$3"
+
+  [ -x "${decoder}" ] || return 1
+
+  eval "${decoder}" "${input_file}" "${output_file}" "${drop_mode}" ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+# Decodes $VP8_IVF_FILE while dropping frames, twice: once in sequence mode,
+# and once in pattern mode.
+# Note: This test assumes that $VP8_IVF_FILE has exactly 29 frames, and could
+# break if the file is modified.
+decode_with_drops_vp8() {
+  if [ "$(vp8_decode_available)" = "yes" ]; then
+    # Test sequence mode: Drop frames 2-28.
+    decode_with_drops "${VP8_IVF_FILE}" "vp8" "2-28"
+
+    # Test pattern mode: Drop 3 of every 4 frames.
+    decode_with_drops "${VP8_IVF_FILE}" "vp8" "3/4"
+  fi
+}
+
+# Decodes $VP9_IVF_FILE while dropping frames, twice: once in sequence mode,
+# and once in pattern mode.
+# Note: This test assumes that $VP9_IVF_FILE has exactly 20 frames, and could
+# break if the file is modified.
+decode_with_drops_vp9() {
+  if [ "$(vp9_decode_available)" = "yes" ]; then
+    # Test sequence mode: Drop frames 2-28.
+    decode_with_drops "${VP9_IVF_FILE}" "vp9" "2-19"
+
+    # Test pattern mode: Drop 3 of every 4 frames.
+    decode_with_drops "${VP9_IVF_FILE}" "vp9" "3/4"
+  fi
+}
+
+decode_with_drops_tests="decode_with_drops_vp8
+                         decode_with_drops_vp9"
+
+run_tests decode_with_drops_verify_environment "${decode_with_drops_tests}"
diff --git a/test/examples.sh b/test/examples.sh
new file mode 100755
index 0000000..ac2a18c
--- /dev/null
+++ b/test/examples.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file runs all of the tests for the libvpx examples.
+##
+. $(dirname $0)/tools_common.sh
+
+example_tests=$(ls $(dirname $0)/*.sh)
+
+# List of script names to exclude.
+exclude_list="examples vpxdec vpxenc tools_common"
+
+# Filter out the scripts in $exclude_list.
+for word in ${exclude_list}; do
+  example_tests=$(filter_strings "${example_tests}" "${word}" exclude)
+done
+
+for test in ${example_tests}; do
+  # Source each test script so that exporting variables can be avoided.
+  . "${test}"
+done
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 127775c..030665e 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -48,6 +48,10 @@
   vp9_fht4x4_c(in, out, stride, tx_type);
 }
 
+void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+  vp9_fwht4x4_c(in, out, stride);
+}
+
 class Trans4x4TestBase {
  public:
   virtual ~Trans4x4TestBase() {}
@@ -57,7 +61,7 @@
 
   virtual void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) = 0;
 
-  void RunAccuracyCheck() {
+  void RunAccuracyCheck(int limit) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     uint32_t max_error = 0;
     int64_t total_error = 0;
@@ -88,11 +92,13 @@
       }
     }
 
-    EXPECT_GE(1u, max_error)
-        << "Error: 4x4 FHT/IHT has an individual round trip error > 1";
+    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
+        << "Error: 4x4 FHT/IHT has an individual round trip error > "
+        << limit;
 
-    EXPECT_GE(count_test_block , total_error)
-        << "Error: 4x4 FHT/IHT has average round trip error > 1 per block";
+    EXPECT_GE(count_test_block * limit, total_error)
+        << "Error: 4x4 FHT/IHT has average round trip error > " << limit
+        << " per block";
   }
 
   void RunCoeffCheck() {
@@ -150,7 +156,7 @@
     }
   }
 
-  void RunInvAccuracyCheck() {
+  void RunInvAccuracyCheck(int limit) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 1000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
@@ -173,8 +179,8 @@
       for (int j = 0; j < kNumCoeffs; ++j) {
         const uint32_t diff = dst[j] - src[j];
         const uint32_t error = diff * diff;
-        EXPECT_GE(1u, error)
-            << "Error: 16x16 IDCT has error " << error
+        EXPECT_GE(static_cast<uint32_t>(limit), error)
+            << "Error: 4x4 IDCT has error " << error
             << " at index " << j;
       }
     }
@@ -213,7 +219,7 @@
 };
 
 TEST_P(Trans4x4DCT, AccuracyCheck) {
-  RunAccuracyCheck();
+  RunAccuracyCheck(1);
 }
 
 TEST_P(Trans4x4DCT, CoeffCheck) {
@@ -225,7 +231,7 @@
 }
 
 TEST_P(Trans4x4DCT, InvAccuracyCheck) {
-  RunInvAccuracyCheck();
+  RunInvAccuracyCheck(1);
 }
 
 class Trans4x4HT
@@ -257,7 +263,7 @@
 };
 
 TEST_P(Trans4x4HT, AccuracyCheck) {
-  RunAccuracyCheck();
+  RunAccuracyCheck(1);
 }
 
 TEST_P(Trans4x4HT, CoeffCheck) {
@@ -269,9 +275,51 @@
 }
 
 TEST_P(Trans4x4HT, InvAccuracyCheck) {
-  RunInvAccuracyCheck();
+  RunInvAccuracyCheck(1);
 }
 
+class Trans4x4WHT
+    : public Trans4x4TestBase,
+      public ::testing::TestWithParam<dct_4x4_param_t> {
+ public:
+  virtual ~Trans4x4WHT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 4;
+    fwd_txfm_ref = fwht4x4_ref;
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+
+  fdct_t fwd_txfm_;
+  idct_t inv_txfm_;
+};
+
+TEST_P(Trans4x4WHT, AccuracyCheck) {
+  RunAccuracyCheck(0);
+}
+
+TEST_P(Trans4x4WHT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+TEST_P(Trans4x4WHT, MemCheck) {
+  RunMemCheck();
+}
+
+TEST_P(Trans4x4WHT, InvAccuracyCheck) {
+  RunInvAccuracyCheck(0);
+}
 using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
@@ -285,8 +333,12 @@
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0)));
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans4x4DCT,
     ::testing::Values(
@@ -301,6 +353,13 @@
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));
 #endif
 
+#if CONFIG_USE_X86INC && HAVE_MMX
+INSTANTIATE_TEST_CASE_P(
+    MMX, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 6f2d7d1..c7cf164 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -313,7 +313,7 @@
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 INSTANTIATE_TEST_CASE_P(
     NEON, FwdTrans8x8DCT,
     ::testing::Values(
@@ -340,4 +340,11 @@
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
 #endif
+
+#if HAVE_SSSE3 && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
+#endif
 }  // namespace
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 8849ce6..79ef521 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -132,15 +132,15 @@
                    &vp9_idct16x16_1_add_c,
                    TX_16X16, 1),
         make_tuple(&vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_10_add_c,
-                   TX_8X8, 10),
+                   &vp9_idct8x8_12_add_c,
+                   TX_8X8, 12),
         make_tuple(&vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_1_add_c,
                    TX_8X8, 1),
         make_tuple(&vp9_idct4x4_16_add_c,
                    &vp9_idct4x4_1_add_c,
                    TX_4X4, 1)));
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 INSTANTIATE_TEST_CASE_P(
     NEON, PartialIDctTest,
     ::testing::Values(
@@ -154,8 +154,8 @@
                    &vp9_idct16x16_1_add_neon,
                    TX_16X16, 1),
         make_tuple(&vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_10_add_neon,
-                   TX_8X8, 10),
+                   &vp9_idct8x8_12_add_neon,
+                   TX_8X8, 12),
         make_tuple(&vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_1_add_neon,
                    TX_8X8, 1),
@@ -181,8 +181,8 @@
                    &vp9_idct16x16_1_add_sse2,
                    TX_16X16, 1),
         make_tuple(&vp9_idct8x8_64_add_c,
-                   &vp9_idct8x8_10_add_sse2,
-                   TX_8X8, 10),
+                   &vp9_idct8x8_12_add_sse2,
+                   TX_8X8, 12),
         make_tuple(&vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_1_add_sse2,
                    TX_8X8, 1),
@@ -190,4 +190,13 @@
                    &vp9_idct4x4_1_add_sse2,
                    TX_4X4, 1)));
 #endif
+
+#if HAVE_SSSE3 && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, PartialIDctTest,
+    ::testing::Values(
+        make_tuple(&vp9_idct8x8_64_add_c,
+                   &vp9_idct8x8_12_add_ssse3,
+                   TX_8X8, 12)));
+#endif
 }  // namespace
diff --git a/test/postproc.sh b/test/postproc.sh
new file mode 100755
index 0000000..050a368
--- /dev/null
+++ b/test/postproc.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx postproc example code. To add new tests to this
+##  file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to postproc_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: Make sure input is available:
+#   $VP8_IVF_FILE and $VP9_IVF_FILE are required.
+postproc_verify_environment() {
+  if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_IVF_FILE}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Runs postproc using $1 as input file. $2 is the codec name, and is used
+# solely to name the output file.
+postproc() {
+  local decoder="${LIBVPX_BIN_PATH}/postproc${VPX_TEST_EXE_SUFFIX}"
+  local input_file="$1"
+  local codec="$2"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/postproc_${codec}.raw"
+
+  [ -x "${decoder}" ] || return 1
+
+  eval "${decoder}" "${input_file}" "${output_file}" ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+postproc_vp8() {
+  if [ "$(vp8_decode_available)" = "yes" ]; then
+    postproc "${VP8_IVF_FILE}" vp8 || return 1
+  fi
+}
+
+postproc_vp9() {
+  if [ "$(vpx_config_option_enabled CONFIG_VP9_POSTPROC)" = "yes" ]; then
+    if [ "$(vp9_decode_available)" = "yes" ]; then
+      postproc "${VP9_IVF_FILE}" vp9 || return 1
+    fi
+  fi
+}
+
+postproc_tests="postproc_vp8
+                postproc_vp9"
+
+run_tests postproc_verify_environment "${postproc_tests}"
diff --git a/test/register_state_check.h b/test/register_state_check.h
index 7e3d053..1ee149b 100644
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -82,8 +82,8 @@
 
 }  // namespace libvpx_test
 
-#elif defined(CONFIG_SHARED) && defined(HAVE_NEON) \
-      && !CONFIG_SHARED && HAVE_NEON
+#elif defined(CONFIG_SHARED) && defined(HAVE_NEON_ASM) && defined(CONFIG_VP9) \
+      && !CONFIG_SHARED && HAVE_NEON_ASM && CONFIG_VP9
 
 #include "vpx/vpx_integer.h"
 
diff --git a/test/simple_decoder.sh b/test/simple_decoder.sh
index a0db58f..24b17c5 100755
--- a/test/simple_decoder.sh
+++ b/test/simple_decoder.sh
@@ -34,7 +34,7 @@
 
   [ -x "${decoder}" ] || return 1
 
-  "${decoder}" "${input_file}" "${output_file}" > /dev/null 2>&1
+  eval "${decoder}" "${input_file}" "${output_file}" ${devnull}
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/test/simple_encoder.sh b/test/simple_encoder.sh
new file mode 100755
index 0000000..6232093
--- /dev/null
+++ b/test/simple_encoder.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx simple_encoder example. To add new tests to this
+##  file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to simple_encoder_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+simple_encoder_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Runs simple_encoder using the codec specified by $1.
+simple_encoder() {
+  local encoder="${LIBVPX_BIN_PATH}/simple_encoder${VPX_TEST_EXE_SUFFIX}"
+  local codec="$1"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/simple_encoder_${codec}.ivf"
+
+  [ -x "${encoder}" ] || return 1
+
+  eval "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 \
+      ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+simple_encoder_vp8() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    simple_encoder vp8 || return 1
+  fi
+}
+
+# TODO(tomfinegan): Add a frame limit param to simple_encoder and enable this
+# test. VP9 is just too slow right now: This test takes 4m30s+ on a fast
+# machine.
+DISABLED_simple_encoder_vp9() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    simple_encoder vp9 || return 1
+  fi
+}
+
+simple_encoder_tests="simple_encoder_vp8
+                      DISABLED_simple_encoder_vp9"
+
+run_tests simple_encoder_verify_environment "${simple_encoder_tests}"
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 3efb955..63e999d 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -105,7 +105,7 @@
 INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
                         ::testing::Values(vp8_subtract_b_c));
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
                         ::testing::Values(vp8_subtract_b_neon));
 #endif
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index cf2ad1e..9c23929 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -635,3 +635,8 @@
 1765315acccfe6cd12230e731369fcb15325ebfa  vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5
 4a2b7a683576fe8e330c7d1c4f098ff4e70a43a8  vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
 1ef480392112b3509cb190afbb96f9a38dd9fbac  vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5
+e615575ded499ea1d992f3b38e3baa434509cdcd  vp90-2-15-segkey.webm
+e3ab35d4316c5e81325c50f5236ceca4bc0d35df  vp90-2-15-segkey.webm.md5
+9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d  vp90-2-15-segkey_adpq.webm
+8f46ba5f785d0c2170591a153e0d0d146a7c8090  vp90-2-15-segkey_adpq.webm.md5
+
diff --git a/test/test.mk b/test/test.mk
index da56b00..44d2f9c 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -43,15 +43,13 @@
 
 ## WebM Parsing
 ifeq ($(CONFIG_WEBM_IO), yes)
-NESTEGG_SRCS                           += ../third_party/nestegg/halloc/halloc.h
-NESTEGG_SRCS                           += ../third_party/nestegg/halloc/src/align.h
-NESTEGG_SRCS                           += ../third_party/nestegg/halloc/src/halloc.c
-NESTEGG_SRCS                           += ../third_party/nestegg/halloc/src/hlist.h
-NESTEGG_SRCS                           += ../third_party/nestegg/include/nestegg/nestegg.h
-NESTEGG_SRCS                           += ../third_party/nestegg/src/nestegg.c
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += $(NESTEGG_SRCS)
+LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvparser.cpp
+LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvreader.cpp
+LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvparser.hpp
+LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvreader.hpp
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += $(LIBWEBM_PARSER_SRCS)
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../tools_common.h
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.c
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
 endif
@@ -750,6 +748,10 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
 
 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # BBB VP9 streams
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index ff3c389..fd8c4c3 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -177,7 +177,8 @@
   "vp90-2-14-resize-fp-tiles-4-16.webm", "vp90-2-14-resize-fp-tiles-4-1.webm",
   "vp90-2-14-resize-fp-tiles-4-2.webm", "vp90-2-14-resize-fp-tiles-4-8.webm",
   "vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
-  "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm"
+  "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
+  "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm"
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
 #endif  // CONFIG_VP9_DECODER
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 45b7771..9c10d48 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -9,7 +9,19 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 ##  This file contains shell code shared by test scripts for libvpx tools.
+
+# Use $VPX_TEST_TOOLS_COMMON_SH as a pseudo include guard.
+if [ -z "${VPX_TEST_TOOLS_COMMON_SH}" ]; then
+VPX_TEST_TOOLS_COMMON_SH=included
+
 set -e
+devnull='> /dev/null 2>&1'
+
+vlog() {
+  if [ "${VPX_TEST_VERBOSE_OUTPUT}" = "yes" ]; then
+    echo "$@"
+  fi
+}
 
 # Sets $VPX_TOOL_TEST to the name specified by positional parameter one.
 test_begin() {
@@ -188,9 +200,9 @@
   local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
 
   if [ -z "${pipe_input}" ]; then
-    "${decoder}" "$input" --summary --noblit "$@" > /dev/null 2>&1
+    eval "${decoder}" "$input" --summary --noblit "$@" ${devnull}
   else
-    cat "${input}" | "${decoder}" - --summary --noblit "$@" > /dev/null 2>&1
+    cat "${input}" | eval "${decoder}" - --summary --noblit "$@" ${devnull}
   fi
 }
 
@@ -236,14 +248,16 @@
   fi
 
   if [ -z "${pipe_input}" ]; then
-    "${encoder}" --codec=${codec} --width=${width} --height=${height} \
+    eval "${encoder}" --codec=${codec} --width=${width} --height=${height} \
         --limit=${frames} ${use_ivf} ${extra_flags} --output="${output}" \
-        "${input}" > /dev/null 2>&1
+        "${input}" \
+        ${devnull}
   else
     cat "${input}" \
-        | "${encoder}" --codec=${codec} --width=${width} --height=${height} \
-            --limit=${frames} ${use_ivf} ${extra_flags} --output="${output}" - \
-            > /dev/null 2>&1
+        | eval "${encoder}" --codec=${codec} --width=${width} \
+              --height=${height} --limit=${frames} ${use_ivf} ${extra_flags} \
+              --output="${output}" - \
+              ${devnull}
   fi
 
   if [ ! -e "${output}" ]; then
@@ -308,9 +322,9 @@
   # Run tests.
   for test in ${tests_to_run}; do
     test_begin "${test}"
-    [ "${VPX_TEST_VERBOSE_OUTPUT}" = "yes" ] && echo "  RUN  ${test}"
+    vlog "  RUN  ${test}"
     "${test}"
-    [ "${VPX_TEST_VERBOSE_OUTPUT}" = "yes" ] && echo "  PASS ${test}"
+    vlog "  PASS ${test}"
     test_end "${test}"
   done
 
@@ -327,6 +341,7 @@
     --run-disabled-tests: Run disabled tests.
     --help: Display this message and exit.
     --test-data-path <path to libvpx test data directory>
+    --show-program-output: Shows output from all programs being tested.
     --verbose: Verbose output.
 
     When the --bin-path option is not specified the script attempts to use
@@ -379,6 +394,9 @@
     --verbose)
       VPX_TEST_VERBOSE_OUTPUT=yes
       ;;
+    --show-program-output)
+      devnull=
+      ;;
     *)
       vpx_test_usage
       exit 1
@@ -429,9 +447,7 @@
 # Setup a trap function to clean up after tests complete.
 trap cleanup EXIT
 
-if [ "${VPX_TEST_VERBOSE_OUTPUT}" = "yes" ]; then
-cat << EOF
-$(basename "${0%.*}") test configuration:
+vlog "$(basename "${0%.*}") test configuration:
   LIBVPX_BIN_PATH=${LIBVPX_BIN_PATH}
   LIBVPX_CONFIG_PATH=${LIBVPX_CONFIG_PATH}
   LIBVPX_TEST_DATA_PATH=${LIBVPX_TEST_DATA_PATH}
@@ -439,5 +455,6 @@
   VPX_TEST_VERBOSE_OUTPUT=${VPX_TEST_VERBOSE_OUTPUT}
   VPX_TEST_FILTER=${VPX_TEST_FILTER}
   VPX_TEST_RUN_DISABLED_TESTS=${VPX_TEST_RUN_DISABLED_TESTS}
-EOF
-fi
+  VPX_TEST_SHOW_PROGRAM_OUTPUT=${VPX_TEST_SHOW_PROGRAM_OUTPUT}"
+
+fi  # End $VPX_TEST_TOOLS_COMMON_SH pseudo include guard.
diff --git a/test/vp8cx_set_ref.sh b/test/vp8cx_set_ref.sh
new file mode 100755
index 0000000..ef9d0c0
--- /dev/null
+++ b/test/vp8cx_set_ref.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx vp8cx_set_ref example. To add new tests to this
+##  file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to vp8cx_set_ref_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+vp8cx_set_ref_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Runs vp8cx_set_ref and updates the reference frame before encoding frame 90.
+# $1 is the codec name, which vp8cx_set_ref does not support at present: It's
+# currently used only to name the output file.
+# TODO(tomfinegan): Pass the codec param once the example is updated to support
+# VP9.
+vpx_set_ref() {
+  local encoder="${LIBVPX_BIN_PATH}/vp8cx_set_ref${VPX_TEST_EXE_SUFFIX}"
+  local codec="$1"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/vp8cx_set_ref_${codec}.ivf"
+  local ref_frame_num=90
+
+  [ -x "${encoder}" ] || return 1
+
+  eval "${encoder}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
+      "${YUV_RAW_INPUT}" "${output_file}" "${ref_frame_num}" \
+      ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+vp8cx_set_ref_vp8() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_set_ref vp8 || return 1
+  fi
+}
+
+vp8cx_set_ref_tests="vp8cx_set_ref_vp8"
+
+run_tests vp8cx_set_ref_verify_environment "${vp8cx_set_ref_tests}"
diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk
new file mode 100644
index 0000000..13868b6
--- /dev/null
+++ b/third_party/libwebm/Android.mk
@@ -0,0 +1,11 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_CPP_EXTENSION := .cpp
+LOCAL_SRC_FILES := mkvmuxer.cpp \
+                   mkvmuxerutil.cpp \
+                   mkvparser.cpp \
+                   mkvreader.cpp \
+                   mkvwriter.cpp
+LOCAL_MODULE := libwebm
+include $(BUILD_STATIC_LIBRARY)
diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx
index 2c7570d..93814b7 100644
--- a/third_party/libwebm/README.libvpx
+++ b/third_party/libwebm/README.libvpx
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: a7118d8ec564e9db841da1eb01f547f3229f240a
+Version: 249629d46c6e9391f25a90cff6d19075f47474cb
 License: BSD
 License File: LICENSE.txt
 
diff --git a/third_party/libwebm/mkvmuxer.cpp b/third_party/libwebm/mkvmuxer.cpp
index 8ae0dda..45167ea 100644
--- a/third_party/libwebm/mkvmuxer.cpp
+++ b/third_party/libwebm/mkvmuxer.cpp
@@ -22,7 +22,7 @@
 
 #ifdef _MSC_VER
 // Disable MSVC warnings that suggest making code non-portable.
-#pragma warning(disable:4996)
+#pragma warning(disable : 4996)
 #endif
 
 namespace mkvmuxer {
@@ -40,7 +40,7 @@
 
   char*& dst = *dst_ptr;
 
-  delete [] dst;
+  delete[] dst;
   dst = NULL;
 
   if (src == NULL)
@@ -61,11 +61,9 @@
 //
 // IMkvWriter Class
 
-IMkvWriter::IMkvWriter() {
-}
+IMkvWriter::IMkvWriter() {}
 
-IMkvWriter::~IMkvWriter() {
-}
+IMkvWriter::~IMkvWriter() {}
 
 bool WriteEbmlHeader(IMkvWriter* writer) {
   // Level 0
@@ -97,8 +95,7 @@
   return true;
 }
 
-bool ChunkedCopy(mkvparser::IMkvReader* source,
-                 mkvmuxer::IMkvWriter* dst,
+bool ChunkedCopy(mkvparser::IMkvReader* source, mkvmuxer::IMkvWriter* dst,
                  mkvmuxer::int64 start, int64 size) {
   // TODO(vigneshv): Check if this is a reasonable value.
   const uint32 kBufSize = 2048;
@@ -130,12 +127,11 @@
       length_(0),
       track_number_(0),
       timestamp_(0),
-      discard_padding_(0) {
-}
+      discard_padding_(0) {}
 
 Frame::~Frame() {
-  delete [] frame_;
-  delete [] additional_;
+  delete[] frame_;
+  delete[] additional_;
 }
 
 bool Frame::Init(const uint8* frame, uint64 length) {
@@ -144,7 +140,7 @@
   if (!data)
     return false;
 
-  delete [] frame_;
+  delete[] frame_;
   frame_ = data;
   length_ = length;
 
@@ -159,7 +155,7 @@
   if (!data)
     return false;
 
-  delete [] additional_;
+  delete[] additional_;
   additional_ = data;
   additional_length_ = length;
   add_id_ = add_id;
@@ -177,11 +173,9 @@
       track_(0),
       cluster_pos_(0),
       block_number_(1),
-      output_block_number_(true) {
-}
+      output_block_number_(true) {}
 
-CuePoint::~CuePoint() {
-}
+CuePoint::~CuePoint() {}
 
 bool CuePoint::Write(IMkvWriter* writer) const {
   if (!writer || track_ < 1 || cluster_pos_ < 1)
@@ -191,10 +185,10 @@
   size += EbmlElementSize(kMkvCueTrack, track_);
   if (output_block_number_ && block_number_ > 1)
     size += EbmlElementSize(kMkvCueBlockNumber, block_number_);
-  const uint64 track_pos_size = EbmlMasterElementSize(kMkvCueTrackPositions,
-                                                      size) + size;
-  const uint64 payload_size = EbmlElementSize(kMkvCueTime, time_) +
-                              track_pos_size;
+  const uint64 track_pos_size =
+      EbmlMasterElementSize(kMkvCueTrackPositions, size) + size;
+  const uint64 payload_size =
+      EbmlElementSize(kMkvCueTime, time_) + track_pos_size;
 
   if (!WriteEbmlMasterElement(writer, kMkvCuePoint, payload_size))
     return false;
@@ -231,10 +225,10 @@
   size += EbmlElementSize(kMkvCueTrack, track_);
   if (output_block_number_ && block_number_ > 1)
     size += EbmlElementSize(kMkvCueBlockNumber, block_number_);
-  const uint64 track_pos_size = EbmlMasterElementSize(kMkvCueTrackPositions,
-                                                      size) + size;
-  const uint64 payload_size = EbmlElementSize(kMkvCueTime, time_) +
-                              track_pos_size;
+  const uint64 track_pos_size =
+      EbmlMasterElementSize(kMkvCueTrackPositions, size) + size;
+  const uint64 payload_size =
+      EbmlElementSize(kMkvCueTime, time_) + track_pos_size;
 
   return payload_size;
 }
@@ -252,8 +246,7 @@
     : cue_entries_capacity_(0),
       cue_entries_size_(0),
       cue_entries_(NULL),
-      output_block_number_(true) {
-}
+      output_block_number_(true) {}
 
 Cues::~Cues() {
   if (cue_entries_) {
@@ -261,7 +254,7 @@
       CuePoint* const cue = cue_entries_[i];
       delete cue;
     }
-    delete [] cue_entries_;
+    delete[] cue_entries_;
   }
 }
 
@@ -278,7 +271,7 @@
       return false;
 
     CuePoint** const cues =
-        new (std::nothrow) CuePoint*[new_capacity];  // NOLINT
+        new (std::nothrow) CuePoint* [new_capacity];  // NOLINT
     if (!cues)
       return false;
 
@@ -286,7 +279,7 @@
       cues[i] = cue_entries_[i];
     }
 
-    delete [] cue_entries_;
+    delete[] cue_entries_;
 
     cue_entries_ = cues;
     cue_entries_capacity_ = new_capacity;
@@ -402,18 +395,15 @@
       encoding_order_(0),
       encoding_scope_(1),
       encoding_type_(1),
-      enc_key_id_length_(0) {
-}
+      enc_key_id_length_(0) {}
 
-ContentEncoding::~ContentEncoding() {
-  delete [] enc_key_id_;
-}
+ContentEncoding::~ContentEncoding() { delete[] enc_key_id_; }
 
 bool ContentEncoding::SetEncryptionID(const uint8* id, uint64 length) {
   if (!id || length < 1)
     return false;
 
-  delete [] enc_key_id_;
+  delete[] enc_key_id_;
 
   enc_key_id_ =
       new (std::nothrow) uint8[static_cast<size_t>(length)];  // NOLINT
@@ -429,9 +419,8 @@
 uint64 ContentEncoding::Size() const {
   const uint64 encryption_size = EncryptionSize();
   const uint64 encoding_size = EncodingSize(0, encryption_size);
-  const uint64 encodings_size = EbmlMasterElementSize(kMkvContentEncoding,
-                                                      encoding_size) +
-                                encoding_size;
+  const uint64 encodings_size =
+      EbmlMasterElementSize(kMkvContentEncoding, encoding_size) + encoding_size;
 
   return encodings_size;
 }
@@ -439,9 +428,8 @@
 bool ContentEncoding::Write(IMkvWriter* writer) const {
   const uint64 encryption_size = EncryptionSize();
   const uint64 encoding_size = EncodingSize(0, encryption_size);
-  const uint64 size = EbmlMasterElementSize(kMkvContentEncoding,
-                                            encoding_size) +
-                      encoding_size;
+  const uint64 size =
+      EbmlMasterElementSize(kMkvContentEncoding, encoding_size) + encoding_size;
 
   const int64 payload_position = writer->Position();
   if (payload_position < 0)
@@ -460,9 +448,7 @@
     return false;
   if (!WriteEbmlElement(writer, kMkvContentEncAlgo, enc_algo_))
     return false;
-  if (!WriteEbmlElement(writer,
-                        kMkvContentEncKeyID,
-                        enc_key_id_,
+  if (!WriteEbmlElement(writer, kMkvContentEncKeyID, enc_key_id_,
                         enc_key_id_length_))
     return false;
 
@@ -486,9 +472,9 @@
   uint64 encoding_size = 0;
 
   if (encryption_size > 0) {
-    encoding_size += EbmlMasterElementSize(kMkvContentEncryption,
-                                           encryption_size) +
-                     encryption_size;
+    encoding_size +=
+        EbmlMasterElementSize(kMkvContentEncryption, encryption_size) +
+        encryption_size;
   }
   encoding_size += EbmlElementSize(kMkvContentEncodingType, encoding_type_);
   encoding_size += EbmlElementSize(kMkvContentEncodingScope, encoding_scope_);
@@ -500,9 +486,8 @@
 uint64 ContentEncoding::EncryptionSize() const {
   const uint64 aes_size = enc_aes_settings_.Size();
 
-  uint64 encryption_size = EbmlElementSize(kMkvContentEncKeyID,
-                                           enc_key_id_,
-                                           enc_key_id_length_);
+  uint64 encryption_size =
+      EbmlElementSize(kMkvContentEncKeyID, enc_key_id_, enc_key_id_length_);
   encryption_size += EbmlElementSize(kMkvContentEncAlgo, enc_algo_);
 
   return encryption_size + aes_size;
@@ -523,23 +508,23 @@
       uid_(MakeUID(seed)),
       codec_delay_(0),
       seek_pre_roll_(0),
+      default_duration_(0),
       codec_private_length_(0),
       content_encoding_entries_(NULL),
-      content_encoding_entries_size_(0) {
-}
+      content_encoding_entries_size_(0) {}
 
 Track::~Track() {
-  delete [] codec_id_;
-  delete [] codec_private_;
-  delete [] language_;
-  delete [] name_;
+  delete[] codec_id_;
+  delete[] codec_private_;
+  delete[] language_;
+  delete[] name_;
 
   if (content_encoding_entries_) {
     for (uint32 i = 0; i < content_encoding_entries_size_; ++i) {
       ContentEncoding* const encoding = content_encoding_entries_[i];
       delete encoding;
     }
-    delete [] content_encoding_entries_;
+    delete[] content_encoding_entries_;
   }
 }
 
@@ -547,14 +532,14 @@
   const uint32 count = content_encoding_entries_size_ + 1;
 
   ContentEncoding** const content_encoding_entries =
-      new (std::nothrow) ContentEncoding*[count];  // NOLINT
+      new (std::nothrow) ContentEncoding* [count];  // NOLINT
   if (!content_encoding_entries)
     return false;
 
   ContentEncoding* const content_encoding =
       new (std::nothrow) ContentEncoding();  // NOLINT
   if (!content_encoding) {
-    delete [] content_encoding_entries;
+    delete[] content_encoding_entries;
     return false;
   }
 
@@ -562,7 +547,7 @@
     content_encoding_entries[i] = content_encoding_entries_[i];
   }
 
-  delete [] content_encoding_entries_;
+  delete[] content_encoding_entries_;
 
   content_encoding_entries_ = content_encoding_entries;
   content_encoding_entries_[content_encoding_entries_size_] = content_encoding;
@@ -587,8 +572,7 @@
   if (codec_id_)
     size += EbmlElementSize(kMkvCodecID, codec_id_);
   if (codec_private_)
-    size += EbmlElementSize(kMkvCodecPrivate,
-                            codec_private_,
+    size += EbmlElementSize(kMkvCodecPrivate, codec_private_,
                             codec_private_length_);
   if (language_)
     size += EbmlElementSize(kMkvLanguage, language_);
@@ -600,6 +584,8 @@
     size += EbmlElementSize(kMkvCodecDelay, codec_delay_);
   if (seek_pre_roll_)
     size += EbmlElementSize(kMkvSeekPreRoll, seek_pre_roll_);
+  if (default_duration_)
+    size += EbmlElementSize(kMkvDefaultDuration, default_duration_);
 
   if (content_encoding_entries_size_ > 0) {
     uint64 content_encodings_size = 0;
@@ -608,9 +594,9 @@
       content_encodings_size += encoding->Size();
     }
 
-    size += EbmlMasterElementSize(kMkvContentEncodings,
-                                  content_encodings_size) +
-            content_encodings_size;
+    size +=
+        EbmlMasterElementSize(kMkvContentEncodings, content_encodings_size) +
+        content_encodings_size;
   }
 
   return size;
@@ -633,14 +619,17 @@
   if (!WriteEbmlMasterElement(writer, kMkvTrackEntry, payload_size))
     return false;
 
+  // |type_| has to be specified before the Track can be written.
+  if (!type_)
+    return false;
+
   uint64 size = EbmlElementSize(kMkvTrackNumber, number_);
   size += EbmlElementSize(kMkvTrackUID, uid_);
   size += EbmlElementSize(kMkvTrackType, type_);
   if (codec_id_)
     size += EbmlElementSize(kMkvCodecID, codec_id_);
   if (codec_private_)
-    size += EbmlElementSize(kMkvCodecPrivate,
-                            codec_private_,
+    size += EbmlElementSize(kMkvCodecPrivate, codec_private_,
                             codec_private_length_);
   if (language_)
     size += EbmlElementSize(kMkvLanguage, language_);
@@ -652,7 +641,8 @@
     size += EbmlElementSize(kMkvCodecDelay, codec_delay_);
   if (seek_pre_roll_)
     size += EbmlElementSize(kMkvSeekPreRoll, seek_pre_roll_);
-
+  if (default_duration_)
+    size += EbmlElementSize(kMkvDefaultDuration, default_duration_);
 
   const int64 payload_position = writer->Position();
   if (payload_position < 0)
@@ -665,8 +655,7 @@
   if (!WriteEbmlElement(writer, kMkvTrackType, type_))
     return false;
   if (max_block_additional_id_) {
-    if (!WriteEbmlElement(writer,
-                          kMkvMaxBlockAdditionID,
+    if (!WriteEbmlElement(writer, kMkvMaxBlockAdditionID,
                           max_block_additional_id_)) {
       return false;
     }
@@ -679,14 +668,16 @@
     if (!WriteEbmlElement(writer, kMkvSeekPreRoll, seek_pre_roll_))
       return false;
   }
+  if (default_duration_) {
+    if (!WriteEbmlElement(writer, kMkvDefaultDuration, default_duration_))
+      return false;
+  }
   if (codec_id_) {
     if (!WriteEbmlElement(writer, kMkvCodecID, codec_id_))
       return false;
   }
   if (codec_private_) {
-    if (!WriteEbmlElement(writer,
-                          kMkvCodecPrivate,
-                          codec_private_,
+    if (!WriteEbmlElement(writer, kMkvCodecPrivate, codec_private_,
                           codec_private_length_))
       return false;
   }
@@ -711,8 +702,7 @@
       content_encodings_size += encoding->Size();
     }
 
-    if (!WriteEbmlMasterElement(writer,
-                                kMkvContentEncodings,
+    if (!WriteEbmlMasterElement(writer, kMkvContentEncodings,
                                 content_encodings_size))
       return false;
 
@@ -733,7 +723,7 @@
   if (!codec_private || length < 1)
     return false;
 
-  delete [] codec_private_;
+  delete[] codec_private_;
 
   codec_private_ =
       new (std::nothrow) uint8[static_cast<size_t>(length)];  // NOLINT
@@ -748,7 +738,7 @@
 
 void Track::set_codec_id(const char* codec_id) {
   if (codec_id) {
-    delete [] codec_id_;
+    delete[] codec_id_;
 
     const size_t length = strlen(codec_id) + 1;
     codec_id_ = new (std::nothrow) char[length];  // NOLINT
@@ -765,7 +755,7 @@
 // TODO(fgalligan): Vet the language parameter.
 void Track::set_language(const char* language) {
   if (language) {
-    delete [] language_;
+    delete[] language_;
 
     const size_t length = strlen(language) + 1;
     language_ = new (std::nothrow) char[length];  // NOLINT
@@ -781,7 +771,7 @@
 
 void Track::set_name(const char* name) {
   if (name) {
-    delete [] name_;
+    delete[] name_;
 
     const size_t length = strlen(name) + 1;
     name_ = new (std::nothrow) char[length];  // NOLINT
@@ -807,15 +797,12 @@
       height_(0),
       stereo_mode_(0),
       alpha_mode_(0),
-      width_(0) {
-}
+      width_(0) {}
 
-VideoTrack::~VideoTrack() {
-}
+VideoTrack::~VideoTrack() {}
 
 bool VideoTrack::SetStereoMode(uint64 stereo_mode) {
-  if (stereo_mode != kMono &&
-      stereo_mode != kSideBySideLeftIsFirst &&
+  if (stereo_mode != kMono && stereo_mode != kSideBySideLeftIsFirst &&
       stereo_mode != kTopBottomRightIsFirst &&
       stereo_mode != kTopBottomLeftIsFirst &&
       stereo_mode != kSideBySideRightIsFirst)
@@ -826,8 +813,7 @@
 }
 
 bool VideoTrack::SetAlphaMode(uint64 alpha_mode) {
-  if (alpha_mode != kNoAlpha &&
-      alpha_mode != kAlpha)
+  if (alpha_mode != kNoAlpha && alpha_mode != kAlpha)
     return false;
 
   alpha_mode_ = alpha_mode;
@@ -873,8 +859,7 @@
     if (!WriteEbmlElement(writer, kMkvAlphaMode, alpha_mode_))
       return false;
   if (frame_rate_ > 0.0)
-    if (!WriteEbmlElement(writer,
-                          kMkvFrameRate,
+    if (!WriteEbmlElement(writer, kMkvFrameRate,
                           static_cast<float>(frame_rate_)))
       return false;
 
@@ -908,20 +893,15 @@
 // AudioTrack Class
 
 AudioTrack::AudioTrack(unsigned int* seed)
-    : Track(seed),
-      bit_depth_(0),
-      channels_(1),
-      sample_rate_(0.0) {
-}
+    : Track(seed), bit_depth_(0), channels_(1), sample_rate_(0.0) {}
 
-AudioTrack::~AudioTrack() {
-}
+AudioTrack::~AudioTrack() {}
 
 uint64 AudioTrack::PayloadSize() const {
   const uint64 parent_size = Track::PayloadSize();
 
-  uint64 size = EbmlElementSize(kMkvSamplingFrequency,
-                                static_cast<float>(sample_rate_));
+  uint64 size =
+      EbmlElementSize(kMkvSamplingFrequency, static_cast<float>(sample_rate_));
   size += EbmlElementSize(kMkvChannels, channels_);
   if (bit_depth_ > 0)
     size += EbmlElementSize(kMkvBitDepth, bit_depth_);
@@ -935,8 +915,8 @@
     return false;
 
   // Calculate AudioSettings size.
-  uint64 size = EbmlElementSize(kMkvSamplingFrequency,
-                                static_cast<float>(sample_rate_));
+  uint64 size =
+      EbmlElementSize(kMkvSamplingFrequency, static_cast<float>(sample_rate_));
   size += EbmlElementSize(kMkvChannels, channels_);
   if (bit_depth_ > 0)
     size += EbmlElementSize(kMkvBitDepth, bit_depth_);
@@ -948,8 +928,7 @@
   if (payload_position < 0)
     return false;
 
-  if (!WriteEbmlElement(writer,
-                        kMkvSamplingFrequency,
+  if (!WriteEbmlElement(writer, kMkvSamplingFrequency,
                         static_cast<float>(sample_rate_)))
     return false;
   if (!WriteEbmlElement(writer, kMkvChannels, channels_))
@@ -975,11 +954,7 @@
 const char Tracks::kVp8CodecId[] = "V_VP8";
 const char Tracks::kVp9CodecId[] = "V_VP9";
 
-
-Tracks::Tracks()
-    : track_entries_(NULL),
-      track_entries_size_(0) {
-}
+Tracks::Tracks() : track_entries_(NULL), track_entries_size_(0) {}
 
 Tracks::~Tracks() {
   if (track_entries_) {
@@ -987,7 +962,7 @@
       Track* const track = track_entries_[i];
       delete track;
     }
-    delete [] track_entries_;
+    delete[] track_entries_;
   }
 }
 
@@ -1015,7 +990,7 @@
 
   const uint32 count = track_entries_size_ + 1;
 
-  Track** const track_entries = new (std::nothrow) Track*[count];  // NOLINT
+  Track** const track_entries = new (std::nothrow) Track* [count];  // NOLINT
   if (!track_entries)
     return false;
 
@@ -1023,7 +998,7 @@
     track_entries[i] = track_entries_[i];
   }
 
-  delete [] track_entries_;
+  delete[] track_entries_;
 
   // Find the lowest availible track number > 0.
   if (track_num == 0) {
@@ -1125,21 +1100,16 @@
 //
 // Chapter Class
 
-bool Chapter::set_id(const char* id) {
-  return StrCpy(id, &id_);
-}
+bool Chapter::set_id(const char* id) { return StrCpy(id, &id_); }
 
-void Chapter::set_time(const Segment& segment,
-                       uint64 start_ns,
-                       uint64 end_ns) {
+void Chapter::set_time(const Segment& segment, uint64 start_ns, uint64 end_ns) {
   const SegmentInfo* const info = segment.GetSegmentInfo();
   const uint64 timecode_scale = info->timecode_scale();
   start_timecode_ = start_ns / timecode_scale;
   end_timecode_ = end_ns / timecode_scale;
 }
 
-bool Chapter::add_string(const char* title,
-                         const char* language,
+bool Chapter::add_string(const char* title, const char* language,
                          const char* country) {
   if (!ExpandDisplaysArray())
     return false;
@@ -1171,8 +1141,7 @@
   // active on the array.
 }
 
-Chapter::~Chapter() {
-}
+Chapter::~Chapter() {}
 
 void Chapter::Init(unsigned int* seed) {
   id_ = NULL;
@@ -1200,7 +1169,7 @@
     d.Clear();
   }
 
-  delete [] displays_;
+  delete[] displays_;
   displays_ = NULL;
 
   displays_size_ = 0;
@@ -1220,7 +1189,7 @@
     displays[idx] = displays_[idx];  // shallow copy
   }
 
-  delete [] displays_;
+  delete[] displays_;
 
   displays_ = displays;
   displays_size_ = size;
@@ -1229,11 +1198,10 @@
 }
 
 uint64 Chapter::WriteAtom(IMkvWriter* writer) const {
-  uint64 payload_size =
-      EbmlElementSize(kMkvChapterStringUID, id_) +
-      EbmlElementSize(kMkvChapterUID, uid_) +
-      EbmlElementSize(kMkvChapterTimeStart, start_timecode_) +
-      EbmlElementSize(kMkvChapterTimeEnd, end_timecode_);
+  uint64 payload_size = EbmlElementSize(kMkvChapterStringUID, id_) +
+                        EbmlElementSize(kMkvChapterUID, uid_) +
+                        EbmlElementSize(kMkvChapterTimeStart, start_timecode_) +
+                        EbmlElementSize(kMkvChapterTimeEnd, end_timecode_);
 
   for (int idx = 0; idx < displays_count_; ++idx) {
     const Display& d = displays_[idx];
@@ -1241,8 +1209,7 @@
   }
 
   const uint64 atom_size =
-      EbmlMasterElementSize(kMkvChapterAtom, payload_size) +
-      payload_size;
+      EbmlMasterElementSize(kMkvChapterAtom, payload_size) + payload_size;
 
   if (writer == NULL)
     return atom_size;
@@ -1313,8 +1280,7 @@
     payload_size += EbmlElementSize(kMkvChapCountry, country_);
 
   const uint64 display_size =
-      EbmlMasterElementSize(kMkvChapterDisplay, payload_size) +
-      payload_size;
+      EbmlMasterElementSize(kMkvChapterDisplay, payload_size) + payload_size;
 
   if (writer == NULL)
     return display_size;
@@ -1349,11 +1315,7 @@
 //
 // Chapters Class
 
-Chapters::Chapters()
-    : chapters_size_(0),
-      chapters_count_(0),
-      chapters_(NULL) {
-}
+Chapters::Chapters() : chapters_size_(0), chapters_count_(0), chapters_(NULL) {}
 
 Chapters::~Chapters() {
   while (chapters_count_ > 0) {
@@ -1361,13 +1323,11 @@
     chapter.Clear();
   }
 
-  delete [] chapters_;
+  delete[] chapters_;
   chapters_ = NULL;
 }
 
-int Chapters::Count() const {
-  return chapters_count_;
-}
+int Chapters::Count() const { return chapters_count_; }
 
 Chapter* Chapters::AddChapter(unsigned int* seed) {
   if (!ExpandChaptersArray())
@@ -1417,7 +1377,7 @@
     src.ShallowCopy(dst);
   }
 
-  delete [] chapters_;
+  delete[] chapters_;
 
   chapters_ = chapters;
   chapters_size_ = size;
@@ -1434,8 +1394,7 @@
   }
 
   const uint64 edition_size =
-      EbmlMasterElementSize(kMkvEditionEntry, payload_size) +
-      payload_size;
+      EbmlMasterElementSize(kMkvEditionEntry, payload_size) + payload_size;
 
   if (writer == NULL)  // return size only
     return edition_size;
@@ -1473,11 +1432,9 @@
       position_for_cues_(cues_pos),
       size_position_(-1),
       timecode_(timecode),
-      writer_(NULL) {
-}
+      writer_(NULL) {}
 
-Cluster::~Cluster() {
-}
+Cluster::~Cluster() {}
 
 bool Cluster::Init(IMkvWriter* ptr_writer) {
   if (!ptr_writer) {
@@ -1487,69 +1444,39 @@
   return true;
 }
 
-bool Cluster::AddFrame(const uint8* frame,
-                       uint64 length,
-                       uint64 track_number,
-                       uint64 abs_timecode,
-                       bool is_key) {
-  return DoWriteBlock(frame,
-                      length,
-                      track_number,
-                      abs_timecode,
-                      is_key ? 1 : 0,
+bool Cluster::AddFrame(const uint8* frame, uint64 length, uint64 track_number,
+                       uint64 abs_timecode, bool is_key) {
+  return DoWriteBlock(frame, length, track_number, abs_timecode, is_key ? 1 : 0,
                       &WriteSimpleBlock);
 }
 
-bool Cluster::AddFrameWithAdditional(const uint8* frame,
-                                     uint64 length,
+bool Cluster::AddFrameWithAdditional(const uint8* frame, uint64 length,
                                      const uint8* additional,
-                                     uint64 additional_length,
-                                     uint64 add_id,
-                                     uint64 track_number,
-                                     uint64 abs_timecode,
+                                     uint64 additional_length, uint64 add_id,
+                                     uint64 track_number, uint64 abs_timecode,
                                      bool is_key) {
-  return DoWriteBlockWithAdditional(frame,
-                                    length,
-                                    additional,
-                                    additional_length,
-                                    add_id,
-                                    track_number,
-                                    abs_timecode,
-                                    is_key ? 1 : 0,
-                                    &WriteBlockWithAdditional);
+  return DoWriteBlockWithAdditional(
+      frame, length, additional, additional_length, add_id, track_number,
+      abs_timecode, is_key ? 1 : 0, &WriteBlockWithAdditional);
 }
 
-bool Cluster::AddFrameWithDiscardPadding(const uint8* frame,
-                                         uint64 length,
+bool Cluster::AddFrameWithDiscardPadding(const uint8* frame, uint64 length,
                                          int64 discard_padding,
                                          uint64 track_number,
-                                         uint64 abs_timecode,
-                                         bool is_key) {
-  return DoWriteBlockWithDiscardPadding(frame,
-                                        length,
-                                        discard_padding,
-                                        track_number,
-                                        abs_timecode,
-                                        is_key ? 1 : 0,
-                                        &WriteBlockWithDiscardPadding);
+                                         uint64 abs_timecode, bool is_key) {
+  return DoWriteBlockWithDiscardPadding(
+      frame, length, discard_padding, track_number, abs_timecode,
+      is_key ? 1 : 0, &WriteBlockWithDiscardPadding);
 }
 
-bool Cluster::AddMetadata(const uint8* frame,
-                          uint64 length,
-                          uint64 track_number,
-                          uint64 abs_timecode,
+bool Cluster::AddMetadata(const uint8* frame, uint64 length,
+                          uint64 track_number, uint64 abs_timecode,
                           uint64 duration_timecode) {
-  return DoWriteBlock(frame,
-                      length,
-                      track_number,
-                      abs_timecode,
-                      duration_timecode,
-                      &WriteMetadataBlock);
+  return DoWriteBlock(frame, length, track_number, abs_timecode,
+                      duration_timecode, &WriteMetadataBlock);
 }
 
-void Cluster::AddPayloadSize(uint64 size) {
-  payload_size_ += size;
-}
+void Cluster::AddPayloadSize(uint64 size) { payload_size_ += size; }
 
 bool Cluster::Finalize() {
   if (!writer_ || finalized_ || size_position_ == -1)
@@ -1575,8 +1502,7 @@
 
 uint64 Cluster::Size() const {
   const uint64 element_size =
-      EbmlMasterElementSize(kMkvCluster,
-                            0xFFFFFFFFFFFFFFFFULL) + payload_size_;
+      EbmlMasterElementSize(kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) + payload_size_;
   return element_size;
 }
 
@@ -1616,13 +1542,9 @@
   return rel_timecode;
 }
 
-bool Cluster::DoWriteBlock(
-    const uint8* frame,
-    uint64 length,
-    uint64 track_number,
-    uint64 abs_timecode,
-    uint64 generic_arg,
-    WriteBlock write_block) {
+bool Cluster::DoWriteBlock(const uint8* frame, uint64 length,
+                           uint64 track_number, uint64 abs_timecode,
+                           uint64 generic_arg, WriteBlock write_block) {
   if (frame == NULL || length == 0)
     return false;
 
@@ -1636,12 +1558,8 @@
   if (!PreWriteBlock(write_block))
     return false;
 
-  const uint64 element_size = (*write_block)(writer_,
-                                             frame,
-                                             length,
-                                             track_number,
-                                             rel_timecode,
-                                             generic_arg);
+  const uint64 element_size = (*write_block)(
+      writer_, frame, length, track_number, rel_timecode, generic_arg);
   if (element_size == 0)
     return false;
 
@@ -1650,17 +1568,11 @@
 }
 
 bool Cluster::DoWriteBlockWithAdditional(
-    const uint8* frame,
-    uint64 length,
-    const uint8* additional,
-    uint64 additional_length,
-    uint64 add_id,
-    uint64 track_number,
-    uint64 abs_timecode,
-    uint64 generic_arg,
-    WriteBlockAdditional write_block) {
-  if (frame == NULL || length == 0 ||
-      additional == NULL || additional_length == 0)
+    const uint8* frame, uint64 length, const uint8* additional,
+    uint64 additional_length, uint64 add_id, uint64 track_number,
+    uint64 abs_timecode, uint64 generic_arg, WriteBlockAdditional write_block) {
+  if (frame == NULL || length == 0 || additional == NULL ||
+      additional_length == 0)
     return false;
 
   if (!IsValidTrackNumber(track_number))
@@ -1673,15 +1585,9 @@
   if (!PreWriteBlock(write_block))
     return false;
 
-  const uint64 element_size = (*write_block)(writer_,
-                                             frame,
-                                             length,
-                                             additional,
-                                             additional_length,
-                                             add_id,
-                                             track_number,
-                                             rel_timecode,
-                                             generic_arg);
+  const uint64 element_size =
+      (*write_block)(writer_, frame, length, additional, additional_length,
+                     add_id, track_number, rel_timecode, generic_arg);
   if (element_size == 0)
     return false;
 
@@ -1690,12 +1596,8 @@
 }
 
 bool Cluster::DoWriteBlockWithDiscardPadding(
-    const uint8* frame,
-    uint64 length,
-    int64 discard_padding,
-    uint64 track_number,
-    uint64 abs_timecode,
-    uint64 generic_arg,
+    const uint8* frame, uint64 length, int64 discard_padding,
+    uint64 track_number, uint64 abs_timecode, uint64 generic_arg,
     WriteBlockDiscardPadding write_block) {
   if (frame == NULL || length == 0 || discard_padding <= 0)
     return false;
@@ -1710,13 +1612,9 @@
   if (!PreWriteBlock(write_block))
     return false;
 
-  const uint64 element_size = (*write_block)(writer_,
-                                             frame,
-                                             length,
-                                             discard_padding,
-                                             track_number,
-                                             rel_timecode,
-                                             generic_arg);
+  const uint64 element_size =
+      (*write_block)(writer_, frame, length, discard_padding, track_number,
+                     rel_timecode, generic_arg);
   if (element_size == 0)
     return false;
 
@@ -1758,8 +1656,7 @@
   }
 }
 
-SeekHead::~SeekHead() {
-}
+SeekHead::~SeekHead() {}
 
 bool SeekHead::Finalize(IMkvWriter* writer) const {
   if (writer->Seekable()) {
@@ -1771,13 +1668,12 @@
 
     for (int32 i = 0; i < kSeekEntryCount; ++i) {
       if (seek_entry_id_[i] != 0) {
-        entry_size[i] = EbmlElementSize(
-            kMkvSeekID,
-            static_cast<uint64>(seek_entry_id_[i]));
+        entry_size[i] =
+            EbmlElementSize(kMkvSeekID, static_cast<uint64>(seek_entry_id_[i]));
         entry_size[i] += EbmlElementSize(kMkvSeekPosition, seek_entry_pos_[i]);
 
-        payload_size += EbmlMasterElementSize(kMkvSeek, entry_size[i]) +
-                        entry_size[i];
+        payload_size +=
+            EbmlMasterElementSize(kMkvSeek, entry_size[i]) + entry_size[i];
       }
     }
 
@@ -1797,8 +1693,7 @@
         if (!WriteEbmlMasterElement(writer, kMkvSeek, entry_size[i]))
           return false;
 
-        if (!WriteEbmlElement(writer,
-                              kMkvSeekID,
+        if (!WriteEbmlElement(writer, kMkvSeekID,
                               static_cast<uint64>(seek_entry_id_[i])))
           return false;
 
@@ -1809,8 +1704,8 @@
 
     const uint64 total_entry_size = kSeekEntryCount * MaxEntrySize();
     const uint64 total_size =
-        EbmlMasterElementSize(kMkvSeekHead,
-                              total_entry_size) + total_entry_size;
+        EbmlMasterElementSize(kMkvSeekHead, total_entry_size) +
+        total_entry_size;
     const int64 size_left = total_size - (writer->Position() - start_pos_);
 
     const uint64 bytes_written = WriteVoidElement(writer, size_left);
@@ -1888,12 +1783,12 @@
       muxing_app_(NULL),
       timecode_scale_(1000000ULL),
       writing_app_(NULL),
-      duration_pos_(-1) {
-}
+      date_utc_(LLONG_MIN),
+      duration_pos_(-1) {}
 
 SegmentInfo::~SegmentInfo() {
-  delete [] muxing_app_;
-  delete [] writing_app_;
+  delete[] muxing_app_;
+  delete[] writing_app_;
 }
 
 bool SegmentInfo::Init() {
@@ -1904,26 +1799,16 @@
   GetVersion(&major, &minor, &build, &revision);
   char temp[256];
 #ifdef _MSC_VER
-  sprintf_s(temp,
-            sizeof(temp)/sizeof(temp[0]),
-            "libwebm-%d.%d.%d.%d",
-            major,
-            minor,
-            build,
-            revision);
+  sprintf_s(temp, sizeof(temp) / sizeof(temp[0]), "libwebm-%d.%d.%d.%d", major,
+            minor, build, revision);
 #else
-  snprintf(temp,
-           sizeof(temp)/sizeof(temp[0]),
-           "libwebm-%d.%d.%d.%d",
-           major,
-           minor,
-           build,
-           revision);
+  snprintf(temp, sizeof(temp) / sizeof(temp[0]), "libwebm-%d.%d.%d.%d", major,
+           minor, build, revision);
 #endif
 
   const size_t app_len = strlen(temp) + 1;
 
-  delete [] muxing_app_;
+  delete[] muxing_app_;
 
   muxing_app_ = new (std::nothrow) char[app_len];  // NOLINT
   if (!muxing_app_)
@@ -1955,8 +1840,7 @@
       if (writer->Position(duration_pos_))
         return false;
 
-      if (!WriteEbmlElement(writer,
-                            kMkvDuration,
+      if (!WriteEbmlElement(writer, kMkvDuration,
                             static_cast<float>(duration_)))
         return false;
 
@@ -1975,6 +1859,8 @@
   uint64 size = EbmlElementSize(kMkvTimecodeScale, timecode_scale_);
   if (duration_ > 0.0)
     size += EbmlElementSize(kMkvDuration, static_cast<float>(duration_));
+  if (date_utc_ != LLONG_MIN)
+    size += EbmlDateElementSize(kMkvDateUTC, date_utc_);
   size += EbmlElementSize(kMkvMuxingApp, muxing_app_);
   size += EbmlElementSize(kMkvWritingApp, writing_app_);
 
@@ -1996,6 +1882,9 @@
       return false;
   }
 
+  if (date_utc_ != LLONG_MIN)
+    WriteEbmlDateElement(writer, kMkvDateUTC, date_utc_);
+
   if (!WriteEbmlElement(writer, kMkvMuxingApp, muxing_app_))
     return false;
   if (!WriteEbmlElement(writer, kMkvWritingApp, writing_app_))
@@ -2022,7 +1911,7 @@
     strcpy(temp_str, app);
 #endif
 
-    delete [] muxing_app_;
+    delete[] muxing_app_;
     muxing_app_ = temp_str;
   }
 }
@@ -2040,7 +1929,7 @@
     strcpy(temp_str, app);
 #endif
 
-    delete [] writing_app_;
+    delete[] writing_app_;
     writing_app_ = temp_str;
   }
 }
@@ -2093,7 +1982,7 @@
       Cluster* const cluster = cluster_list_[i];
       delete cluster;
     }
-    delete [] cluster_list_;
+    delete[] cluster_list_;
   }
 
   if (frames_) {
@@ -2101,11 +1990,11 @@
       Frame* const frame = frames_[i];
       delete frame;
     }
-    delete [] frames_;
+    delete[] frames_;
   }
 
-  delete [] chunk_name_;
-  delete [] chunking_base_name_;
+  delete[] chunk_name_;
+  delete[] chunking_base_name_;
 
   if (chunk_writer_cluster_) {
     chunk_writer_cluster_->Close();
@@ -2121,8 +2010,7 @@
   }
 }
 
-void Segment::MoveCuesBeforeClustersHelper(uint64 diff,
-                                           int32 index,
+void Segment::MoveCuesBeforeClustersHelper(uint64 diff, int32 index,
                                            uint64* cues_size) {
   const uint64 old_cues_size = *cues_size;
   CuePoint* const cue_point = cues_.GetCueByIndex(index);
@@ -2139,9 +2027,9 @@
   //    Let d = a + b + c. Now d is the new size of the Cues element which is
   //                       passed on to the next recursive call.
   const uint64 cue_point_size_diff = cue_point->Size() - old_cue_point_size;
-  const uint64 cue_size_diff = GetCodedUIntSize(*cues_size +
-                                                cue_point_size_diff) -
-                               GetCodedUIntSize(*cues_size);
+  const uint64 cue_size_diff =
+      GetCodedUIntSize(*cues_size + cue_point_size_diff) -
+      GetCodedUIntSize(*cues_size);
   *cues_size += cue_point_size_diff + cue_size_diff;
   diff = *cues_size - old_cues_size;
   if (diff > 0) {
@@ -2187,8 +2075,8 @@
                                             IMkvWriter* writer) {
   if (!writer->Seekable() || chunking_)
     return false;
-  const int64 cluster_offset = cluster_list_[0]->size_position() -
-                               GetUIntSize(kMkvCluster);
+  const int64 cluster_offset =
+      cluster_list_[0]->size_position() - GetUIntSize(kMkvCluster);
 
   // Copy the headers.
   if (!ChunkedCopy(reader, writer, 0, cluster_offset))
@@ -2214,8 +2102,7 @@
   const int64 pos = writer->Position();
   const int64 segment_size = writer->Position() - payload_pos_;
   if (writer->Position(size_position_) ||
-      WriteUIntSize(writer, segment_size, 8) ||
-      writer->Position(pos))
+      WriteUIntSize(writer, segment_size, 8) || writer->Position(pos))
     return false;
   return true;
 }
@@ -2227,7 +2114,7 @@
   if (mode_ == kFile) {
     if (cluster_list_size_ > 0) {
       // Update last cluster's size
-      Cluster* const old_cluster = cluster_list_[cluster_list_size_-1];
+      Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
 
       if (!old_cluster || !old_cluster->Finalize())
         return false;
@@ -2258,7 +2145,7 @@
         return false;
 
       const bool cues_open = chunk_writer_cues_->Open(name);
-      delete [] name;
+      delete[] name;
       if (!cues_open)
         return false;
     }
@@ -2321,9 +2208,7 @@
   return track;
 }
 
-Chapter* Segment::AddChapter() {
-  return chapters_.AddChapter(&seed_);
-}
+Chapter* Segment::AddChapter() { return chapters_.AddChapter(&seed_); }
 
 uint64 Segment::AddVideoTrack(int32 width, int32 height, int32 number) {
   VideoTrack* const track = new (std::nothrow) VideoTrack(&seed_);  // NOLINT
@@ -2342,10 +2227,10 @@
 }
 
 bool Segment::AddCuePoint(uint64 timestamp, uint64 track) {
-  if (cluster_list_size_  < 1)
+  if (cluster_list_size_ < 1)
     return false;
 
-  const Cluster* const cluster = cluster_list_[cluster_list_size_-1];
+  const Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
   if (!cluster)
     return false;
 
@@ -2364,9 +2249,7 @@
   return true;
 }
 
-uint64 Segment::AddAudioTrack(int32 sample_rate,
-                              int32 channels,
-                              int32 number) {
+uint64 Segment::AddAudioTrack(int32 sample_rate, int32 channels, int32 number) {
   AudioTrack* const track = new (std::nothrow) AudioTrack(&seed_);  // NOLINT
   if (!track)
     return 0;
@@ -2381,11 +2264,8 @@
   return track->number();
 }
 
-bool Segment::AddFrame(const uint8* frame,
-                       uint64 length,
-                       uint64 track_number,
-                       uint64 timestamp,
-                       bool is_key) {
+bool Segment::AddFrame(const uint8* frame, uint64 length, uint64 track_number,
+                       uint64 timestamp, bool is_key) {
   if (!frame)
     return false;
 
@@ -2426,11 +2306,7 @@
   const uint64 timecode_scale = segment_info_.timecode_scale();
   const uint64 abs_timecode = timestamp / timecode_scale;
 
-  if (!cluster->AddFrame(frame,
-                         length,
-                         track_number,
-                         abs_timecode,
-                         is_key))
+  if (!cluster->AddFrame(frame, length, track_number, abs_timecode, is_key))
     return false;
 
   if (new_cuepoint_ && cues_track_ == track_number) {
@@ -2444,13 +2320,10 @@
   return true;
 }
 
-bool Segment::AddFrameWithAdditional(const uint8* frame,
-                                     uint64 length,
+bool Segment::AddFrameWithAdditional(const uint8* frame, uint64 length,
                                      const uint8* additional,
-                                     uint64 additional_length,
-                                     uint64 add_id,
-                                     uint64 track_number,
-                                     uint64 timestamp,
+                                     uint64 additional_length, uint64 add_id,
+                                     uint64 track_number, uint64 timestamp,
                                      bool is_key) {
   if (frame == NULL || additional == NULL)
     return false;
@@ -2492,14 +2365,9 @@
   const uint64 timecode_scale = segment_info_.timecode_scale();
   const uint64 abs_timecode = timestamp / timecode_scale;
 
-  if (!cluster->AddFrameWithAdditional(frame,
-                                       length,
-                                       additional,
-                                       additional_length,
-                                       add_id,
-                                       track_number,
-                                       abs_timecode,
-                                       is_key))
+  if (!cluster->AddFrameWithAdditional(frame, length, additional,
+                                       additional_length, add_id, track_number,
+                                       abs_timecode, is_key))
     return false;
 
   if (new_cuepoint_ && cues_track_ == track_number) {
@@ -2513,11 +2381,9 @@
   return true;
 }
 
-bool Segment::AddFrameWithDiscardPadding(const uint8* frame,
-                                         uint64 length,
+bool Segment::AddFrameWithDiscardPadding(const uint8* frame, uint64 length,
                                          int64 discard_padding,
-                                         uint64 track_number,
-                                         uint64 timestamp,
+                                         uint64 track_number, uint64 timestamp,
                                          bool is_key) {
   if (frame == NULL || discard_padding <= 0)
     return false;
@@ -2560,11 +2426,8 @@
   const uint64 timecode_scale = segment_info_.timecode_scale();
   const uint64 abs_timecode = timestamp / timecode_scale;
 
-  if (!cluster->AddFrameWithDiscardPadding(frame, length,
-                                           discard_padding,
-                                           track_number,
-                                           abs_timecode,
-                                           is_key)) {
+  if (!cluster->AddFrameWithDiscardPadding(
+          frame, length, discard_padding, track_number, abs_timecode, is_key)) {
     return false;
   }
 
@@ -2579,10 +2442,8 @@
   return true;
 }
 
-bool Segment::AddMetadata(const uint8* frame,
-                          uint64 length,
-                          uint64 track_number,
-                          uint64 timestamp_ns,
+bool Segment::AddMetadata(const uint8* frame, uint64 length,
+                          uint64 track_number, uint64 timestamp_ns,
                           uint64 duration_ns) {
   if (!frame)
     return false;
@@ -2600,7 +2461,7 @@
   if (cluster_list_size_ < 1)
     return false;
 
-  Cluster* const cluster = cluster_list_[cluster_list_size_-1];
+  Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
 
   if (!cluster)
     return false;
@@ -2609,10 +2470,7 @@
   const uint64 abs_timecode = timestamp_ns / timecode_scale;
   const uint64 duration_timecode = duration_ns / timecode_scale;
 
-  if (!cluster->AddMetadata(frame,
-                            length,
-                            track_number,
-                            abs_timecode,
+  if (!cluster->AddMetadata(frame, length, track_number, abs_timecode,
                             duration_timecode))
     return false;
 
@@ -2625,40 +2483,25 @@
 bool Segment::AddGenericFrame(const Frame* frame) {
   last_block_duration_ = frame->duration();
   if (!tracks_.TrackIsAudio(frame->track_number()) &&
-      !tracks_.TrackIsVideo(frame->track_number()) &&
-      frame->duration() > 0) {
-    return AddMetadata(frame->frame(),
-                       frame->length(),
-                       frame->track_number(),
-                       frame->timestamp(),
-                       frame->duration());
+      !tracks_.TrackIsVideo(frame->track_number()) && frame->duration() > 0) {
+    return AddMetadata(frame->frame(), frame->length(), frame->track_number(),
+                       frame->timestamp(), frame->duration());
   } else if (frame->additional() && frame->additional_length() > 0) {
-    return AddFrameWithAdditional(frame->frame(),
-                                  frame->length(),
-                                  frame->additional(),
-                                  frame->additional_length(),
-                                  frame->add_id(),
-                                  frame->track_number(),
-                                  frame->timestamp(),
-                                  frame->is_key());
+    return AddFrameWithAdditional(
+        frame->frame(), frame->length(), frame->additional(),
+        frame->additional_length(), frame->add_id(), frame->track_number(),
+        frame->timestamp(), frame->is_key());
   } else if (frame->discard_padding() > 0) {
-    return AddFrameWithDiscardPadding(frame->frame(), frame->length(),
-                                      frame->discard_padding(),
-                                      frame->track_number(),
-                                      frame->timestamp(),
-                                      frame->is_key());
+    return AddFrameWithDiscardPadding(
+        frame->frame(), frame->length(), frame->discard_padding(),
+        frame->track_number(), frame->timestamp(), frame->is_key());
   } else {
-    return AddFrame(frame->frame(),
-                    frame->length(),
-                    frame->track_number(),
-                    frame->timestamp(),
-                    frame->is_key());
+    return AddFrame(frame->frame(), frame->length(), frame->track_number(),
+                    frame->timestamp(), frame->is_key());
   }
 }
 
-void Segment::OutputCues(bool output_cues) {
-  output_cues_ = output_cues;
-}
+void Segment::OutputCues(bool output_cues) { output_cues_ = output_cues; }
 
 bool Segment::SetChunking(bool chunking, const char* filename) {
   if (chunk_count_ > 0)
@@ -2683,7 +2526,7 @@
     strcpy(temp, filename);
 #endif
 
-    delete [] chunking_base_name_;
+    delete[] chunking_base_name_;
     chunking_base_name_ = temp;
 
     if (!UpdateChunkName("chk", &chunk_name_))
@@ -2723,7 +2566,7 @@
     strcat(header, ".hdr");
 #endif
     if (!chunk_writer_header_->Open(header)) {
-      delete [] header;
+      delete[] header;
       return false;
     }
 
@@ -2731,7 +2574,7 @@
     writer_cues_ = chunk_writer_cues_;
     writer_header_ = chunk_writer_header_;
 
-    delete [] header;
+    delete[] header;
   }
 
   chunking_ = chunking;
@@ -2748,9 +2591,7 @@
   return true;
 }
 
-void Segment::ForceNewClusterOnNextFrame() {
-  force_new_cluster_ = true;
-}
+void Segment::ForceNewClusterOnNextFrame() { force_new_cluster_ = true; }
 
 Track* Segment::GetTrackByNumber(uint64 track_number) const {
   return tracks_.GetTrackByNumber(track_number);
@@ -2775,7 +2616,7 @@
   if (SerializeInt(writer_header_, kEbmlUnknownValue, 8))
     return false;
 
-  payload_pos_ =  writer_header_->Position();
+  payload_pos_ = writer_header_->Position();
 
   if (mode_ == kFile && writer_header_->Seekable()) {
     // Set the duration > 0.0 so SegmentInfo will write out the duration. When
@@ -2819,8 +2660,7 @@
 // Here we are testing whether to create a new cluster, given a frame
 // having time frame_timestamp_ns.
 //
-int Segment::TestFrame(uint64 track_number,
-                       uint64 frame_timestamp_ns,
+int Segment::TestFrame(uint64 track_number, uint64 frame_timestamp_ns,
                        bool is_key) const {
   if (force_new_cluster_)
     return 1;
@@ -2850,7 +2690,7 @@
   // so this indicates a bug somewhere in our algorithm.
 
   if (frame_timecode < last_cluster_timecode)  // should never happen
-    return -1;  // error
+    return -1;
 
   // If the frame has a timestamp significantly larger than the last
   // cluster (in Matroska, cluster-relative timestamps are serialized
@@ -2900,7 +2740,7 @@
     const int32 new_capacity =
         (cluster_list_capacity_ <= 0) ? 1 : cluster_list_capacity_ * 2;
     Cluster** const clusters =
-        new (std::nothrow) Cluster*[new_capacity];  // NOLINT
+        new (std::nothrow) Cluster* [new_capacity];  // NOLINT
     if (!clusters)
       return false;
 
@@ -2908,7 +2748,7 @@
       clusters[i] = cluster_list_[i];
     }
 
-    delete [] cluster_list_;
+    delete[] cluster_list_;
 
     cluster_list_ = clusters;
     cluster_list_capacity_ = new_capacity;
@@ -2968,8 +2808,7 @@
 }
 
 bool Segment::DoNewClusterProcessing(uint64 track_number,
-                                     uint64 frame_timestamp_ns,
-                                     bool is_key) {
+                                     uint64 frame_timestamp_ns, bool is_key) {
   for (;;) {
     // Based on the characteristics of the current frame and current
     // cluster, decide whether to create a new cluster.
@@ -2977,12 +2816,12 @@
     if (result < 0)  // error
       return false;
 
-  // Always set force_new_cluster_ to false after TestFrame.
-  force_new_cluster_ = false;
+    // Always set force_new_cluster_ to false after TestFrame.
+    force_new_cluster_ = false;
 
-  // A non-zero result means create a new cluster.
-  if (result > 0 && !MakeNewCluster(frame_timestamp_ns))
-    return false;
+    // A non-zero result means create a new cluster.
+    if (result > 0 && !MakeNewCluster(frame_timestamp_ns))
+      return false;
 
     // Write queued (audio) frames.
     const int frame_count = WriteFramesAll();
@@ -3051,14 +2890,14 @@
     return false;
 
 #ifdef _MSC_VER
-  strcpy_s(str, length-strlen(ext_chk), chunking_base_name_);
+  strcpy_s(str, length - strlen(ext_chk), chunking_base_name_);
   strcat_s(str, length, ext_chk);
 #else
   strcpy(str, chunking_base_name_);
   strcat(str, ext_chk);
 #endif
 
-  delete [] *name;
+  delete[] * name;
   *name = str;
 
   return true;
@@ -3093,7 +2932,7 @@
     if (new_capacity < 1)
       return false;
 
-    Frame** const frames = new (std::nothrow) Frame*[new_capacity];  // NOLINT
+    Frame** const frames = new (std::nothrow) Frame* [new_capacity];  // NOLINT
     if (!frames)
       return false;
 
@@ -3101,7 +2940,7 @@
       frames[i] = frames_[i];
     }
 
-    delete [] frames_;
+    delete[] frames_;
     frames_ = frames;
     frames_capacity_ = new_capacity;
   }
@@ -3118,7 +2957,7 @@
   if (cluster_list_size_ < 1)
     return -1;
 
-  Cluster* const cluster = cluster_list_[cluster_list_size_-1];
+  Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
 
   if (!cluster)
     return -1;
@@ -3131,19 +2970,14 @@
     const uint64 frame_timecode = frame_timestamp / timecode_scale;
 
     if (frame->discard_padding() > 0) {
-      if (!cluster->AddFrameWithDiscardPadding(frame->frame(),
-                                               frame->length(),
-                                               frame->discard_padding(),
-                                               frame->track_number(),
-                                               frame_timecode,
-                                               frame->is_key())) {
+      if (!cluster->AddFrameWithDiscardPadding(
+              frame->frame(), frame->length(), frame->discard_padding(),
+              frame->track_number(), frame_timecode, frame->is_key())) {
         return -1;
       }
     } else {
-      if (!cluster->AddFrame(frame->frame(),
-                             frame->length(),
-                             frame->track_number(),
-                             frame_timecode,
+      if (!cluster->AddFrame(frame->frame(), frame->length(),
+                             frame->track_number(), frame_timecode,
                              frame->is_key())) {
         return -1;
       }
@@ -3175,7 +3009,7 @@
     if (!frames_)
       return false;
 
-    Cluster* const cluster = cluster_list_[cluster_list_size_-1];
+    Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
     if (!cluster)
       return false;
 
@@ -3190,25 +3024,21 @@
       if (frame_curr->timestamp() > timestamp)
         break;
 
-      const Frame* const frame_prev = frames_[i-1];
+      const Frame* const frame_prev = frames_[i - 1];
       const uint64 frame_timestamp = frame_prev->timestamp();
       const uint64 frame_timecode = frame_timestamp / timecode_scale;
       const int64 discard_padding = frame_prev->discard_padding();
 
       if (discard_padding > 0) {
-        if (!cluster->AddFrameWithDiscardPadding(frame_prev->frame(),
-                                                 frame_prev->length(),
-                                                 discard_padding,
-                                                 frame_prev->track_number(),
-                                                 frame_timecode,
-                                                 frame_prev->is_key())) {
+        if (!cluster->AddFrameWithDiscardPadding(
+                frame_prev->frame(), frame_prev->length(), discard_padding,
+                frame_prev->track_number(), frame_timecode,
+                frame_prev->is_key())) {
           return false;
         }
       } else {
-        if (!cluster->AddFrame(frame_prev->frame(),
-                               frame_prev->length(),
-                               frame_prev->track_number(),
-                               frame_timecode,
+        if (!cluster->AddFrame(frame_prev->frame(), frame_prev->length(),
+                               frame_prev->track_number(), frame_timecode,
                                frame_prev->is_key())) {
           return false;
         }
@@ -3232,7 +3062,7 @@
 
       const int32 new_frames_size = frames_size_ - shift_left;
       for (int32 i = 0; i < new_frames_size; ++i) {
-        frames_[i] = frames_[i+shift_left];
+        frames_[i] = frames_[i + shift_left];
       }
 
       frames_size_ = new_frames_size;
diff --git a/third_party/libwebm/mkvmuxer.hpp b/third_party/libwebm/mkvmuxer.hpp
index 63a315e..1c1c310 100644
--- a/third_party/libwebm/mkvmuxer.hpp
+++ b/third_party/libwebm/mkvmuxer.hpp
@@ -15,7 +15,7 @@
 // http://www.webmproject.org/code/specs/container/.
 
 namespace mkvparser {
-  class IMkvReader;
+class IMkvReader;
 }  // end namespace
 
 namespace mkvmuxer {
@@ -60,8 +60,8 @@
 bool WriteEbmlHeader(IMkvWriter* writer);
 
 // Copies in Chunk from source to destination between the given byte positions
-bool ChunkedCopy(mkvparser::IMkvReader* source, IMkvWriter* dst,
-                 int64 start, int64 size);
+bool ChunkedCopy(mkvparser::IMkvReader* source, IMkvWriter* dst, int64 start,
+                 int64 size);
 
 ///////////////////////////////////////////////////////////////
 // Class to hold data the will be written to a block.
@@ -74,8 +74,7 @@
   bool Init(const uint8* frame, uint64 length);
 
   // Copies |additional| data into |additional_|. Returns true on success.
-  bool AddAdditionalData(const uint8* additional, uint64 length,
-                         uint64 add_id);
+  bool AddAdditionalData(const uint8* additional, uint64 length, uint64 add_id);
 
   uint64 add_id() const { return add_id_; }
   const uint8* additional() const { return additional_; }
@@ -223,9 +222,7 @@
 // ContentEncAESSettings element
 class ContentEncAESSettings {
  public:
-  enum {
-    kCTR = 1
-  };
+  enum { kCTR = 1 };
 
   ContentEncAESSettings();
   ~ContentEncAESSettings() {}
@@ -353,6 +350,10 @@
     seek_pre_roll_ = seek_pre_roll;
   }
   uint64 seek_pre_roll() const { return seek_pre_roll_; }
+  void set_default_duration(uint64 default_duration) {
+    default_duration_ = default_duration;
+  }
+  uint64 default_duration() const { return default_duration_; }
 
   uint64 codec_private_length() const { return codec_private_length_; }
   uint32 content_encoding_entries_size() const {
@@ -360,7 +361,7 @@
   }
 
  private:
-  // Track element names
+  // Track element names.
   char* codec_id_;
   uint8* codec_private_;
   char* language_;
@@ -371,6 +372,7 @@
   uint64 uid_;
   uint64 codec_delay_;
   uint64 seek_pre_roll_;
+  uint64 default_duration_;
 
   // Size of the CodecPrivate data in bytes.
   uint64 codec_private_length_;
@@ -391,16 +393,13 @@
   // Supported modes for stereo 3D.
   enum StereoMode {
     kMono = 0,
-    kSideBySideLeftIsFirst  = 1,
-    kTopBottomRightIsFirst  = 2,
-    kTopBottomLeftIsFirst   = 3,
+    kSideBySideLeftIsFirst = 1,
+    kTopBottomRightIsFirst = 2,
+    kTopBottomLeftIsFirst = 3,
     kSideBySideRightIsFirst = 11
   };
 
-  enum AlphaMode {
-    kNoAlpha = 0,
-    kAlpha  = 1
-  };
+  enum AlphaMode { kNoAlpha = 0, kAlpha = 1 };
 
   // The |seed| parameter is used to synthesize a UID for the track.
   explicit VideoTrack(unsigned int* seed);
@@ -484,10 +483,7 @@
 class Tracks {
  public:
   // Audio and video type defined by the Matroska specs.
-  enum {
-    kVideo = 0x1,
-    kAudio = 0x2
-  };
+  enum { kVideo = 0x1, kAudio = 0x2 };
   // Opus, Vorbis, VP8, and VP9 codec ids defined by the Matroska specs.
   static const char kOpusCodecId[];
   static const char kVorbisCodecId[];
@@ -544,8 +540,7 @@
 
   // Converts the nanosecond start and stop times of this chapter to
   // their corresponding timecode values, and stores them that way.
-  void set_time(const Segment& segment,
-                uint64 start_time_ns,
+  void set_time(const Segment& segment, uint64 start_time_ns,
                 uint64 end_time_ns);
 
   // Sets the uid for this chapter. Primarily used to enable
@@ -568,9 +563,7 @@
   //  http://www.iana.org/domains/root/db/
   //
   // The function returns false if the string could not be allocated.
-  bool add_string(const char* title,
-                  const char* language,
-                  const char* country);
+  bool add_string(const char* title, const char* language, const char* country);
 
  private:
   friend class Chapters;
@@ -724,9 +717,7 @@
   //   timecode:     Absolute (not relative to cluster) timestamp of the
   //                 frame, expressed in timecode units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrame(const uint8* frame,
-                uint64 length,
-                uint64 track_number,
+  bool AddFrame(const uint8* frame, uint64 length, uint64 track_number,
                 uint64 timecode,  // timecode units (absolute)
                 bool is_key);
 
@@ -743,14 +734,10 @@
   //   abs_timecode: Absolute (not relative to cluster) timestamp of the
   //                 frame, expressed in timecode units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithAdditional(const uint8* frame,
-                              uint64 length,
-                              const uint8* additional,
-                              uint64 additional_length,
-                              uint64 add_id,
-                              uint64 track_number,
-                              uint64 abs_timecode,
-                              bool is_key);
+  bool AddFrameWithAdditional(const uint8* frame, uint64 length,
+                              const uint8* additional, uint64 additional_length,
+                              uint64 add_id, uint64 track_number,
+                              uint64 abs_timecode, bool is_key);
 
   // Adds a frame to be output in the file. The frame is written out through
   // |writer_| if successful. Returns true on success.
@@ -763,12 +750,9 @@
   //   abs_timecode: Absolute (not relative to cluster) timestamp of the
   //                 frame, expressed in timecode units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithDiscardPadding(const uint8* frame,
-                                  uint64 length,
-                                  int64 discard_padding,
-                                  uint64 track_number,
-                                  uint64 abs_timecode,
-                                  bool is_key);
+  bool AddFrameWithDiscardPadding(const uint8* frame, uint64 length,
+                                  int64 discard_padding, uint64 track_number,
+                                  uint64 abs_timecode, bool is_key);
 
   // Writes a frame of metadata to the output medium; returns true on
   // success.
@@ -784,11 +768,8 @@
   // The metadata frame is written as a block group, with a duration
   // sub-element but no reference time sub-elements (indicating that
   // it is considered a keyframe, per Matroska semantics).
-  bool AddMetadata(const uint8* frame,
-                   uint64 length,
-                   uint64 track_number,
-                   uint64 timecode,  // timecode units (absolute)
-                   uint64 duration);  // timecode units
+  bool AddMetadata(const uint8* frame, uint64 length, uint64 track_number,
+                   uint64 timecode, uint64 duration);
 
   // Increments the size of the cluster's data in bytes.
   void AddPayloadSize(uint64 size);
@@ -809,34 +790,26 @@
  private:
   //  Signature that matches either of WriteSimpleBlock or WriteMetadataBlock
   //  in the muxer utilities package.
-  typedef uint64 (*WriteBlock)(IMkvWriter* writer,
-                               const uint8* data,
-                               uint64 length,
-                               uint64 track_number,
-                               int64 timecode,
-                               uint64 generic_arg);
+  typedef uint64 (*WriteBlock)(IMkvWriter* writer, const uint8* data,
+                               uint64 length, uint64 track_number,
+                               int64 timecode, uint64 generic_arg);
 
   //  Signature that matches WriteBlockWithAdditional
   //  in the muxer utilities package.
-  typedef uint64 (*WriteBlockAdditional)(IMkvWriter* writer,
-                                         const uint8* data,
-                                         uint64 length,
-                                         const uint8* additional,
+  typedef uint64 (*WriteBlockAdditional)(IMkvWriter* writer, const uint8* data,
+                                         uint64 length, const uint8* additional,
                                          uint64 add_id,
                                          uint64 additional_length,
-                                         uint64 track_number,
-                                         int64 timecode,
+                                         uint64 track_number, int64 timecode,
                                          uint64 is_key);
 
   //  Signature that matches WriteBlockWithDiscardPadding
   //  in the muxer utilities package.
   typedef uint64 (*WriteBlockDiscardPadding)(IMkvWriter* writer,
-                                             const uint8* data,
-                                             uint64 length,
+                                             const uint8* data, uint64 length,
                                              int64 discard_padding,
                                              uint64 track_number,
-                                             int64 timecode,
-                                             uint64 is_key);
+                                             int64 timecode, uint64 is_key);
 
   // Utility method that confirms that blocks can still be added, and that the
   // cluster header has been written. Used by |DoWriteBlock*|. Returns true
@@ -858,27 +831,20 @@
   int64 GetRelativeTimecode(int64 abs_timecode) const;
 
   //  Used to implement AddFrame and AddMetadata.
-  bool DoWriteBlock(const uint8* frame,
-                    uint64 length,
-                    uint64 track_number,
-                    uint64 absolute_timecode,
-                    uint64 generic_arg,
+  bool DoWriteBlock(const uint8* frame, uint64 length, uint64 track_number,
+                    uint64 absolute_timecode, uint64 generic_arg,
                     WriteBlock write_block);
 
   // Used to implement AddFrameWithAdditional
-  bool DoWriteBlockWithAdditional(const uint8* frame,
-                                  uint64 length,
+  bool DoWriteBlockWithAdditional(const uint8* frame, uint64 length,
                                   const uint8* additional,
-                                  uint64 additional_length,
-                                  uint64 add_id,
-                                  uint64 track_number,
-                                  uint64 absolute_timecode,
+                                  uint64 additional_length, uint64 add_id,
+                                  uint64 track_number, uint64 absolute_timecode,
                                   uint64 generic_arg,
                                   WriteBlockAdditional write_block);
 
   // Used to implement AddFrameWithDiscardPadding
-  bool DoWriteBlockWithDiscardPadding(const uint8* frame,
-                                      uint64 length,
+  bool DoWriteBlockWithDiscardPadding(const uint8* frame, uint64 length,
                                       int64 discard_padding,
                                       uint64 track_number,
                                       uint64 absolute_timecode,
@@ -993,6 +959,8 @@
   uint64 timecode_scale() const { return timecode_scale_; }
   void set_writing_app(const char* app);
   const char* writing_app() const { return writing_app_; }
+  void set_date_utc(int64 date_utc) { date_utc_ = date_utc; }
+  int64 date_utc() const { return date_utc_; }
 
  private:
   // Segment Information element names.
@@ -1004,6 +972,8 @@
   uint64 timecode_scale_;
   // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
   char* writing_app_;
+  // LLONG_MIN when DateUTC is not set.
+  int64 date_utc_;
 
   // The file position of the duration element.
   int64 duration_pos_;
@@ -1019,10 +989,7 @@
 //  |Init| must be called before any other method in this class.
 class Segment {
  public:
-  enum Mode {
-    kLive = 0x1,
-    kFile = 0x2
-  };
+  enum Mode { kLive = 0x1, kFile = 0x2 };
 
   enum CuesPosition {
     kAfterClusters = 0x0,  // Position Cues after Clusters - Default
@@ -1070,11 +1037,8 @@
   //                 functions.
   //   timestamp:    Timestamp of the frame in nanoseconds from 0.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrame(const uint8* frame,
-                uint64 length,
-                uint64 track_number,
-                uint64 timestamp_ns,
-                bool is_key);
+  bool AddFrame(const uint8* frame, uint64 length, uint64 track_number,
+                uint64 timestamp_ns, bool is_key);
 
   // Writes a frame of metadata to the output medium; returns true on
   // success.
@@ -1090,11 +1054,8 @@
   // The metadata frame is written as a block group, with a duration
   // sub-element but no reference time sub-elements (indicating that
   // it is considered a keyframe, per Matroska semantics).
-  bool AddMetadata(const uint8* frame,
-                   uint64 length,
-                   uint64 track_number,
-                   uint64 timestamp_ns,
-                   uint64 duration_ns);
+  bool AddMetadata(const uint8* frame, uint64 length, uint64 track_number,
+                   uint64 timestamp_ns, uint64 duration_ns);
 
   // Writes a frame with additional data to the output medium; returns true on
   // success.
@@ -1109,14 +1070,10 @@
   //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
   //                 units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithAdditional(const uint8* frame,
-                              uint64 length,
-                              const uint8* additional,
-                              uint64 additional_length,
-                              uint64 add_id,
-                              uint64 track_number,
-                              uint64 timestamp,
-                              bool is_key);
+  bool AddFrameWithAdditional(const uint8* frame, uint64 length,
+                              const uint8* additional, uint64 additional_length,
+                              uint64 add_id, uint64 track_number,
+                              uint64 timestamp, bool is_key);
 
   // Writes a frame with DiscardPadding to the output medium; returns true on
   // success.
@@ -1129,12 +1086,9 @@
   //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
   //                 units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithDiscardPadding(const uint8* frame,
-                                  uint64 length,
-                                  int64 discard_padding,
-                                  uint64 track_number,
-                                  uint64 timestamp,
-                                  bool is_key);
+  bool AddFrameWithDiscardPadding(const uint8* frame, uint64 length,
+                                  int64 discard_padding, uint64 track_number,
+                                  uint64 timestamp, bool is_key);
 
   // Writes a Frame to the output medium. Chooses the correct way of writing
   // the frame (Block vs SimpleBlock) based on the parameters passed.
@@ -1268,7 +1222,6 @@
   // was necessary but creation was not successful.
   bool DoNewClusterProcessing(uint64 track_num, uint64 timestamp_ns, bool key);
 
-
   // Adjusts Cue Point values (to place Cues before Clusters) so that they
   // reflect the correct offsets.
   void MoveCuesBeforeClusters();
@@ -1398,6 +1351,6 @@
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Segment);
 };
 
-}  //end namespace mkvmuxer
+}  // end namespace mkvmuxer
 
-#endif //MKVMUXER_HPP
+#endif  // MKVMUXER_HPP
diff --git a/third_party/libwebm/mkvmuxertypes.hpp b/third_party/libwebm/mkvmuxertypes.hpp
index 2c66fd2..d0fc9fe 100644
--- a/third_party/libwebm/mkvmuxertypes.hpp
+++ b/third_party/libwebm/mkvmuxertypes.hpp
@@ -1,30 +1,30 @@
-// Copyright (c) 2012 The WebM project authors. All Rights Reserved.

-//

-// Use of this source code is governed by a BSD-style license

-// that can be found in the LICENSE file in the root of the source

-// tree. An additional intellectual property rights grant can be found

-// in the file PATENTS.  All contributing project authors may

-// be found in the AUTHORS file in the root of the source tree.

-

-#ifndef MKVMUXERTYPES_HPP

-#define MKVMUXERTYPES_HPP

-

-// Copied from Chromium basictypes.h

-// A macro to disallow the copy constructor and operator= functions

-// This should be used in the private: declarations for a class

-#define LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TypeName) \

-  TypeName(const TypeName&);               \

-  void operator=(const TypeName&)

-

-namespace mkvmuxer {

-

-typedef unsigned char      uint8;

-typedef short              int16;

-typedef int                int32;

-typedef unsigned int       uint32;

-typedef long long          int64;

-typedef unsigned long long uint64;

-

-}  //end namespace mkvmuxer

-

-#endif // MKVMUXERTYPES_HPP

+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXERTYPES_HPP
+#define MKVMUXERTYPES_HPP
+
+// Copied from Chromium basictypes.h
+// A macro to disallow the copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#define LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);                       \
+  void operator=(const TypeName&)
+
+namespace mkvmuxer {
+
+typedef unsigned char uint8;
+typedef short int16;
+typedef int int32;
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+
+}  // end namespace mkvmuxer
+
+#endif  // MKVMUXERTYPES_HPP
diff --git a/third_party/libwebm/mkvmuxerutil.cpp b/third_party/libwebm/mkvmuxerutil.cpp
index 18060e9..3fb9bc9 100644
--- a/third_party/libwebm/mkvmuxerutil.cpp
+++ b/third_party/libwebm/mkvmuxerutil.cpp
@@ -29,6 +29,13 @@
 
 namespace mkvmuxer {
 
+namespace {
+
+// Date elements are always 8 octets in size.
+const int kDateElementSize = 8;
+
+}  // namespace
+
 int32 GetCodedUIntSize(uint64 value) {
   if (value < 0x000000000000007FULL)
     return 1;
@@ -92,7 +99,7 @@
   return ebml_size;
 }
 
-uint64 EbmlElementSize(uint64 type, float /* value */ ) {
+uint64 EbmlElementSize(uint64 type, float /* value */) {
   // Size of EBML ID
   uint64 ebml_size = GetUIntSize(type);
 
@@ -137,6 +144,19 @@
   return ebml_size;
 }
 
+uint64 EbmlDateElementSize(uint64 type, int64 value) {
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += kDateElementSize;
+
+  // Size of Datasize
+  ebml_size++;
+
+  return ebml_size;
+}
+
 int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) {
   if (!writer || size < 1 || size > 8)
     return -1;
@@ -302,9 +322,7 @@
   return true;
 }
 
-bool WriteEbmlElement(IMkvWriter* writer,
-                      uint64 type,
-                      const uint8* value,
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
                       uint64 size) {
   if (!writer || !value || size < 1)
     return false;
@@ -321,12 +339,24 @@
   return true;
 }
 
-uint64 WriteSimpleBlock(IMkvWriter* writer,
-                        const uint8* data,
-                        uint64 length,
-                        uint64 track_number,
-                        int64 timecode,
-                        uint64 is_key) {
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  if (WriteUInt(writer, kDateElementSize))
+    return false;
+
+  if (SerializeInt(writer, value, kDateElementSize))
+    return false;
+
+  return true;
+}
+
+uint64 WriteSimpleBlock(IMkvWriter* writer, const uint8* data, uint64 length,
+                        uint64 track_number, int64 timecode, uint64 is_key) {
   if (!writer)
     return false;
 
@@ -372,7 +402,7 @@
     return 0;
 
   const uint64 element_size =
-    GetUIntSize(kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 + length;
+      GetUIntSize(kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 + length;
 
   return element_size;
 }
@@ -391,11 +421,8 @@
 //     Duration size
 //     (duration payload)
 //
-uint64 WriteMetadataBlock(IMkvWriter* writer,
-                          const uint8* data,
-                          uint64 length,
-                          uint64 track_number,
-                          int64 timecode,
+uint64 WriteMetadataBlock(IMkvWriter* writer, const uint8* data, uint64 length,
+                          uint64 track_number, int64 timecode,
                           uint64 duration) {
   // We don't backtrack when writing to the stream, so we must
   // pre-compute the BlockGroup size, by summing the sizes of each
@@ -487,47 +514,37 @@
 //        1 (Denotes Alpha)
 //      BlockAdditional
 //        Data
-uint64 WriteBlockWithAdditional(IMkvWriter* writer,
-                                const uint8* data,
-                                uint64 length,
-                                const uint8* additional,
-                                uint64 additional_length,
-                                uint64 add_id,
-                                uint64 track_number,
-                                int64 timecode,
+uint64 WriteBlockWithAdditional(IMkvWriter* writer, const uint8* data,
+                                uint64 length, const uint8* additional,
+                                uint64 additional_length, uint64 add_id,
+                                uint64 track_number, int64 timecode,
                                 uint64 is_key) {
   if (!data || !additional || length < 1 || additional_length < 1)
     return 0;
 
   const uint64 block_payload_size = 4 + length;
-  const uint64 block_elem_size = EbmlMasterElementSize(kMkvBlock,
-                                                       block_payload_size) +
-                                 block_payload_size;
-  const uint64 block_additional_elem_size = EbmlElementSize(kMkvBlockAdditional,
-                                                            additional,
-                                                            additional_length);
+  const uint64 block_elem_size =
+      EbmlMasterElementSize(kMkvBlock, block_payload_size) + block_payload_size;
+  const uint64 block_additional_elem_size =
+      EbmlElementSize(kMkvBlockAdditional, additional, additional_length);
   const uint64 block_addid_elem_size = EbmlElementSize(kMkvBlockAddID, add_id);
 
-  const uint64 block_more_payload_size = block_addid_elem_size +
-                                         block_additional_elem_size;
-  const uint64 block_more_elem_size = EbmlMasterElementSize(
-                                          kMkvBlockMore,
-                                          block_more_payload_size) +
-                                      block_more_payload_size;
+  const uint64 block_more_payload_size =
+      block_addid_elem_size + block_additional_elem_size;
+  const uint64 block_more_elem_size =
+      EbmlMasterElementSize(kMkvBlockMore, block_more_payload_size) +
+      block_more_payload_size;
   const uint64 block_additions_payload_size = block_more_elem_size;
-  const uint64 block_additions_elem_size = EbmlMasterElementSize(
-                                               kMkvBlockAdditions,
-                                               block_additions_payload_size) +
-                                           block_additions_payload_size;
-  const uint64 block_group_payload_size = block_elem_size +
-                                          block_additions_elem_size;
-  const uint64 block_group_elem_size = EbmlMasterElementSize(
-                                           kMkvBlockGroup,
-                                           block_group_payload_size) +
-                                       block_group_payload_size;
+  const uint64 block_additions_elem_size =
+      EbmlMasterElementSize(kMkvBlockAdditions, block_additions_payload_size) +
+      block_additions_payload_size;
+  const uint64 block_group_payload_size =
+      block_elem_size + block_additions_elem_size;
+  const uint64 block_group_elem_size =
+      EbmlMasterElementSize(kMkvBlockGroup, block_group_payload_size) +
+      block_group_payload_size;
 
-  if (!WriteEbmlMasterElement(writer, kMkvBlockGroup,
-                              block_group_payload_size))
+  if (!WriteEbmlMasterElement(writer, kMkvBlockGroup, block_group_payload_size))
     return 0;
 
   if (!WriteEbmlMasterElement(writer, kMkvBlock, block_payload_size))
@@ -558,8 +575,8 @@
   if (!WriteEbmlElement(writer, kMkvBlockAddID, add_id))
     return 0;
 
-  if (!WriteEbmlElement(writer, kMkvBlockAdditional,
-                        additional, additional_length))
+  if (!WriteEbmlElement(writer, kMkvBlockAdditional, additional,
+                        additional_length))
     return 0;
 
   return block_group_elem_size;
@@ -571,31 +588,25 @@
 //  Block
 //    Data
 //  DiscardPadding
-uint64 WriteBlockWithDiscardPadding(IMkvWriter* writer,
-                                    const uint8* data,
-                                    uint64 length,
-                                    int64 discard_padding,
-                                    uint64 track_number,
-                                    int64 timecode,
+uint64 WriteBlockWithDiscardPadding(IMkvWriter* writer, const uint8* data,
+                                    uint64 length, int64 discard_padding,
+                                    uint64 track_number, int64 timecode,
                                     uint64 is_key) {
   if (!data || length < 1 || discard_padding <= 0)
     return 0;
 
   const uint64 block_payload_size = 4 + length;
-  const uint64 block_elem_size = EbmlMasterElementSize(kMkvBlock,
-                                                       block_payload_size) +
-                                 block_payload_size;
-  const uint64 discard_padding_elem_size = EbmlElementSize(kMkvDiscardPadding,
-                                                           discard_padding);
-  const uint64 block_group_payload_size = block_elem_size +
-                                          discard_padding_elem_size;
-  const uint64 block_group_elem_size = EbmlMasterElementSize(
-                                           kMkvBlockGroup,
-                                           block_group_payload_size) +
-                                       block_group_payload_size;
+  const uint64 block_elem_size =
+      EbmlMasterElementSize(kMkvBlock, block_payload_size) + block_payload_size;
+  const uint64 discard_padding_elem_size =
+      EbmlElementSize(kMkvDiscardPadding, discard_padding);
+  const uint64 block_group_payload_size =
+      block_elem_size + discard_padding_elem_size;
+  const uint64 block_group_elem_size =
+      EbmlMasterElementSize(kMkvBlockGroup, block_group_payload_size) +
+      block_group_payload_size;
 
-  if (!WriteEbmlMasterElement(writer, kMkvBlockGroup,
-                              block_group_payload_size))
+  if (!WriteEbmlMasterElement(writer, kMkvBlockGroup, block_group_payload_size))
     return 0;
 
   if (!WriteEbmlMasterElement(writer, kMkvBlock, block_payload_size))
@@ -634,9 +645,9 @@
     return false;
 
   // Subtract one for the void ID and the coded size.
-  uint64 void_entry_size = size - 1 - GetCodedUIntSize(size-1);
-  uint64 void_size = EbmlMasterElementSize(kMkvVoid, void_entry_size) +
-                     void_entry_size;
+  uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
+  uint64 void_size =
+      EbmlMasterElementSize(kMkvVoid, void_entry_size) + void_entry_size;
 
   if (void_size != size)
     return 0;
@@ -684,13 +695,13 @@
   for (int i = 0; i < 7; ++i) {  // avoid problems with 8-byte values
     uid <<= 8;
 
-    // TODO(fgalligan): Move random number generation to platform specific code.
+// TODO(fgalligan): Move random number generation to platform specific code.
 #ifdef _MSC_VER
     (void)seed;
     unsigned int random_value;
     const errno_t e = rand_s(&random_value);
     (void)e;
-    const int32 nn  = random_value;
+    const int32 nn = random_value;
 #elif __ANDROID__
     int32 temp_num = 1;
     int fd = open("/dev/urandom", O_RDONLY);
diff --git a/third_party/libwebm/mkvmuxerutil.hpp b/third_party/libwebm/mkvmuxerutil.hpp
index d196ad3..a092abe 100644
--- a/third_party/libwebm/mkvmuxerutil.hpp
+++ b/third_party/libwebm/mkvmuxerutil.hpp
@@ -30,6 +30,7 @@
 uint64 EbmlElementSize(uint64 type, float value);
 uint64 EbmlElementSize(uint64 type, const char* value);
 uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size);
+uint64 EbmlDateElementSize(uint64 type, int64 value);
 
 // Creates an EBML coded number from |value| and writes it out. The size of
 // the coded number is determined by the value of |value|. |value| must not
@@ -52,10 +53,9 @@
 bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value);
 bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value);
 bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value);
-bool WriteEbmlElement(IMkvWriter* writer,
-                      uint64 type,
-                      const uint8* value,
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
                       uint64 size);
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value);
 
 // Output an Mkv Simple Block.
 // Inputs:
@@ -67,12 +67,8 @@
 //   timecode:     Relative timecode of the Block.  Only values in the
 //                  range [0, 2^15) are permitted.
 //   is_key:       Non-zero value specifies that frame is a key frame.
-uint64 WriteSimpleBlock(IMkvWriter* writer,
-                        const uint8* data,
-                        uint64 length,
-                        uint64 track_number,
-                        int64 timecode,
-                        uint64 is_key);
+uint64 WriteSimpleBlock(IMkvWriter* writer, const uint8* data, uint64 length,
+                        uint64 track_number, int64 timecode, uint64 is_key);
 
 // Output a metadata keyframe, using a Block Group element.
 // Inputs:
@@ -84,11 +80,8 @@
 //   timecode      Timecode of frame, relative to cluster timecode.  Only
 //                  values in the range [0, 2^15) are permitted.
 //   duration_timecode  Duration of frame, using timecode units.
-uint64 WriteMetadataBlock(IMkvWriter* writer,
-                          const uint8* data,
-                          uint64 length,
-                          uint64 track_number,
-                          int64 timecode,
+uint64 WriteMetadataBlock(IMkvWriter* writer, const uint8* data, uint64 length,
+                          uint64 track_number, int64 timecode,
                           uint64 duration_timecode);
 
 // Output an Mkv Block with BlockAdditional data.
@@ -104,14 +97,10 @@
 //   timecode:     Relative timecode of the Block.  Only values in the
 //                  range [0, 2^15) are permitted.
 //   is_key:       Non-zero value specifies that frame is a key frame.
-uint64 WriteBlockWithAdditional(IMkvWriter* writer,
-                                const uint8* data,
-                                uint64 length,
-                                const uint8* additional,
-                                uint64 additional_length,
-                                uint64 add_id,
-                                uint64 track_number,
-                                int64 timecode,
+uint64 WriteBlockWithAdditional(IMkvWriter* writer, const uint8* data,
+                                uint64 length, const uint8* additional,
+                                uint64 additional_length, uint64 add_id,
+                                uint64 track_number, int64 timecode,
                                 uint64 is_key);
 
 // Output an Mkv Block with a DiscardPadding element.
@@ -125,12 +114,9 @@
 //   timecode:        Relative timecode of the Block.  Only values in the
 //                    range [0, 2^15) are permitted.
 //   is_key:          Non-zero value specifies that frame is a key frame.
-uint64 WriteBlockWithDiscardPadding(IMkvWriter* writer,
-                                    const uint8* data,
-                                    uint64 length,
-                                    int64 discard_padding,
-                                    uint64 track_number,
-                                    int64 timecode,
+uint64 WriteBlockWithDiscardPadding(IMkvWriter* writer, const uint8* data,
+                                    uint64 length, int64 discard_padding,
+                                    uint64 track_number, int64 timecode,
                                     uint64 is_key);
 
 // Output a void element. |size| must be the entire size in bytes that will be
@@ -146,6 +132,6 @@
 // the random-number generator (see POSIX rand_r() for semantics).
 uint64 MakeUID(unsigned int* seed);
 
-}  //end namespace mkvmuxer
+}  // end namespace mkvmuxer
 
-#endif // MKVMUXERUTIL_HPP
+#endif  // MKVMUXERUTIL_HPP
diff --git a/third_party/libwebm/mkvparser.cpp b/third_party/libwebm/mkvparser.cpp
index b41456a..441f165 100644
--- a/third_party/libwebm/mkvparser.cpp
+++ b/third_party/libwebm/mkvparser.cpp
@@ -14,1383 +14,1204 @@
 
 #ifdef _MSC_VER
 // Disable MSVC warnings that suggest making code non-portable.
-#pragma warning(disable:4996)
+#pragma warning(disable : 4996)
 #endif
 
-mkvparser::IMkvReader::~IMkvReader()
-{
+mkvparser::IMkvReader::~IMkvReader() {}
+
+void mkvparser::GetVersion(int& major, int& minor, int& build, int& revision) {
+  major = 1;
+  minor = 0;
+  build = 0;
+  revision = 28;
 }
 
-void mkvparser::GetVersion(int& major, int& minor, int& build, int& revision)
-{
-    major = 1;
-    minor = 0;
-    build = 0;
-    revision = 27;
-}
+long long mkvparser::ReadUInt(IMkvReader* pReader, long long pos, long& len) {
+  assert(pReader);
+  assert(pos >= 0);
 
-long long mkvparser::ReadUInt(IMkvReader* pReader, long long pos, long& len)
-{
-    assert(pReader);
-    assert(pos >= 0);
+  int status;
 
-    int status;
+  //#ifdef _DEBUG
+  //    long long total, available;
+  //    status = pReader->Length(&total, &available);
+  //    assert(status >= 0);
+  //    assert((total < 0) || (available <= total));
+  //    assert(pos < available);
+  //    assert((available - pos) >= 1);  //assume here max u-int len is 8
+  //#endif
 
-//#ifdef _DEBUG
-//    long long total, available;
-//    status = pReader->Length(&total, &available);
-//    assert(status >= 0);
-//    assert((total < 0) || (available <= total));
-//    assert(pos < available);
-//    assert((available - pos) >= 1);  //assume here max u-int len is 8
-//#endif
+  len = 1;
 
-    len = 1;
+  unsigned char b;
 
-    unsigned char b;
+  status = pReader->Read(pos, 1, &b);
 
+  if (status < 0)  // error or underflow
+    return status;
+
+  if (status > 0)  // interpreted as "underflow"
+    return E_BUFFER_NOT_FULL;
+
+  if (b == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char m = 0x80;
+
+  while (!(b & m)) {
+    m >>= 1;
+    ++len;
+  }
+
+  //#ifdef _DEBUG
+  //    assert((available - pos) >= len);
+  //#endif
+
+  long long result = b & (~m);
+  ++pos;
+
+  for (int i = 1; i < len; ++i) {
     status = pReader->Read(pos, 1, &b);
 
-    if (status < 0)  //error or underflow
-        return status;
-
-    if (status > 0)  //interpreted as "underflow"
-        return E_BUFFER_NOT_FULL;
-
-    if (b == 0)  //we can't handle u-int values larger than 8 bytes
-        return E_FILE_FORMAT_INVALID;
-
-    unsigned char m = 0x80;
-
-    while (!(b & m))
-    {
-        m >>= 1;
-        ++len;
+    if (status < 0) {
+      len = 1;
+      return status;
     }
 
-//#ifdef _DEBUG
-//    assert((available - pos) >= len);
-//#endif
+    if (status > 0) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
 
-    long long result = b & (~m);
+    result <<= 8;
+    result |= b;
+
     ++pos;
+  }
 
-    for (int i = 1; i < len; ++i)
-    {
-        status = pReader->Read(pos, 1, &b);
-
-        if (status < 0)
-        {
-            len = 1;
-            return status;
-        }
-
-        if (status > 0)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        result <<= 8;
-        result |= b;
-
-        ++pos;
-    }
-
-    return result;
+  return result;
 }
 
-long long mkvparser::GetUIntLength(
-    IMkvReader* pReader,
-    long long pos,
-    long& len)
-{
-    assert(pReader);
-    assert(pos >= 0);
+long long mkvparser::GetUIntLength(IMkvReader* pReader, long long pos,
+                                   long& len) {
+  assert(pReader);
+  assert(pos >= 0);
 
-    long long total, available;
+  long long total, available;
 
-    int status = pReader->Length(&total, &available);
-    assert(status >= 0);
-    assert((total < 0) || (available <= total));
+  int status = pReader->Length(&total, &available);
+  assert(status >= 0);
+  assert((total < 0) || (available <= total));
 
-    len = 1;
+  len = 1;
 
-    if (pos >= available)
-        return pos;  //too few bytes available
+  if (pos >= available)
+    return pos;  // too few bytes available
 
+  unsigned char b;
+
+  status = pReader->Read(pos, 1, &b);
+
+  if (status < 0)
+    return status;
+
+  assert(status == 0);
+
+  if (b == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char m = 0x80;
+
+  while (!(b & m)) {
+    m >>= 1;
+    ++len;
+  }
+
+  return 0;  // success
+}
+
+long long mkvparser::UnserializeUInt(IMkvReader* pReader, long long pos,
+                                     long long size) {
+  assert(pReader);
+  assert(pos >= 0);
+
+  if ((size <= 0) || (size > 8))
+    return E_FILE_FORMAT_INVALID;
+
+  long long result = 0;
+
+  for (long long i = 0; i < size; ++i) {
     unsigned char b;
 
-    status = pReader->Read(pos, 1, &b);
+    const long status = pReader->Read(pos, 1, &b);
 
     if (status < 0)
-        return status;
+      return status;
 
-    assert(status == 0);
+    result <<= 8;
+    result |= b;
 
-    if (b == 0)  //we can't handle u-int values larger than 8 bytes
-        return E_FILE_FORMAT_INVALID;
+    ++pos;
+  }
 
-    unsigned char m = 0x80;
-
-    while (!(b & m))
-    {
-        m >>= 1;
-        ++len;
-    }
-
-    return 0;  //success
+  return result;
 }
 
+long mkvparser::UnserializeFloat(IMkvReader* pReader, long long pos,
+                                 long long size_, double& result) {
+  assert(pReader);
+  assert(pos >= 0);
 
-long long mkvparser::UnserializeUInt(
-    IMkvReader* pReader,
-    long long pos,
-    long long size)
-{
-    assert(pReader);
-    assert(pos >= 0);
+  if ((size_ != 4) && (size_ != 8))
+    return E_FILE_FORMAT_INVALID;
 
-    if ((size <= 0) || (size > 8))
-        return E_FILE_FORMAT_INVALID;
+  const long size = static_cast<long>(size_);
 
-    long long result = 0;
+  unsigned char buf[8];
 
-    for (long long i = 0; i < size; ++i)
-    {
-        unsigned char b;
+  const int status = pReader->Read(pos, size, buf);
 
-        const long status = pReader->Read(pos, 1, &b);
+  if (status < 0)  // error
+    return status;
 
-        if (status < 0)
-            return status;
+  if (size == 4) {
+    union {
+      float f;
+      unsigned long ff;
+    };
 
-        result <<= 8;
-        result |= b;
+    ff = 0;
 
-        ++pos;
+    for (int i = 0;;) {
+      ff |= buf[i];
+
+      if (++i >= 4)
+        break;
+
+      ff <<= 8;
     }
 
-    return result;
+    result = f;
+  } else {
+    assert(size == 8);
+
+    union {
+      double d;
+      unsigned long long dd;
+    };
+
+    dd = 0;
+
+    for (int i = 0;;) {
+      dd |= buf[i];
+
+      if (++i >= 8)
+        break;
+
+      dd <<= 8;
+    }
+
+    result = d;
+  }
+
+  return 0;
 }
 
+long mkvparser::UnserializeInt(IMkvReader* pReader, long long pos, long size,
+                               long long& result) {
+  assert(pReader);
+  assert(pos >= 0);
+  assert(size > 0);
+  assert(size <= 8);
 
-long mkvparser::UnserializeFloat(
-    IMkvReader* pReader,
-    long long pos,
-    long long size_,
-    double& result)
-{
-    assert(pReader);
-    assert(pos >= 0);
+  {
+    signed char b;
 
-    if ((size_ != 4) && (size_ != 8))
-        return E_FILE_FORMAT_INVALID;
+    const long status = pReader->Read(pos, 1, (unsigned char*)&b);
 
-    const long size = static_cast<long>(size_);
+    if (status < 0)
+      return status;
 
-    unsigned char buf[8];
+    result = b;
 
-    const int status = pReader->Read(pos, size, buf);
+    ++pos;
+  }
 
-    if (status < 0)  //error
-        return status;
+  for (long i = 1; i < size; ++i) {
+    unsigned char b;
 
-    if (size == 4)
-    {
-        union
-        {
-            float f;
-            unsigned long ff;
-        };
+    const long status = pReader->Read(pos, 1, &b);
 
-        ff = 0;
+    if (status < 0)
+      return status;
 
-        for (int i = 0;;)
-        {
-            ff |= buf[i];
+    result <<= 8;
+    result |= b;
 
-            if (++i >= 4)
-                break;
+    ++pos;
+  }
 
-            ff <<= 8;
-        }
-
-        result = f;
-    }
-    else
-    {
-        assert(size == 8);
-
-        union
-        {
-            double d;
-            unsigned long long dd;
-        };
-
-        dd = 0;
-
-        for (int i = 0;;)
-        {
-            dd |= buf[i];
-
-            if (++i >= 8)
-                break;
-
-            dd <<= 8;
-        }
-
-        result = d;
-    }
-
-    return 0;
+  return 0;  // success
 }
 
+long mkvparser::UnserializeString(IMkvReader* pReader, long long pos,
+                                  long long size_, char*& str) {
+  delete[] str;
+  str = NULL;
 
-long mkvparser::UnserializeInt(
-    IMkvReader* pReader,
-    long long pos,
-    long size,
-    long long& result)
-{
-    assert(pReader);
-    assert(pos >= 0);
-    assert(size > 0);
-    assert(size <= 8);
+  if (size_ >= LONG_MAX)  // we need (size+1) chars
+    return E_FILE_FORMAT_INVALID;
 
-    {
-        signed char b;
+  const long size = static_cast<long>(size_);
 
-        const long status = pReader->Read(pos, 1, (unsigned char*)&b);
+  str = new (std::nothrow) char[size + 1];
 
-        if (status < 0)
-            return status;
+  if (str == NULL)
+    return -1;
 
-        result = b;
+  unsigned char* const buf = reinterpret_cast<unsigned char*>(str);
 
-        ++pos;
-    }
+  const long status = pReader->Read(pos, size, buf);
 
-    for (long i = 1; i < size; ++i)
-    {
-        unsigned char b;
-
-        const long status = pReader->Read(pos, 1, &b);
-
-        if (status < 0)
-            return status;
-
-        result <<= 8;
-        result |= b;
-
-        ++pos;
-    }
-
-    return 0;  //success
-}
-
-
-long mkvparser::UnserializeString(
-    IMkvReader* pReader,
-    long long pos,
-    long long size_,
-    char*& str)
-{
+  if (status) {
     delete[] str;
     str = NULL;
 
-    if (size_ >= LONG_MAX)  //we need (size+1) chars
-        return E_FILE_FORMAT_INVALID;
+    return status;
+  }
 
-    const long size = static_cast<long>(size_);
+  str[size] = '\0';
 
-    str = new (std::nothrow) char[size+1];
-
-    if (str == NULL)
-        return -1;
-
-    unsigned char* const buf = reinterpret_cast<unsigned char*>(str);
-
-    const long status = pReader->Read(pos, size, buf);
-
-    if (status)
-    {
-        delete[] str;
-        str = NULL;
-
-        return status;
-    }
-
-    str[size] = '\0';
-
-    return 0;  //success
+  return 0;  // success
 }
 
+long mkvparser::ParseElementHeader(IMkvReader* pReader, long long& pos,
+                                   long long stop, long long& id,
+                                   long long& size) {
+  if ((stop >= 0) && (pos >= stop))
+    return E_FILE_FORMAT_INVALID;
 
-long mkvparser::ParseElementHeader(
-    IMkvReader* pReader,
-    long long& pos,
-    long long stop,
-    long long& id,
-    long long& size)
-{
-    if ((stop >= 0) && (pos >= stop))
-        return E_FILE_FORMAT_INVALID;
+  long len;
 
-    long len;
+  id = ReadUInt(pReader, pos, len);
 
-    id = ReadUInt(pReader, pos, len);
+  if (id < 0)
+    return E_FILE_FORMAT_INVALID;
 
-    if (id < 0)
-        return E_FILE_FORMAT_INVALID;
+  pos += len;  // consume id
 
-    pos += len;  //consume id
+  if ((stop >= 0) && (pos >= stop))
+    return E_FILE_FORMAT_INVALID;
 
-    if ((stop >= 0) && (pos >= stop))
-        return E_FILE_FORMAT_INVALID;
+  size = ReadUInt(pReader, pos, len);
 
-    size = ReadUInt(pReader, pos, len);
+  if (size < 0)
+    return E_FILE_FORMAT_INVALID;
 
-    if (size < 0)
-        return E_FILE_FORMAT_INVALID;
+  pos += len;  // consume length of size
 
-    pos += len;  //consume length of size
+  // pos now designates payload
 
-    //pos now designates payload
+  if ((stop >= 0) && ((pos + size) > stop))
+    return E_FILE_FORMAT_INVALID;
 
-    if ((stop >= 0) && ((pos + size) > stop))
-        return E_FILE_FORMAT_INVALID;
-
-    return 0;  //success
+  return 0;  // success
 }
 
+bool mkvparser::Match(IMkvReader* pReader, long long& pos, unsigned long id_,
+                      long long& val) {
+  assert(pReader);
+  assert(pos >= 0);
 
-bool mkvparser::Match(
-    IMkvReader* pReader,
-    long long& pos,
-    unsigned long id_,
-    long long& val)
-{
-    assert(pReader);
-    assert(pos >= 0);
+  long long total, available;
 
-    long long total, available;
+  const long status = pReader->Length(&total, &available);
+  assert(status >= 0);
+  assert((total < 0) || (available <= total));
+  if (status < 0)
+    return false;
 
-    const long status = pReader->Length(&total, &available);
-    assert(status >= 0);
-    assert((total < 0) || (available <= total));
-    if (status < 0)
-        return false;
+  long len;
 
-    long len;
+  const long long id = ReadUInt(pReader, pos, len);
+  assert(id >= 0);
+  assert(len > 0);
+  assert(len <= 8);
+  assert((pos + len) <= available);
 
-    const long long id = ReadUInt(pReader, pos, len);
-    assert(id >= 0);
-    assert(len > 0);
-    assert(len <= 8);
-    assert((pos + len) <= available);
+  if ((unsigned long)id != id_)
+    return false;
 
-    if ((unsigned long)id != id_)
-        return false;
+  pos += len;  // consume id
 
-    pos += len;  //consume id
+  const long long size = ReadUInt(pReader, pos, len);
+  assert(size >= 0);
+  assert(size <= 8);
+  assert(len > 0);
+  assert(len <= 8);
+  assert((pos + len) <= available);
 
-    const long long size = ReadUInt(pReader, pos, len);
-    assert(size >= 0);
-    assert(size <= 8);
-    assert(len > 0);
-    assert(len <= 8);
-    assert((pos + len) <= available);
+  pos += len;  // consume length of size of payload
 
-    pos += len;  //consume length of size of payload
+  val = UnserializeUInt(pReader, pos, size);
+  assert(val >= 0);
 
-    val = UnserializeUInt(pReader, pos, size);
-    assert(val >= 0);
+  pos += size;  // consume size of payload
 
-    pos += size;  //consume size of payload
-
-    return true;
+  return true;
 }
 
-bool mkvparser::Match(
-    IMkvReader* pReader,
-    long long& pos,
-    unsigned long id_,
-    unsigned char*& buf,
-    size_t& buflen)
-{
-    assert(pReader);
-    assert(pos >= 0);
+bool mkvparser::Match(IMkvReader* pReader, long long& pos, unsigned long id_,
+                      unsigned char*& buf, size_t& buflen) {
+  assert(pReader);
+  assert(pos >= 0);
 
-    long long total, available;
+  long long total, available;
 
-    long status = pReader->Length(&total, &available);
-    assert(status >= 0);
-    assert((total < 0) || (available <= total));
-    if (status < 0)
-        return false;
+  long status = pReader->Length(&total, &available);
+  assert(status >= 0);
+  assert((total < 0) || (available <= total));
+  if (status < 0)
+    return false;
 
-    long len;
-    const long long id = ReadUInt(pReader, pos, len);
-    assert(id >= 0);
-    assert(len > 0);
-    assert(len <= 8);
-    assert((pos + len) <= available);
+  long len;
+  const long long id = ReadUInt(pReader, pos, len);
+  assert(id >= 0);
+  assert(len > 0);
+  assert(len <= 8);
+  assert((pos + len) <= available);
 
-    if ((unsigned long)id != id_)
-        return false;
+  if ((unsigned long)id != id_)
+    return false;
 
-    pos += len;  //consume id
+  pos += len;  // consume id
 
-    const long long size_ = ReadUInt(pReader, pos, len);
-    assert(size_ >= 0);
-    assert(len > 0);
-    assert(len <= 8);
-    assert((pos + len) <= available);
+  const long long size_ = ReadUInt(pReader, pos, len);
+  assert(size_ >= 0);
+  assert(len > 0);
+  assert(len <= 8);
+  assert((pos + len) <= available);
 
-    pos += len;  //consume length of size of payload
-    assert((pos + size_) <= available);
+  pos += len;  // consume length of size of payload
+  assert((pos + size_) <= available);
 
-    const long buflen_ = static_cast<long>(size_);
+  const long buflen_ = static_cast<long>(size_);
 
-    buf = new (std::nothrow) unsigned char[buflen_];
-    assert(buf);  //TODO
+  buf = new (std::nothrow) unsigned char[buflen_];
+  assert(buf);  // TODO
 
-    status = pReader->Read(pos, buflen_, buf);
-    assert(status == 0);  //TODO
+  status = pReader->Read(pos, buflen_, buf);
+  assert(status == 0);  // TODO
 
-    buflen = buflen_;
+  buflen = buflen_;
 
-    pos += size_;  //consume size of payload
-    return true;
+  pos += size_;  // consume size of payload
+  return true;
 }
 
+namespace mkvparser {
 
-namespace mkvparser
-{
+EBMLHeader::EBMLHeader() : m_docType(NULL) { Init(); }
 
-EBMLHeader::EBMLHeader() :
-    m_docType(NULL)
-{
-    Init();
-}
+EBMLHeader::~EBMLHeader() { delete[] m_docType; }
 
-EBMLHeader::~EBMLHeader()
-{
+void EBMLHeader::Init() {
+  m_version = 1;
+  m_readVersion = 1;
+  m_maxIdLength = 4;
+  m_maxSizeLength = 8;
+
+  if (m_docType) {
     delete[] m_docType;
+    m_docType = NULL;
+  }
+
+  m_docTypeVersion = 1;
+  m_docTypeReadVersion = 1;
 }
 
-void EBMLHeader::Init()
-{
-    m_version = 1;
-    m_readVersion = 1;
-    m_maxIdLength = 4;
-    m_maxSizeLength = 8;
+long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
+  assert(pReader);
 
-    if (m_docType)
-    {
-        delete[] m_docType;
-        m_docType = NULL;
-    }
+  long long total, available;
 
-    m_docTypeVersion = 1;
-    m_docTypeReadVersion = 1;
-}
+  long status = pReader->Length(&total, &available);
 
-long long EBMLHeader::Parse(
-    IMkvReader* pReader,
-    long long& pos)
-{
-    assert(pReader);
+  if (status < 0)  // error
+    return status;
 
-    long long total, available;
+  pos = 0;
+  long long end = (available >= 1024) ? 1024 : available;
 
-    long status = pReader->Length(&total, &available);
+  for (;;) {
+    unsigned char b = 0;
 
-    if (status < 0)  //error
+    while (pos < end) {
+      status = pReader->Read(pos, 1, &b);
+
+      if (status < 0)  // error
         return status;
 
-    pos = 0;
-    long long end = (available >= 1024) ? 1024 : available;
+      if (b == 0x1A)
+        break;
 
-    for (;;)
-    {
-        unsigned char b = 0;
-
-        while (pos < end)
-        {
-            status = pReader->Read(pos, 1, &b);
-
-            if (status < 0)  //error
-                return status;
-
-            if (b == 0x1A)
-                break;
-
-            ++pos;
-        }
-
-        if (b != 0x1A)
-        {
-            if (pos >= 1024)
-                return E_FILE_FORMAT_INVALID;  //don't bother looking anymore
-
-            if ((total >= 0) && ((total - available) < 5))
-                return E_FILE_FORMAT_INVALID;
-
-            return available + 5;  //5 = 4-byte ID + 1st byte of size
-        }
-
-        if ((total >= 0) && ((total - pos) < 5))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((available - pos) < 5)
-            return pos + 5;  //try again later
-
-        long len;
-
-        const long long result = ReadUInt(pReader, pos, len);
-
-        if (result < 0)  //error
-            return result;
-
-        if (result == 0x0A45DFA3)  //EBML Header ID
-        {
-            pos += len;  //consume ID
-            break;
-        }
-
-        ++pos;  //throw away just the 0x1A byte, and try again
+      ++pos;
     }
 
-    //pos designates start of size field
+    if (b != 0x1A) {
+      if (pos >= 1024)
+        return E_FILE_FORMAT_INVALID;  // don't bother looking anymore
 
-    //get length of size field
+      if ((total >= 0) && ((total - available) < 5))
+        return E_FILE_FORMAT_INVALID;
 
+      return available + 5;  // 5 = 4-byte ID + 1st byte of size
+    }
+
+    if ((total >= 0) && ((total - pos) < 5))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((available - pos) < 5)
+      return pos + 5;  // try again later
+
+    long len;
+
+    const long long result = ReadUInt(pReader, pos, len);
+
+    if (result < 0)  // error
+      return result;
+
+    if (result == 0x0A45DFA3) {  // EBML Header ID
+      pos += len;  // consume ID
+      break;
+    }
+
+    ++pos;  // throw away just the 0x1A byte, and try again
+  }
+
+  // pos designates start of size field
+
+  // get length of size field
+
+  long len;
+  long long result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error
+    return result;
+
+  if (result > 0)  // need more data
+    return result;
+
+  assert(len > 0);
+  assert(len <= 8);
+
+  if ((total >= 0) && ((total - pos) < len))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((available - pos) < len)
+    return pos + len;  // try again later
+
+  // get the EBML header size
+
+  result = ReadUInt(pReader, pos, len);
+
+  if (result < 0)  // error
+    return result;
+
+  pos += len;  // consume size field
+
+  // pos now designates start of payload
+
+  if ((total >= 0) && ((total - pos) < result))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((available - pos) < result)
+    return pos + result;
+
+  end = pos + result;
+
+  Init();
+
+  while (pos < end) {
+    long long id, size;
+
+    status = ParseElementHeader(pReader, pos, end, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size == 0)  // weird
+      return E_FILE_FORMAT_INVALID;
+
+    if (id == 0x0286) {  // version
+      m_version = UnserializeUInt(pReader, pos, size);
+
+      if (m_version <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x02F7) {  // read version
+      m_readVersion = UnserializeUInt(pReader, pos, size);
+
+      if (m_readVersion <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x02F2) {  // max id length
+      m_maxIdLength = UnserializeUInt(pReader, pos, size);
+
+      if (m_maxIdLength <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x02F3) {  // max size length
+      m_maxSizeLength = UnserializeUInt(pReader, pos, size);
+
+      if (m_maxSizeLength <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x0282) {  // doctype
+      if (m_docType)
+        return E_FILE_FORMAT_INVALID;
+
+      status = UnserializeString(pReader, pos, size, m_docType);
+
+      if (status)  // error
+        return status;
+    } else if (id == 0x0287) {  // doctype version
+      m_docTypeVersion = UnserializeUInt(pReader, pos, size);
+
+      if (m_docTypeVersion <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x0285) {  // doctype read version
+      m_docTypeReadVersion = UnserializeUInt(pReader, pos, size);
+
+      if (m_docTypeReadVersion <= 0)
+        return E_FILE_FORMAT_INVALID;
+    }
+
+    pos += size;
+  }
+
+  assert(pos == end);
+  return 0;
+}
+
+Segment::Segment(IMkvReader* pReader, long long elem_start,
+                 // long long elem_size,
+                 long long start, long long size)
+    : m_pReader(pReader),
+      m_element_start(elem_start),
+      // m_element_size(elem_size),
+      m_start(start),
+      m_size(size),
+      m_pos(start),
+      m_pUnknownSize(0),
+      m_pSeekHead(NULL),
+      m_pInfo(NULL),
+      m_pTracks(NULL),
+      m_pCues(NULL),
+      m_pChapters(NULL),
+      m_clusters(NULL),
+      m_clusterCount(0),
+      m_clusterPreloadCount(0),
+      m_clusterSize(0) {}
+
+Segment::~Segment() {
+  const long count = m_clusterCount + m_clusterPreloadCount;
+
+  Cluster** i = m_clusters;
+  Cluster** j = m_clusters + count;
+
+  while (i != j) {
+    Cluster* const p = *i++;
+    assert(p);
+
+    delete p;
+  }
+
+  delete[] m_clusters;
+
+  delete m_pTracks;
+  delete m_pInfo;
+  delete m_pCues;
+  delete m_pChapters;
+  delete m_pSeekHead;
+}
+
+long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
+                                  Segment*& pSegment) {
+  assert(pReader);
+  assert(pos >= 0);
+
+  pSegment = NULL;
+
+  long long total, available;
+
+  const long status = pReader->Length(&total, &available);
+
+  if (status < 0)  // error
+    return status;
+
+  if (available < 0)
+    return -1;
+
+  if ((total >= 0) && (available > total))
+    return -1;
+
+  // I would assume that in practice this loop would execute
+  // exactly once, but we allow for other elements (e.g. Void)
+  // to immediately follow the EBML header.  This is fine for
+  // the source filter case (since the entire file is available),
+  // but in the splitter case over a network we should probably
+  // just give up early.  We could for example decide only to
+  // execute this loop a maximum of, say, 10 times.
+  // TODO:
+  // There is an implied "give up early" by only parsing up
+  // to the available limit.  We do do that, but only if the
+  // total file size is unknown.  We could decide to always
+  // use what's available as our limit (irrespective of whether
+  // we happen to know the total file length).  This would have
+  // as its sense "parse this much of the file before giving up",
+  // which a slightly different sense from "try to parse up to
+  // 10 EMBL elements before giving up".
+
+  for (;;) {
+    if ((total >= 0) && (pos >= total))
+      return E_FILE_FORMAT_INVALID;
+
+    // Read ID
     long len;
     long long result = GetUIntLength(pReader, pos, len);
 
-    if (result < 0)  //error
-        return result;
+    if (result)  // error, or too few available bytes
+      return result;
 
-    if (result > 0)  //need more data
-        return result;
+    if ((total >= 0) && ((pos + len) > total))
+      return E_FILE_FORMAT_INVALID;
 
-    assert(len > 0);
-    assert(len <= 8);
+    if ((pos + len) > available)
+      return pos + len;
 
-    if ((total >= 0) && ((total -  pos) < len))
-        return E_FILE_FORMAT_INVALID;
+    const long long idpos = pos;
+    const long long id = ReadUInt(pReader, pos, len);
 
-    if ((available - pos) < len)
-        return pos + len;  //try again later
+    if (id < 0)  // error
+      return id;
 
-    //get the EBML header size
+    pos += len;  // consume ID
 
-    result = ReadUInt(pReader, pos, len);
+    // Read Size
 
-    if (result < 0)  //error
-        return result;
+    result = GetUIntLength(pReader, pos, len);
 
-    pos += len;  //consume size field
+    if (result)  // error, or too few available bytes
+      return result;
 
-    //pos now designates start of payload
+    if ((total >= 0) && ((pos + len) > total))
+      return E_FILE_FORMAT_INVALID;
 
-    if ((total >= 0) && ((total - pos) < result))
-        return E_FILE_FORMAT_INVALID;
+    if ((pos + len) > available)
+      return pos + len;
 
-    if ((available - pos) < result)
-        return pos + result;
+    long long size = ReadUInt(pReader, pos, len);
 
-    end = pos + result;
+    if (size < 0)  // error
+      return size;
 
-    Init();
+    pos += len;  // consume length of size of element
 
-    while (pos < end)
-    {
-        long long id, size;
+    // Pos now points to start of payload
 
-        status = ParseElementHeader(
-                    pReader,
-                    pos,
-                    end,
-                    id,
-                    size);
+    // Handle "unknown size" for live streaming of webm files.
+    const long long unknown_size = (1LL << (7 * len)) - 1;
 
-        if (status < 0) //error
-            return status;
+    if (id == 0x08538067) {  // Segment ID
+      if (size == unknown_size)
+        size = -1;
 
-        if (size == 0)  //weird
-            return E_FILE_FORMAT_INVALID;
+      else if (total < 0)
+        size = -1;
 
-        if (id == 0x0286)  //version
-        {
-            m_version = UnserializeUInt(pReader, pos, size);
+      else if ((pos + size) > total)
+        size = -1;
 
-            if (m_version <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x02F7)  //read version
-        {
-            m_readVersion = UnserializeUInt(pReader, pos, size);
+      pSegment = new (std::nothrow) Segment(pReader, idpos,
+                                            // elem_size
+                                            pos, size);
 
-            if (m_readVersion <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x02F2)  //max id length
-        {
-            m_maxIdLength = UnserializeUInt(pReader, pos, size);
+      if (pSegment == 0)
+        return -1;  // generic error
 
-            if (m_maxIdLength <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x02F3)  //max size length
-        {
-            m_maxSizeLength = UnserializeUInt(pReader, pos, size);
-
-            if (m_maxSizeLength <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x0282)  //doctype
-        {
-            if (m_docType)
-                return E_FILE_FORMAT_INVALID;
-
-            status = UnserializeString(pReader, pos, size, m_docType);
-
-            if (status)  //error
-                return status;
-        }
-        else if (id == 0x0287)  //doctype version
-        {
-            m_docTypeVersion = UnserializeUInt(pReader, pos, size);
-
-            if (m_docTypeVersion <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x0285)  //doctype read version
-        {
-            m_docTypeReadVersion = UnserializeUInt(pReader, pos, size);
-
-            if (m_docTypeReadVersion <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-
-        pos += size;
+      return 0;  // success
     }
 
-    assert(pos == end);
-    return 0;
+    if (size == unknown_size)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && ((pos + size) > total))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + size) > available)
+      return pos + size;
+
+    pos += size;  // consume payload
+  }
 }
 
+long long Segment::ParseHeaders() {
+  // Outermost (level 0) segment object has been constructed,
+  // and pos designates start of payload.  We need to find the
+  // inner (level 1) elements.
+  long long total, available;
 
-Segment::Segment(
-    IMkvReader* pReader,
-    long long elem_start,
-    //long long elem_size,
-    long long start,
-    long long size) :
-    m_pReader(pReader),
-    m_element_start(elem_start),
-    //m_element_size(elem_size),
-    m_start(start),
-    m_size(size),
-    m_pos(start),
-    m_pUnknownSize(0),
-    m_pSeekHead(NULL),
-    m_pInfo(NULL),
-    m_pTracks(NULL),
-    m_pCues(NULL),
-    m_pChapters(NULL),
-    m_clusters(NULL),
-    m_clusterCount(0),
-    m_clusterPreloadCount(0),
-    m_clusterSize(0)
-{
-}
+  const int status = m_pReader->Length(&total, &available);
 
+  if (status < 0)  // error
+    return status;
 
-Segment::~Segment()
-{
-    const long count = m_clusterCount + m_clusterPreloadCount;
+  assert((total < 0) || (available <= total));
 
-    Cluster** i = m_clusters;
-    Cluster** j = m_clusters + count;
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+  assert((segment_stop < 0) || (total < 0) || (segment_stop <= total));
+  assert((segment_stop < 0) || (m_pos <= segment_stop));
 
-    while (i != j)
-    {
-        Cluster* const p = *i++;
-        assert(p);
+  for (;;) {
+    if ((total >= 0) && (m_pos >= total))
+      break;
 
-        delete p;
-    }
+    if ((segment_stop >= 0) && (m_pos >= segment_stop))
+      break;
 
-    delete[] m_clusters;
+    long long pos = m_pos;
+    const long long element_start = pos;
 
-    delete m_pTracks;
-    delete m_pInfo;
-    delete m_pCues;
-    delete m_pChapters;
-    delete m_pSeekHead;
-}
+    if ((pos + 1) > available)
+      return (pos + 1);
 
+    long len;
+    long long result = GetUIntLength(m_pReader, pos, len);
 
-long long Segment::CreateInstance(
-    IMkvReader* pReader,
-    long long pos,
-    Segment*& pSegment)
-{
-    assert(pReader);
-    assert(pos >= 0);
+    if (result < 0)  // error
+      return result;
 
-    pSegment = NULL;
+    if (result > 0)  // underflow (weird)
+      return (pos + 1);
 
-    long long total, available;
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
 
-    const long status = pReader->Length(&total, &available);
+    if ((pos + len) > available)
+      return pos + len;
 
-    if (status < 0) //error
-        return status;
+    const long long idpos = pos;
+    const long long id = ReadUInt(m_pReader, idpos, len);
 
-    if (available < 0)
+    if (id < 0)  // error
+      return id;
+
+    if (id == 0x0F43B675)  // Cluster ID
+      break;
+
+    pos += len;  // consume ID
+
+    if ((pos + 1) > available)
+      return (pos + 1);
+
+    // Read Size
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return result;
+
+    if (result > 0)  // underflow (weird)
+      return (pos + 1);
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return size;
+
+    pos += len;  // consume length of size of element
+
+    const long long element_size = size + pos - element_start;
+
+    // Pos now points to start of payload
+
+    if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // We read EBML elements either in total or nothing at all.
+
+    if ((pos + size) > available)
+      return pos + size;
+
+    if (id == 0x0549A966) {  // Segment Info ID
+      if (m_pInfo)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pInfo = new (std::nothrow)
+          SegmentInfo(this, pos, size, element_start, element_size);
+
+      if (m_pInfo == NULL)
         return -1;
 
-    if ((total >= 0) && (available > total))
+      const long status = m_pInfo->Parse();
+
+      if (status)
+        return status;
+    } else if (id == 0x0654AE6B) {  // Tracks ID
+      if (m_pTracks)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pTracks = new (std::nothrow)
+          Tracks(this, pos, size, element_start, element_size);
+
+      if (m_pTracks == NULL)
         return -1;
 
-    //I would assume that in practice this loop would execute
-    //exactly once, but we allow for other elements (e.g. Void)
-    //to immediately follow the EBML header.  This is fine for
-    //the source filter case (since the entire file is available),
-    //but in the splitter case over a network we should probably
-    //just give up early.  We could for example decide only to
-    //execute this loop a maximum of, say, 10 times.
-    //TODO:
-    //There is an implied "give up early" by only parsing up
-    //to the available limit.  We do do that, but only if the
-    //total file size is unknown.  We could decide to always
-    //use what's available as our limit (irrespective of whether
-    //we happen to know the total file length).  This would have
-    //as its sense "parse this much of the file before giving up",
-    //which a slightly different sense from "try to parse up to
-    //10 EMBL elements before giving up".
+      const long status = m_pTracks->Parse();
 
-    for (;;)
-    {
-        if ((total >= 0) && (pos >= total))
-            return E_FILE_FORMAT_INVALID;
-
-        //Read ID
-        long len;
-        long long result = GetUIntLength(pReader, pos, len);
-
-        if (result)  //error, or too few available bytes
-            return result;
-
-        if ((total >= 0) && ((pos + len) > total))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > available)
-            return pos + len;
-
-        const long long idpos = pos;
-        const long long id = ReadUInt(pReader, pos, len);
-
-        if (id < 0)  //error
-            return id;
-
-        pos += len;  //consume ID
-
-        //Read Size
-
-        result = GetUIntLength(pReader, pos, len);
-
-        if (result)  //error, or too few available bytes
-            return result;
-
-        if ((total >= 0) && ((pos + len) > total))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > available)
-            return pos + len;
-
-        long long size = ReadUInt(pReader, pos, len);
-
-        if (size < 0)  //error
-            return size;
-
-        pos += len;  //consume length of size of element
-
-        //Pos now points to start of payload
-
-        //Handle "unknown size" for live streaming of webm files.
-        const long long unknown_size = (1LL << (7 * len)) - 1;
-
-        if (id == 0x08538067)  //Segment ID
-        {
-            if (size == unknown_size)
-                size = -1;
-
-            else if (total < 0)
-                size = -1;
-
-            else if ((pos + size) > total)
-                size = -1;
-
-            pSegment = new (std::nothrow) Segment(
-                                            pReader,
-                                            idpos,
-                                            //elem_size
-                                            pos,
-                                            size);
-
-            if (pSegment == 0)
-                return -1;  //generic error
-
-            return 0;    //success
-        }
-
-        if (size == unknown_size)
-            return E_FILE_FORMAT_INVALID;
-
-        if ((total >= 0) && ((pos + size) > total))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + size) > available)
-            return pos + size;
-
-        pos += size;  //consume payload
-    }
-}
-
-
-long long Segment::ParseHeaders()
-{
-    //Outermost (level 0) segment object has been constructed,
-    //and pos designates start of payload.  We need to find the
-    //inner (level 1) elements.
-    long long total, available;
-
-    const int status = m_pReader->Length(&total, &available);
-
-    if (status < 0) //error
+      if (status)
         return status;
+    } else if (id == 0x0C53BB6B) {  // Cues ID
+      if (m_pCues == NULL) {
+        m_pCues = new (std::nothrow)
+            Cues(this, pos, size, element_start, element_size);
 
-    assert((total < 0) || (available <= total));
+        if (m_pCues == NULL)
+          return -1;
+      }
+    } else if (id == 0x014D9B74) {  // SeekHead ID
+      if (m_pSeekHead == NULL) {
+        m_pSeekHead = new (std::nothrow)
+            SeekHead(this, pos, size, element_start, element_size);
 
-    const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
-    assert((segment_stop < 0) || (total < 0) || (segment_stop <= total));
-    assert((segment_stop < 0) || (m_pos <= segment_stop));
+        if (m_pSeekHead == NULL)
+          return -1;
 
-    for (;;)
-    {
-        if ((total >= 0) && (m_pos >= total))
-            break;
+        const long status = m_pSeekHead->Parse();
 
-        if ((segment_stop >= 0) && (m_pos >= segment_stop))
-            break;
+        if (status)
+          return status;
+      }
+    } else if (id == 0x0043A770) {  // Chapters ID
+      if (m_pChapters == NULL) {
+        m_pChapters = new (std::nothrow)
+            Chapters(this, pos, size, element_start, element_size);
 
-        long long pos = m_pos;
-        const long long element_start = pos;
+        if (m_pChapters == NULL)
+          return -1;
 
-        if ((pos + 1) > available)
-            return (pos + 1);
+        const long status = m_pChapters->Parse();
 
-        long len;
-        long long result = GetUIntLength(m_pReader, pos, len);
-
-        if (result < 0)  //error
-            return result;
-
-        if (result > 0)  //underflow (weird)
-            return (pos + 1);
-
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > available)
-            return pos + len;
-
-        const long long idpos = pos;
-        const long long id = ReadUInt(m_pReader, idpos, len);
-
-        if (id < 0)  //error
-            return id;
-
-        if (id == 0x0F43B675)  //Cluster ID
-            break;
-
-        pos += len;  //consume ID
-
-        if ((pos + 1) > available)
-            return (pos + 1);
-
-        //Read Size
-        result = GetUIntLength(m_pReader, pos, len);
-
-        if (result < 0)  //error
-            return result;
-
-        if (result > 0)  //underflow (weird)
-            return (pos + 1);
-
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > available)
-            return pos + len;
-
-        const long long size = ReadUInt(m_pReader, pos, len);
-
-        if (size < 0)  //error
-            return size;
-
-        pos += len;  //consume length of size of element
-
-        const long long element_size = size + pos - element_start;
-
-        //Pos now points to start of payload
-
-        if ((segment_stop >= 0) && ((pos + size) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        //We read EBML elements either in total or nothing at all.
-
-        if ((pos + size) > available)
-            return pos + size;
-
-        if (id == 0x0549A966)  //Segment Info ID
-        {
-            if (m_pInfo)
-                return E_FILE_FORMAT_INVALID;
-
-            m_pInfo = new (std::nothrow) SegmentInfo(
-                                          this,
-                                          pos,
-                                          size,
-                                          element_start,
-                                          element_size);
-
-            if (m_pInfo == NULL)
-                return -1;
-
-            const long status = m_pInfo->Parse();
-
-            if (status)
-                return status;
-        }
-        else if (id == 0x0654AE6B)  //Tracks ID
-        {
-            if (m_pTracks)
-                return E_FILE_FORMAT_INVALID;
-
-            m_pTracks = new (std::nothrow) Tracks(this,
-                                                  pos,
-                                                  size,
-                                                  element_start,
-                                                  element_size);
-
-            if (m_pTracks == NULL)
-                return -1;
-
-            const long status = m_pTracks->Parse();
-
-            if (status)
-                return status;
-        }
-        else if (id == 0x0C53BB6B)  //Cues ID
-        {
-            if (m_pCues == NULL)
-            {
-                m_pCues = new (std::nothrow) Cues(
-                                                this,
-                                                pos,
-                                                size,
-                                                element_start,
-                                                element_size);
-
-                if (m_pCues == NULL)
-                    return -1;
-            }
-        }
-        else if (id == 0x014D9B74)  //SeekHead ID
-        {
-            if (m_pSeekHead == NULL)
-            {
-                m_pSeekHead = new (std::nothrow) SeekHead(
-                                                    this,
-                                                    pos,
-                                                    size,
-                                                    element_start,
-                                                    element_size);
-
-                if (m_pSeekHead == NULL)
-                    return -1;
-
-                const long status = m_pSeekHead->Parse();
-
-                if (status)
-                    return status;
-            }
-        }
-        else if (id == 0x0043A770)  //Chapters ID
-        {
-            if (m_pChapters == NULL)
-            {
-                m_pChapters = new (std::nothrow) Chapters(
-                                this,
-                                pos,
-                                size,
-                                element_start,
-                                element_size);
-
-                if (m_pChapters == NULL)
-                  return -1;
-
-                const long status = m_pChapters->Parse();
-
-                if (status)
-                  return status;
-            }
-        }
-
-        m_pos = pos + size;  //consume payload
+        if (status)
+          return status;
+      }
     }
 
-    assert((segment_stop < 0) || (m_pos <= segment_stop));
+    m_pos = pos + size;  // consume payload
+  }
 
-    if (m_pInfo == NULL)  //TODO: liberalize this behavior
-        return E_FILE_FORMAT_INVALID;
+  assert((segment_stop < 0) || (m_pos <= segment_stop));
 
-    if (m_pTracks == NULL)
-        return E_FILE_FORMAT_INVALID;
+  if (m_pInfo == NULL)  // TODO: liberalize this behavior
+    return E_FILE_FORMAT_INVALID;
 
-    return 0;  //success
+  if (m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
 }
 
+long Segment::LoadCluster(long long& pos, long& len) {
+  for (;;) {
+    const long result = DoLoadCluster(pos, len);
 
-long Segment::LoadCluster(
-    long long& pos,
-    long& len)
-{
-    for (;;)
-    {
-        const long result = DoLoadCluster(pos, len);
+    if (result <= 1)
+      return result;
+  }
+}
 
-        if (result <= 1)
-            return result;
+long Segment::DoLoadCluster(long long& pos, long& len) {
+  if (m_pos < 0)
+    return DoLoadClusterUnknownSize(pos, len);
+
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  long long cluster_off = -1;  // offset relative to start of segment
+  long long cluster_size = -1;  // size of cluster payload
+
+  for (;;) {
+    if ((total >= 0) && (m_pos >= total))
+      return 1;  // no more clusters
+
+    if ((segment_stop >= 0) && (m_pos >= segment_stop))
+      return 1;  // no more clusters
+
+    pos = m_pos;
+
+    // Read ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
     }
-}
 
+    long long result = GetUIntLength(m_pReader, pos, len);
 
-long Segment::DoLoadCluster(
-    long long& pos,
-    long& len)
-{
-    if (m_pos < 0)
-        return DoLoadClusterUnknownSize(pos, len);
+    if (result < 0)  // error
+      return static_cast<long>(result);
 
-    long long total, avail;
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
 
-    long status = m_pReader->Length(&total, &avail);
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
 
-    if (status < 0)  //error
-        return status;
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
 
-    assert((total < 0) || (avail <= total));
+    const long long idpos = pos;
+    const long long id = ReadUInt(m_pReader, idpos, len);
 
-    const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+    if (id < 0)  // error (or underflow)
+      return static_cast<long>(id);
 
-    long long cluster_off = -1;   //offset relative to start of segment
-    long long cluster_size = -1;  //size of cluster payload
+    pos += len;  // consume ID
 
-    for (;;)
-    {
-        if ((total >= 0) && (m_pos >= total))
-            return 1;  //no more clusters
+    // Read Size
 
-        if ((segment_stop >= 0) && (m_pos >= segment_stop))
-            return 1;  //no more clusters
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
 
-        pos = m_pos;
+    result = GetUIntLength(m_pReader, pos, len);
 
-        //Read ID
+    if (result < 0)  // error
+      return static_cast<long>(result);
 
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
 
-        long long result = GetUIntLength(m_pReader, pos, len);
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
 
-        if (result < 0)  //error
-            return static_cast<long>(result);
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
 
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
+    const long long size = ReadUInt(m_pReader, pos, len);
 
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
+    if (size < 0)  // error
+      return static_cast<long>(size);
 
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
+    pos += len;  // consume length of size of element
 
-        const long long idpos = pos;
-        const long long id = ReadUInt(m_pReader, idpos, len);
+    // pos now points to start of payload
 
-        if (id < 0)  //error (or underflow)
-            return static_cast<long>(id);
+    if (size == 0) {  // weird
+      m_pos = pos;
+      continue;
+    }
 
-        pos += len;  //consume ID
+    const long long unknown_size = (1LL << (7 * len)) - 1;
 
-        //Read Size
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        result = GetUIntLength(m_pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
-
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long size = ReadUInt(m_pReader, pos, len);
-
-        if (size < 0)  //error
-            return static_cast<long>(size);
-
-        pos += len;  //consume length of size of element
-
-        //pos now points to start of payload
-
-        if (size == 0)  //weird
-        {
-            m_pos = pos;
-            continue;
-        }
-
-        const long long unknown_size = (1LL << (7 * len)) - 1;
-
-#if 0  //we must handle this to support live webm
+#if 0  // we must handle this to support live webm
         if (size == unknown_size)
             return E_FILE_FORMAT_INVALID;  //TODO: allow this
 #endif
 
-        if ((segment_stop >= 0) &&
-            (size != unknown_size) &&
-            ((pos + size) > segment_stop))
-        {
-            return E_FILE_FORMAT_INVALID;
-        }
+    if ((segment_stop >= 0) && (size != unknown_size) &&
+        ((pos + size) > segment_stop)) {
+      return E_FILE_FORMAT_INVALID;
+    }
 
-#if 0  //commented-out, to support incremental cluster parsing
+#if 0  // commented-out, to support incremental cluster parsing
         len = static_cast<long>(size);
 
         if ((pos + size) > avail)
             return E_BUFFER_NOT_FULL;
 #endif
 
-        if (id == 0x0C53BB6B)  //Cues ID
-        {
-            if (size == unknown_size)
-                return E_FILE_FORMAT_INVALID;  //TODO: liberalize
+    if (id == 0x0C53BB6B) {  // Cues ID
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;  // TODO: liberalize
 
-            if (m_pCues == NULL)
-            {
-                const long long element_size = (pos - idpos) + size;
+      if (m_pCues == NULL) {
+        const long long element_size = (pos - idpos) + size;
 
-                m_pCues = new Cues(this,
-                                   pos,
-                                   size,
-                                   idpos,
-                                   element_size);
-                assert(m_pCues);  //TODO
-            }
+        m_pCues = new Cues(this, pos, size, idpos, element_size);
+        assert(m_pCues);  // TODO
+      }
 
-            m_pos = pos + size;  //consume payload
-            continue;
-        }
-
-        if (id != 0x0F43B675)  //Cluster ID
-        {
-            if (size == unknown_size)
-                return E_FILE_FORMAT_INVALID;  //TODO: liberalize
-
-            m_pos = pos + size;  //consume payload
-            continue;
-        }
-
-        //We have a cluster.
-
-        cluster_off = idpos - m_start;  //relative pos
-
-        if (size != unknown_size)
-            cluster_size = size;
-
-        break;
+      m_pos = pos + size;  // consume payload
+      continue;
     }
 
-    assert(cluster_off >= 0);  //have cluster
+    if (id != 0x0F43B675) {  // Cluster ID
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;  // TODO: liberalize
 
-    long long pos_;
-    long len_;
-
-    status = Cluster::HasBlockEntries(this, cluster_off, pos_, len_);
-
-    if (status < 0) //error, or underflow
-    {
-        pos = pos_;
-        len = len_;
-
-        return status;
+      m_pos = pos + size;  // consume payload
+      continue;
     }
 
-    //status == 0 means "no block entries found"
-    //status > 0 means "found at least one block entry"
+    // We have a cluster.
 
-    //TODO:
-    //The issue here is that the segment increments its own
-    //pos ptr past the most recent cluster parsed, and then
-    //starts from there to parse the next cluster.  If we
-    //don't know the size of the current cluster, then we
-    //must either parse its payload (as we do below), looking
-    //for the cluster (or cues) ID to terminate the parse.
-    //This isn't really what we want: rather, we really need
-    //a way to create the curr cluster object immediately.
-    //The pity is that cluster::parse can determine its own
-    //boundary, and we largely duplicate that same logic here.
-    //
-    //Maybe we need to get rid of our look-ahead preloading
-    //in source::parse???
-    //
-    //As we're parsing the blocks in the curr cluster
-    //(in cluster::parse), we should have some way to signal
-    //to the segment that we have determined the boundary,
-    //so it can adjust its own segment::m_pos member.
-    //
-    //The problem is that we're asserting in asyncreadinit,
-    //because we adjust the pos down to the curr seek pos,
-    //and the resulting adjusted len is > 2GB.  I'm suspicious
-    //that this is even correct, but even if it is, we can't
-    //be loading that much data in the cache anyway.
+    cluster_off = idpos - m_start;  // relative pos
 
-    const long idx = m_clusterCount;
+    if (size != unknown_size)
+      cluster_size = size;
 
-    if (m_clusterPreloadCount > 0)
-    {
-        assert(idx < m_clusterSize);
+    break;
+  }
 
-        Cluster* const pCluster = m_clusters[idx];
-        assert(pCluster);
-        assert(pCluster->m_index < 0);
+  assert(cluster_off >= 0);  // have cluster
 
-        const long long off = pCluster->GetPosition();
-        assert(off >= 0);
+  long long pos_;
+  long len_;
 
-        if (off == cluster_off)  //preloaded already
-        {
-            if (status == 0)  //no entries found
-                return E_FILE_FORMAT_INVALID;
+  status = Cluster::HasBlockEntries(this, cluster_off, pos_, len_);
 
-            if (cluster_size >= 0)
-                pos += cluster_size;
-            else
-            {
-                const long long element_size = pCluster->GetElementSize();
+  if (status < 0) {  // error, or underflow
+    pos = pos_;
+    len = len_;
 
-                if (element_size <= 0)
-                    return E_FILE_FORMAT_INVALID;  //TODO: handle this case
+    return status;
+  }
 
-                pos = pCluster->m_element_start + element_size;
-            }
+  // status == 0 means "no block entries found"
+  // status > 0 means "found at least one block entry"
 
-            pCluster->m_index = idx;  //move from preloaded to loaded
-            ++m_clusterCount;
-            --m_clusterPreloadCount;
+  // TODO:
+  // The issue here is that the segment increments its own
+  // pos ptr past the most recent cluster parsed, and then
+  // starts from there to parse the next cluster.  If we
+  // don't know the size of the current cluster, then we
+  // must either parse its payload (as we do below), looking
+  // for the cluster (or cues) ID to terminate the parse.
+  // This isn't really what we want: rather, we really need
+  // a way to create the curr cluster object immediately.
+  // The pity is that cluster::parse can determine its own
+  // boundary, and we largely duplicate that same logic here.
+  //
+  // Maybe we need to get rid of our look-ahead preloading
+  // in source::parse???
+  //
+  // As we're parsing the blocks in the curr cluster
+  //(in cluster::parse), we should have some way to signal
+  // to the segment that we have determined the boundary,
+  // so it can adjust its own segment::m_pos member.
+  //
+  // The problem is that we're asserting in asyncreadinit,
+  // because we adjust the pos down to the curr seek pos,
+  // and the resulting adjusted len is > 2GB.  I'm suspicious
+  // that this is even correct, but even if it is, we can't
+  // be loading that much data in the cache anyway.
 
-            m_pos = pos;  //consume payload
-            assert((segment_stop < 0) || (m_pos <= segment_stop));
+  const long idx = m_clusterCount;
 
-            return 0;  //success
-        }
-    }
-
-    if (status == 0)  //no entries found
-    {
-        if (cluster_size < 0)
-            return E_FILE_FORMAT_INVALID;  //TODO: handle this
-
-        pos += cluster_size;
-
-        if ((total >= 0) && (pos >= total))
-        {
-            m_pos = total;
-            return 1;  //no more clusters
-        }
-
-        if ((segment_stop >= 0) && (pos >= segment_stop))
-        {
-            m_pos = segment_stop;
-            return 1;  //no more clusters
-        }
-
-        m_pos = pos;
-        return 2;  //try again
-    }
-
-    //status > 0 means we have an entry
-
-    Cluster* const pCluster = Cluster::Create(this,
-                                              idx,
-                                              cluster_off);
-                                              //element_size);
-    assert(pCluster);
-
-    AppendCluster(pCluster);
-    assert(m_clusters);
+  if (m_clusterPreloadCount > 0) {
     assert(idx < m_clusterSize);
-    assert(m_clusters[idx] == pCluster);
 
-    if (cluster_size >= 0)
-    {
+    Cluster* const pCluster = m_clusters[idx];
+    assert(pCluster);
+    assert(pCluster->m_index < 0);
+
+    const long long off = pCluster->GetPosition();
+    assert(off >= 0);
+
+    if (off == cluster_off) {  // preloaded already
+      if (status == 0)  // no entries found
+        return E_FILE_FORMAT_INVALID;
+
+      if (cluster_size >= 0)
         pos += cluster_size;
+      else {
+        const long long element_size = pCluster->GetElementSize();
 
-        m_pos = pos;
-        assert((segment_stop < 0) || (m_pos <= segment_stop));
+        if (element_size <= 0)
+          return E_FILE_FORMAT_INVALID;  // TODO: handle this case
 
-        return 0;
+        pos = pCluster->m_element_start + element_size;
+      }
+
+      pCluster->m_index = idx;  // move from preloaded to loaded
+      ++m_clusterCount;
+      --m_clusterPreloadCount;
+
+      m_pos = pos;  // consume payload
+      assert((segment_stop < 0) || (m_pos <= segment_stop));
+
+      return 0;  // success
+    }
+  }
+
+  if (status == 0) {  // no entries found
+    if (cluster_size < 0)
+      return E_FILE_FORMAT_INVALID;  // TODO: handle this
+
+    pos += cluster_size;
+
+    if ((total >= 0) && (pos >= total)) {
+      m_pos = total;
+      return 1;  // no more clusters
     }
 
-    m_pUnknownSize = pCluster;
-    m_pos = -pos;
+    if ((segment_stop >= 0) && (pos >= segment_stop)) {
+      m_pos = segment_stop;
+      return 1;  // no more clusters
+    }
 
-    return 0;  //partial success, since we have a new cluster
+    m_pos = pos;
+    return 2;  // try again
+  }
 
-    //status == 0 means "no block entries found"
+  // status > 0 means we have an entry
 
-    //pos designates start of payload
-    //m_pos has NOT been adjusted yet (in case we need to come back here)
+  Cluster* const pCluster = Cluster::Create(this, idx, cluster_off);
+  // element_size);
+  assert(pCluster);
+
+  AppendCluster(pCluster);
+  assert(m_clusters);
+  assert(idx < m_clusterSize);
+  assert(m_clusters[idx] == pCluster);
+
+  if (cluster_size >= 0) {
+    pos += cluster_size;
+
+    m_pos = pos;
+    assert((segment_stop < 0) || (m_pos <= segment_stop));
+
+    return 0;
+  }
+
+  m_pUnknownSize = pCluster;
+  m_pos = -pos;
+
+  return 0;  // partial success, since we have a new cluster
+
+// status == 0 means "no block entries found"
+
+// pos designates start of payload
+// m_pos has NOT been adjusted yet (in case we need to come back here)
 
 #if 0
 
-    if (cluster_size < 0)  //unknown size
-    {
+    if (cluster_size < 0) {  //unknown size
         const long long payload_pos = pos;  //absolute pos of cluster payload
 
-        for (;;)  //determine cluster size
-        {
+        for (;;) {  //determine cluster size
             if ((total >= 0) && (pos >= total))
                 break;
 
@@ -1523,16 +1344,11 @@
     return 2;     //try to find another cluster
 
 #endif
-
 }
 
-
-long Segment::DoLoadClusterUnknownSize(
-    long long& pos,
-    long& len)
-{
-    assert(m_pos < 0);
-    assert(m_pUnknownSize);
+long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) {
+  assert(m_pos < 0);
+  assert(m_pUnknownSize);
 
 #if 0
     assert(m_pUnknownSize->GetElementSize() < 0);  //TODO: verify this
@@ -1559,8 +1375,7 @@
 
     long long element_size = -1;
 
-    for (;;)  //determine cluster size
-    {
+    for (;;) {  //determine cluster size
         if ((total >= 0) && (pos >= total))
         {
             element_size = total - element_start;
@@ -1609,8 +1424,7 @@
         //that we have exhausted the sub-element's inside the cluster
         //whose ID we parsed earlier.
 
-        if ((id == 0x0F43B675) || (id == 0x0C53BB6B)) //Cluster ID or Cues ID
-        {
+        if ((id == 0x0F43B675) || (id == 0x0C53BB6B)) {  //Cluster ID or Cues ID
             element_size = pos - element_start;
             assert(element_size > 0);
 
@@ -1687,348 +1501,299 @@
 
     return 2;  //continue parsing
 #else
-    const long status = m_pUnknownSize->Parse(pos, len);
+  const long status = m_pUnknownSize->Parse(pos, len);
 
-    if (status < 0)  //error or underflow
-        return status;
+  if (status < 0)  // error or underflow
+    return status;
 
-    if (status == 0)  //parsed a block
-        return 2;     //continue parsing
+  if (status == 0)  // parsed a block
+    return 2;  // continue parsing
 
-    assert(status > 0);   //nothing left to parse of this cluster
+  assert(status > 0);  // nothing left to parse of this cluster
 
-    const long long start = m_pUnknownSize->m_element_start;
+  const long long start = m_pUnknownSize->m_element_start;
 
-    const long long size = m_pUnknownSize->GetElementSize();
-    assert(size >= 0);
+  const long long size = m_pUnknownSize->GetElementSize();
+  assert(size >= 0);
 
-    pos = start + size;
-    m_pos = pos;
+  pos = start + size;
+  m_pos = pos;
 
-    m_pUnknownSize = 0;
+  m_pUnknownSize = 0;
 
-    return 2;  //continue parsing
+  return 2;  // continue parsing
 #endif
 }
 
+void Segment::AppendCluster(Cluster* pCluster) {
+  assert(pCluster);
+  assert(pCluster->m_index >= 0);
 
-void Segment::AppendCluster(Cluster* pCluster)
-{
-    assert(pCluster);
-    assert(pCluster->m_index >= 0);
+  const long count = m_clusterCount + m_clusterPreloadCount;
 
-    const long count = m_clusterCount + m_clusterPreloadCount;
+  long& size = m_clusterSize;
+  assert(size >= count);
 
-    long& size = m_clusterSize;
-    assert(size >= count);
+  const long idx = pCluster->m_index;
+  assert(idx == m_clusterCount);
 
-    const long idx = pCluster->m_index;
-    assert(idx == m_clusterCount);
+  if (count >= size) {
+    const long n = (size <= 0) ? 2048 : 2 * size;
 
-    if (count >= size)
-    {
-        const long n = (size <= 0) ? 2048 : 2*size;
+    Cluster** const qq = new Cluster* [n];
+    Cluster** q = qq;
 
-        Cluster** const qq = new Cluster*[n];
-        Cluster** q = qq;
+    Cluster** p = m_clusters;
+    Cluster** const pp = p + count;
 
-        Cluster** p = m_clusters;
-        Cluster** const pp = p + count;
+    while (p != pp)
+      *q++ = *p++;
 
-        while (p != pp)
-            *q++ = *p++;
+    delete[] m_clusters;
 
-        delete[] m_clusters;
+    m_clusters = qq;
+    size = n;
+  }
 
-        m_clusters = qq;
-        size = n;
-    }
-
-    if (m_clusterPreloadCount > 0)
-    {
-        assert(m_clusters);
-
-        Cluster** const p = m_clusters + m_clusterCount;
-        assert(*p);
-        assert((*p)->m_index < 0);
-
-        Cluster** q = p + m_clusterPreloadCount;
-        assert(q < (m_clusters + size));
-
-        for (;;)
-        {
-            Cluster** const qq = q - 1;
-            assert((*qq)->m_index < 0);
-
-            *q = *qq;
-            q = qq;
-
-            if (q == p)
-                break;
-        }
-    }
-
-    m_clusters[idx] = pCluster;
-    ++m_clusterCount;
-}
-
-
-void Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx)
-{
-    assert(pCluster);
-    assert(pCluster->m_index < 0);
-    assert(idx >= m_clusterCount);
-
-    const long count = m_clusterCount + m_clusterPreloadCount;
-
-    long& size = m_clusterSize;
-    assert(size >= count);
-
-    if (count >= size)
-    {
-        const long n = (size <= 0) ? 2048 : 2*size;
-
-        Cluster** const qq = new Cluster*[n];
-        Cluster** q = qq;
-
-        Cluster** p = m_clusters;
-        Cluster** const pp = p + count;
-
-        while (p != pp)
-            *q++ = *p++;
-
-        delete[] m_clusters;
-
-        m_clusters = qq;
-        size = n;
-    }
-
+  if (m_clusterPreloadCount > 0) {
     assert(m_clusters);
 
-    Cluster** const p = m_clusters + idx;
+    Cluster** const p = m_clusters + m_clusterCount;
+    assert(*p);
+    assert((*p)->m_index < 0);
 
-    Cluster** q = m_clusters + count;
-    assert(q >= p);
+    Cluster** q = p + m_clusterPreloadCount;
     assert(q < (m_clusters + size));
 
-    while (q > p)
-    {
-        Cluster** const qq = q - 1;
-        assert((*qq)->m_index < 0);
+    for (;;) {
+      Cluster** const qq = q - 1;
+      assert((*qq)->m_index < 0);
 
-        *q = *qq;
-        q = qq;
+      *q = *qq;
+      q = qq;
+
+      if (q == p)
+        break;
     }
+  }
 
-    m_clusters[idx] = pCluster;
-    ++m_clusterPreloadCount;
+  m_clusters[idx] = pCluster;
+  ++m_clusterCount;
 }
 
+void Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
+  assert(pCluster);
+  assert(pCluster->m_index < 0);
+  assert(idx >= m_clusterCount);
 
-long Segment::Load()
-{
-    assert(m_clusters == NULL);
-    assert(m_clusterSize == 0);
-    assert(m_clusterCount == 0);
-    //assert(m_size >= 0);
+  const long count = m_clusterCount + m_clusterPreloadCount;
 
-    //Outermost (level 0) segment object has been constructed,
-    //and pos designates start of payload.  We need to find the
-    //inner (level 1) elements.
+  long& size = m_clusterSize;
+  assert(size >= count);
 
-    const long long header_status = ParseHeaders();
+  if (count >= size) {
+    const long n = (size <= 0) ? 2048 : 2 * size;
 
-    if (header_status < 0)  //error
-        return static_cast<long>(header_status);
+    Cluster** const qq = new Cluster* [n];
+    Cluster** q = qq;
 
-    if (header_status > 0)  //underflow
-        return E_BUFFER_NOT_FULL;
+    Cluster** p = m_clusters;
+    Cluster** const pp = p + count;
 
-    assert(m_pInfo);
-    assert(m_pTracks);
+    while (p != pp)
+      *q++ = *p++;
 
-    for (;;)
-    {
-        const int status = LoadCluster();
+    delete[] m_clusters;
 
-        if (status < 0)  //error
-            return status;
+    m_clusters = qq;
+    size = n;
+  }
 
-        if (status >= 1)  //no more clusters
-            return 0;
-    }
+  assert(m_clusters);
+
+  Cluster** const p = m_clusters + idx;
+
+  Cluster** q = m_clusters + count;
+  assert(q >= p);
+  assert(q < (m_clusters + size));
+
+  while (q > p) {
+    Cluster** const qq = q - 1;
+    assert((*qq)->m_index < 0);
+
+    *q = *qq;
+    q = qq;
+  }
+
+  m_clusters[idx] = pCluster;
+  ++m_clusterPreloadCount;
 }
 
+long Segment::Load() {
+  assert(m_clusters == NULL);
+  assert(m_clusterSize == 0);
+  assert(m_clusterCount == 0);
+  // assert(m_size >= 0);
 
-SeekHead::SeekHead(
-    Segment* pSegment,
-    long long start,
-    long long size_,
-    long long element_start,
-    long long element_size) :
-    m_pSegment(pSegment),
-    m_start(start),
-    m_size(size_),
-    m_element_start(element_start),
-    m_element_size(element_size),
-    m_entries(0),
-    m_entry_count(0),
-    m_void_elements(0),
-    m_void_element_count(0)
-{
+  // Outermost (level 0) segment object has been constructed,
+  // and pos designates start of payload.  We need to find the
+  // inner (level 1) elements.
+
+  const long long header_status = ParseHeaders();
+
+  if (header_status < 0)  // error
+    return static_cast<long>(header_status);
+
+  if (header_status > 0)  // underflow
+    return E_BUFFER_NOT_FULL;
+
+  assert(m_pInfo);
+  assert(m_pTracks);
+
+  for (;;) {
+    const int status = LoadCluster();
+
+    if (status < 0)  // error
+      return status;
+
+    if (status >= 1)  // no more clusters
+      return 0;
+  }
 }
 
+SeekHead::SeekHead(Segment* pSegment, long long start, long long size_,
+                   long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_entries(0),
+      m_entry_count(0),
+      m_void_elements(0),
+      m_void_element_count(0) {}
 
-SeekHead::~SeekHead()
-{
-    delete[] m_entries;
-    delete[] m_void_elements;
+SeekHead::~SeekHead() {
+  delete[] m_entries;
+  delete[] m_void_elements;
 }
 
+long SeekHead::Parse() {
+  IMkvReader* const pReader = m_pSegment->m_pReader;
 
-long SeekHead::Parse()
-{
-    IMkvReader* const pReader = m_pSegment->m_pReader;
+  long long pos = m_start;
+  const long long stop = m_start + m_size;
 
-    long long pos = m_start;
-    const long long stop = m_start + m_size;
+  // first count the seek head entries
 
-    //first count the seek head entries
+  int entry_count = 0;
+  int void_element_count = 0;
 
-    int entry_count = 0;
-    int void_element_count = 0;
+  while (pos < stop) {
+    long long id, size;
 
-    while (pos < stop)
-    {
-        long long id, size;
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
 
-        const long status = ParseElementHeader(
-                                pReader,
-                                pos,
-                                stop,
-                                id,
-                                size);
+    if (status < 0)  // error
+      return status;
 
-        if (status < 0)  //error
-            return status;
+    if (id == 0x0DBB)  // SeekEntry ID
+      ++entry_count;
+    else if (id == 0x6C)  // Void ID
+      ++void_element_count;
 
-        if (id == 0x0DBB)  //SeekEntry ID
-            ++entry_count;
-        else if (id == 0x6C)  //Void ID
-            ++void_element_count;
+    pos += size;  // consume payload
+    assert(pos <= stop);
+  }
 
-        pos += size;  //consume payload
-        assert(pos <= stop);
+  assert(pos == stop);
+
+  m_entries = new (std::nothrow) Entry[entry_count];
+
+  if (m_entries == NULL)
+    return -1;
+
+  m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+
+  if (m_void_elements == NULL)
+    return -1;
+
+  // now parse the entries and void elements
+
+  Entry* pEntry = m_entries;
+  VoidElement* pVoidElement = m_void_elements;
+
+  pos = m_start;
+
+  while (pos < stop) {
+    const long long idpos = pos;
+
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == 0x0DBB) {  // SeekEntry ID
+      if (ParseEntry(pReader, pos, size, pEntry)) {
+        Entry& e = *pEntry++;
+
+        e.element_start = idpos;
+        e.element_size = (pos + size) - idpos;
+      }
+    } else if (id == 0x6C) {  // Void ID
+      VoidElement& e = *pVoidElement++;
+
+      e.element_start = idpos;
+      e.element_size = (pos + size) - idpos;
     }
 
-    assert(pos == stop);
+    pos += size;  // consume payload
+    assert(pos <= stop);
+  }
 
-    m_entries = new (std::nothrow) Entry[entry_count];
+  assert(pos == stop);
 
-    if (m_entries == NULL)
-        return -1;
+  ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
+  assert(count_ >= 0);
+  assert(count_ <= entry_count);
 
-    m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+  m_entry_count = static_cast<int>(count_);
 
-    if (m_void_elements == NULL)
-        return -1;
+  count_ = ptrdiff_t(pVoidElement - m_void_elements);
+  assert(count_ >= 0);
+  assert(count_ <= void_element_count);
 
-    //now parse the entries and void elements
+  m_void_element_count = static_cast<int>(count_);
 
-    Entry* pEntry = m_entries;
-    VoidElement* pVoidElement = m_void_elements;
+  return 0;
+}
 
-    pos = m_start;
+int SeekHead::GetCount() const { return m_entry_count; }
 
-    while (pos < stop)
-    {
-        const long long idpos = pos;
-
-        long long id, size;
-
-        const long status = ParseElementHeader(
-                                pReader,
-                                pos,
-                                stop,
-                                id,
-                                size);
-
-        if (status < 0)  //error
-            return status;
-
-        if (id == 0x0DBB)  //SeekEntry ID
-        {
-            if (ParseEntry(pReader, pos, size, pEntry))
-            {
-                Entry& e = *pEntry++;
-
-                e.element_start = idpos;
-                e.element_size = (pos + size) - idpos;
-            }
-        }
-        else if (id == 0x6C)  //Void ID
-        {
-            VoidElement& e = *pVoidElement++;
-
-            e.element_start = idpos;
-            e.element_size = (pos + size) - idpos;
-        }
-
-        pos += size;  //consume payload
-        assert(pos <= stop);
-    }
-
-    assert(pos == stop);
-
-    ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
-    assert(count_ >= 0);
-    assert(count_ <= entry_count);
-
-    m_entry_count = static_cast<int>(count_);
-
-    count_ = ptrdiff_t(pVoidElement - m_void_elements);
-    assert(count_ >= 0);
-    assert(count_ <= void_element_count);
-
-    m_void_element_count = static_cast<int>(count_);
-
+const SeekHead::Entry* SeekHead::GetEntry(int idx) const {
+  if (idx < 0)
     return 0;
+
+  if (idx >= m_entry_count)
+    return 0;
+
+  return m_entries + idx;
 }
 
+int SeekHead::GetVoidElementCount() const { return m_void_element_count; }
 
-int SeekHead::GetCount() const
-{
-    return m_entry_count;
+const SeekHead::VoidElement* SeekHead::GetVoidElement(int idx) const {
+  if (idx < 0)
+    return 0;
+
+  if (idx >= m_void_element_count)
+    return 0;
+
+  return m_void_elements + idx;
 }
 
-const SeekHead::Entry* SeekHead::GetEntry(int idx) const
-{
-    if (idx < 0)
-        return 0;
-
-    if (idx >= m_entry_count)
-        return 0;
-
-    return m_entries + idx;
-}
-
-int SeekHead::GetVoidElementCount() const
-{
-    return m_void_element_count;
-}
-
-const SeekHead::VoidElement* SeekHead::GetVoidElement(int idx) const
-{
-    if (idx < 0)
-        return 0;
-
-    if (idx >= m_void_element_count)
-        return 0;
-
-    return m_void_elements + idx;
-}
-
-
 #if 0
 void Segment::ParseCues(long long off)
 {
@@ -2078,133 +1843,122 @@
     //os << "Segment::ParseCues (end)" << endl;
 }
 #else
-long Segment::ParseCues(
-    long long off,
-    long long& pos,
-    long& len)
-{
-    if (m_pCues)
-        return 0;  //success
+long Segment::ParseCues(long long off, long long& pos, long& len) {
+  if (m_pCues)
+    return 0;  // success
 
-    if (off < 0)
-        return -1;
+  if (off < 0)
+    return -1;
 
-    long long total, avail;
+  long long total, avail;
 
-    const int status = m_pReader->Length(&total, &avail);
+  const int status = m_pReader->Length(&total, &avail);
 
-    if (status < 0)  //error
-        return status;
+  if (status < 0)  // error
+    return status;
 
-    assert((total < 0) || (avail <= total));
+  assert((total < 0) || (avail <= total));
 
-    pos = m_start + off;
+  pos = m_start + off;
 
-    if ((total < 0) || (pos >= total))
-        return 1;  //don't bother parsing cues
+  if ((total < 0) || (pos >= total))
+    return 1;  // don't bother parsing cues
 
-    const long long element_start = pos;
-    const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+  const long long element_start = pos;
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
 
-    if ((pos + 1) > avail)
-    {
-        len = 1;
-        return E_BUFFER_NOT_FULL;
-    }
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
 
-    long long result = GetUIntLength(m_pReader, pos, len);
+  long long result = GetUIntLength(m_pReader, pos, len);
 
-    if (result < 0)  //error
-        return static_cast<long>(result);
+  if (result < 0)  // error
+    return static_cast<long>(result);
 
-    if (result > 0) //underflow (weird)
-    {
-        len = 1;
-        return E_BUFFER_NOT_FULL;
-    }
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
 
-    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-        return E_FILE_FORMAT_INVALID;
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
 
-    if ((pos + len) > avail)
-        return E_BUFFER_NOT_FULL;
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
 
-    const long long idpos = pos;
+  const long long idpos = pos;
 
-    const long long id = ReadUInt(m_pReader, idpos, len);
+  const long long id = ReadUInt(m_pReader, idpos, len);
 
-    if (id != 0x0C53BB6B)  //Cues ID
-        return E_FILE_FORMAT_INVALID;
+  if (id != 0x0C53BB6B)  // Cues ID
+    return E_FILE_FORMAT_INVALID;
 
-    pos += len;  //consume ID
-    assert((segment_stop < 0) || (pos <= segment_stop));
+  pos += len;  // consume ID
+  assert((segment_stop < 0) || (pos <= segment_stop));
 
-    //Read Size
+  // Read Size
 
-    if ((pos + 1) > avail)
-    {
-        len = 1;
-        return E_BUFFER_NOT_FULL;
-    }
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
 
-    result = GetUIntLength(m_pReader, pos, len);
+  result = GetUIntLength(m_pReader, pos, len);
 
-    if (result < 0)  //error
-        return static_cast<long>(result);
+  if (result < 0)  // error
+    return static_cast<long>(result);
 
-    if (result > 0) //underflow (weird)
-    {
-        len = 1;
-        return E_BUFFER_NOT_FULL;
-    }
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
 
-    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-        return E_FILE_FORMAT_INVALID;
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
 
-    if ((pos + len) > avail)
-        return E_BUFFER_NOT_FULL;
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
 
-    const long long size = ReadUInt(m_pReader, pos, len);
+  const long long size = ReadUInt(m_pReader, pos, len);
 
-    if (size < 0)  //error
-        return static_cast<long>(size);
+  if (size < 0)  // error
+    return static_cast<long>(size);
 
-    if (size == 0)  //weird, although technically not illegal
-        return 1;   //done
+  if (size == 0)  // weird, although technically not illegal
+    return 1;  // done
 
-    pos += len;  //consume length of size of element
-    assert((segment_stop < 0) || (pos <= segment_stop));
+  pos += len;  // consume length of size of element
+  assert((segment_stop < 0) || (pos <= segment_stop));
 
-    //Pos now points to start of payload
+  // Pos now points to start of payload
 
-    const long long element_stop = pos + size;
+  const long long element_stop = pos + size;
 
-    if ((segment_stop >= 0) && (element_stop > segment_stop))
-        return E_FILE_FORMAT_INVALID;
+  if ((segment_stop >= 0) && (element_stop > segment_stop))
+    return E_FILE_FORMAT_INVALID;
 
-    if ((total >= 0) && (element_stop > total))
-        return 1;  //don't bother parsing anymore
+  if ((total >= 0) && (element_stop > total))
+    return 1;  // don't bother parsing anymore
 
-    len = static_cast<long>(size);
+  len = static_cast<long>(size);
 
-    if (element_stop > avail)
-        return E_BUFFER_NOT_FULL;
+  if (element_stop > avail)
+    return E_BUFFER_NOT_FULL;
 
-    const long long element_size = element_stop - element_start;
+  const long long element_size = element_stop - element_start;
 
-    m_pCues = new (std::nothrow) Cues(
-                                    this,
-                                    pos,
-                                    size,
-                                    element_start,
-                                    element_size);
-    assert(m_pCues);  //TODO
+  m_pCues =
+      new (std::nothrow) Cues(this, pos, size, element_start, element_size);
+  assert(m_pCues);  // TODO
 
-    return 0;  //success
+  return 0;  // success
 }
 #endif
 
-
 #if 0
 void Segment::ParseSeekEntry(
     long long start,
@@ -2264,304 +2018,269 @@
         ParseCues(seekOff);
 }
 #else
-bool SeekHead::ParseEntry(
-    IMkvReader* pReader,
-    long long start,
-    long long size_,
-    Entry* pEntry)
-{
-    if (size_ <= 0)
-        return false;
+bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
+                          Entry* pEntry) {
+  if (size_ <= 0)
+    return false;
 
-    long long pos = start;
-    const long long stop = start + size_;
+  long long pos = start;
+  const long long stop = start + size_;
 
-    long len;
+  long len;
 
-    //parse the container for the level-1 element ID
+  // parse the container for the level-1 element ID
 
-    const long long seekIdId = ReadUInt(pReader, pos, len);
-    //seekIdId;
+  const long long seekIdId = ReadUInt(pReader, pos, len);
+  // seekIdId;
 
-    if (seekIdId != 0x13AB)  //SeekID ID
-        return false;
+  if (seekIdId != 0x13AB)  // SeekID ID
+    return false;
 
-    if ((pos + len) > stop)
-        return false;
+  if ((pos + len) > stop)
+    return false;
 
-    pos += len;  //consume SeekID id
+  pos += len;  // consume SeekID id
 
-    const long long seekIdSize = ReadUInt(pReader, pos, len);
+  const long long seekIdSize = ReadUInt(pReader, pos, len);
 
-    if (seekIdSize <= 0)
-        return false;
+  if (seekIdSize <= 0)
+    return false;
 
-    if ((pos + len) > stop)
-        return false;
+  if ((pos + len) > stop)
+    return false;
 
-    pos += len;  //consume size of field
+  pos += len;  // consume size of field
 
-    if ((pos + seekIdSize) > stop)
-        return false;
+  if ((pos + seekIdSize) > stop)
+    return false;
 
-    //Note that the SeekId payload really is serialized
-    //as a "Matroska integer", not as a plain binary value.
-    //In fact, Matroska requires that ID values in the
-    //stream exactly match the binary representation as listed
-    //in the Matroska specification.
-    //
-    //This parser is more liberal, and permits IDs to have
-    //any width.  (This could make the representation in the stream
-    //different from what's in the spec, but it doesn't matter here,
-    //since we always normalize "Matroska integer" values.)
+  // Note that the SeekId payload really is serialized
+  // as a "Matroska integer", not as a plain binary value.
+  // In fact, Matroska requires that ID values in the
+  // stream exactly match the binary representation as listed
+  // in the Matroska specification.
+  //
+  // This parser is more liberal, and permits IDs to have
+  // any width.  (This could make the representation in the stream
+  // different from what's in the spec, but it doesn't matter here,
+  // since we always normalize "Matroska integer" values.)
 
-    pEntry->id = ReadUInt(pReader, pos, len);  //payload
+  pEntry->id = ReadUInt(pReader, pos, len);  // payload
 
-    if (pEntry->id <= 0)
-        return false;
+  if (pEntry->id <= 0)
+    return false;
 
-    if (len != seekIdSize)
-        return false;
+  if (len != seekIdSize)
+    return false;
 
-    pos += seekIdSize;  //consume SeekID payload
+  pos += seekIdSize;  // consume SeekID payload
 
-    const long long seekPosId = ReadUInt(pReader, pos, len);
+  const long long seekPosId = ReadUInt(pReader, pos, len);
 
-    if (seekPosId != 0x13AC)  //SeekPos ID
-        return false;
+  if (seekPosId != 0x13AC)  // SeekPos ID
+    return false;
 
-    if ((pos + len) > stop)
-        return false;
+  if ((pos + len) > stop)
+    return false;
 
-    pos += len;  //consume id
+  pos += len;  // consume id
 
-    const long long seekPosSize = ReadUInt(pReader, pos, len);
+  const long long seekPosSize = ReadUInt(pReader, pos, len);
 
-    if (seekPosSize <= 0)
-        return false;
+  if (seekPosSize <= 0)
+    return false;
 
-    if ((pos + len) > stop)
-        return false;
+  if ((pos + len) > stop)
+    return false;
 
-    pos += len;  //consume size
+  pos += len;  // consume size
 
-    if ((pos + seekPosSize) > stop)
-        return false;
+  if ((pos + seekPosSize) > stop)
+    return false;
 
-    pEntry->pos = UnserializeUInt(pReader, pos, seekPosSize);
+  pEntry->pos = UnserializeUInt(pReader, pos, seekPosSize);
 
-    if (pEntry->pos < 0)
-        return false;
+  if (pEntry->pos < 0)
+    return false;
 
-    pos += seekPosSize;  //consume payload
+  pos += seekPosSize;  // consume payload
 
-    if (pos != stop)
-        return false;
+  if (pos != stop)
+    return false;
 
-    return true;
+  return true;
 }
 #endif
 
+Cues::Cues(Segment* pSegment, long long start_, long long size_,
+           long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start_),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_cue_points(NULL),
+      m_count(0),
+      m_preload_count(0),
+      m_pos(start_) {}
 
-Cues::Cues(
-    Segment* pSegment,
-    long long start_,
-    long long size_,
-    long long element_start,
-    long long element_size) :
-    m_pSegment(pSegment),
-    m_start(start_),
-    m_size(size_),
-    m_element_start(element_start),
-    m_element_size(element_size),
-    m_cue_points(NULL),
-    m_count(0),
-    m_preload_count(0),
-    m_pos(start_)
-{
+Cues::~Cues() {
+  const long n = m_count + m_preload_count;
+
+  CuePoint** p = m_cue_points;
+  CuePoint** const q = p + n;
+
+  while (p != q) {
+    CuePoint* const pCP = *p++;
+    assert(pCP);
+
+    delete pCP;
+  }
+
+  delete[] m_cue_points;
 }
 
+long Cues::GetCount() const {
+  if (m_cue_points == NULL)
+    return -1;
 
-Cues::~Cues()
-{
-    const long n = m_count + m_preload_count;
+  return m_count;  // TODO: really ignore preload count?
+}
 
-    CuePoint** p = m_cue_points;
-    CuePoint** const q = p + n;
+bool Cues::DoneParsing() const {
+  const long long stop = m_start + m_size;
+  return (m_pos >= stop);
+}
 
-    while (p != q)
-    {
-        CuePoint* const pCP = *p++;
-        assert(pCP);
+void Cues::Init() const {
+  if (m_cue_points)
+    return;
 
-        delete pCP;
-    }
+  assert(m_count == 0);
+  assert(m_preload_count == 0);
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  const long long stop = m_start + m_size;
+  long long pos = m_start;
+
+  long cue_points_size = 0;
+
+  while (pos < stop) {
+    const long long idpos = pos;
+
+    long len;
+
+    const long long id = ReadUInt(pReader, pos, len);
+    assert(id >= 0);  // TODO
+    assert((pos + len) <= stop);
+
+    pos += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos, len);
+    assert(size >= 0);
+    assert((pos + len) <= stop);
+
+    pos += len;  // consume Size field
+    assert((pos + size) <= stop);
+
+    if (id == 0x3B)  // CuePoint ID
+      PreloadCuePoint(cue_points_size, idpos);
+
+    pos += size;  // consume payload
+    assert(pos <= stop);
+  }
+}
+
+void Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
+  assert(m_count == 0);
+
+  if (m_preload_count >= cue_points_size) {
+    const long n = (cue_points_size <= 0) ? 2048 : 2 * cue_points_size;
+
+    CuePoint** const qq = new CuePoint* [n];
+    CuePoint** q = qq;  // beginning of target
+
+    CuePoint** p = m_cue_points;  // beginning of source
+    CuePoint** const pp = p + m_preload_count;  // end of source
+
+    while (p != pp)
+      *q++ = *p++;
 
     delete[] m_cue_points;
+
+    m_cue_points = qq;
+    cue_points_size = n;
+  }
+
+  CuePoint* const pCP = new CuePoint(m_preload_count, pos);
+  m_cue_points[m_preload_count++] = pCP;
 }
 
+bool Cues::LoadCuePoint() const {
+  // odbgstream os;
+  // os << "Cues::LoadCuePoint" << endl;
 
-long Cues::GetCount() const
-{
-    if (m_cue_points == NULL)
-        return -1;
+  const long long stop = m_start + m_size;
 
-    return m_count;  //TODO: really ignore preload count?
-}
+  if (m_pos >= stop)
+    return false;  // nothing else to do
 
+  Init();
 
-bool Cues::DoneParsing() const
-{
-    const long long stop = m_start + m_size;
-    return (m_pos >= stop);
-}
+  IMkvReader* const pReader = m_pSegment->m_pReader;
 
+  while (m_pos < stop) {
+    const long long idpos = m_pos;
 
-void Cues::Init() const
-{
-    if (m_cue_points)
-        return;
+    long len;
 
-    assert(m_count == 0);
-    assert(m_preload_count == 0);
+    const long long id = ReadUInt(pReader, m_pos, len);
+    assert(id >= 0);  // TODO
+    assert((m_pos + len) <= stop);
 
-    IMkvReader* const pReader = m_pSegment->m_pReader;
+    m_pos += len;  // consume ID
 
-    const long long stop = m_start + m_size;
-    long long pos = m_start;
+    const long long size = ReadUInt(pReader, m_pos, len);
+    assert(size >= 0);
+    assert((m_pos + len) <= stop);
 
-    long cue_points_size = 0;
+    m_pos += len;  // consume Size field
+    assert((m_pos + size) <= stop);
 
-    while (pos < stop)
-    {
-        const long long idpos = pos;
+    if (id != 0x3B) {  // CuePoint ID
+      m_pos += size;  // consume payload
+      assert(m_pos <= stop);
 
-        long len;
-
-        const long long id = ReadUInt(pReader, pos, len);
-        assert(id >= 0);  //TODO
-        assert((pos + len) <= stop);
-
-        pos += len;  //consume ID
-
-        const long long size = ReadUInt(pReader, pos, len);
-        assert(size >= 0);
-        assert((pos + len) <= stop);
-
-        pos += len;  //consume Size field
-        assert((pos + size) <= stop);
-
-        if (id == 0x3B)  //CuePoint ID
-            PreloadCuePoint(cue_points_size, idpos);
-
-        pos += size;  //consume payload
-        assert(pos <= stop);
-    }
-}
-
-
-void Cues::PreloadCuePoint(
-    long& cue_points_size,
-    long long pos) const
-{
-    assert(m_count == 0);
-
-    if (m_preload_count >= cue_points_size)
-    {
-        const long n = (cue_points_size <= 0) ? 2048 : 2*cue_points_size;
-
-        CuePoint** const qq = new CuePoint*[n];
-        CuePoint** q = qq;  //beginning of target
-
-        CuePoint** p = m_cue_points;                //beginning of source
-        CuePoint** const pp = p + m_preload_count;  //end of source
-
-        while (p != pp)
-            *q++ = *p++;
-
-        delete[] m_cue_points;
-
-        m_cue_points = qq;
-        cue_points_size = n;
+      continue;
     }
 
-    CuePoint* const pCP = new CuePoint(m_preload_count, pos);
-    m_cue_points[m_preload_count++] = pCP;
+    assert(m_preload_count > 0);
+
+    CuePoint* const pCP = m_cue_points[m_count];
+    assert(pCP);
+    assert((pCP->GetTimeCode() >= 0) || (-pCP->GetTimeCode() == idpos));
+    if (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos))
+      return false;
+
+    pCP->Load(pReader);
+    ++m_count;
+    --m_preload_count;
+
+    m_pos += size;  // consume payload
+    assert(m_pos <= stop);
+
+    return true;  // yes, we loaded a cue point
+  }
+
+  // return (m_pos < stop);
+  return false;  // no, we did not load a cue point
 }
 
-
-bool Cues::LoadCuePoint() const
-{
-    //odbgstream os;
-    //os << "Cues::LoadCuePoint" << endl;
-
-    const long long stop = m_start + m_size;
-
-    if (m_pos >= stop)
-        return false;  //nothing else to do
-
-    Init();
-
-    IMkvReader* const pReader = m_pSegment->m_pReader;
-
-    while (m_pos < stop)
-    {
-        const long long idpos = m_pos;
-
-        long len;
-
-        const long long id = ReadUInt(pReader, m_pos, len);
-        assert(id >= 0);  //TODO
-        assert((m_pos + len) <= stop);
-
-        m_pos += len;  //consume ID
-
-        const long long size = ReadUInt(pReader, m_pos, len);
-        assert(size >= 0);
-        assert((m_pos + len) <= stop);
-
-        m_pos += len;  //consume Size field
-        assert((m_pos + size) <= stop);
-
-        if (id != 0x3B)  //CuePoint ID
-        {
-            m_pos += size;  //consume payload
-            assert(m_pos <= stop);
-
-            continue;
-        }
-
-        assert(m_preload_count > 0);
-
-        CuePoint* const pCP = m_cue_points[m_count];
-        assert(pCP);
-        assert((pCP->GetTimeCode() >= 0) || (-pCP->GetTimeCode() == idpos));
-        if (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos))
-            return false;
-
-        pCP->Load(pReader);
-        ++m_count;
-        --m_preload_count;
-
-        m_pos += size;  //consume payload
-        assert(m_pos <= stop);
-
-        return true;  //yes, we loaded a cue point
-    }
-
-    //return (m_pos < stop);
-    return false;  //no, we did not load a cue point
-}
-
-
-bool Cues::Find(
-    long long time_ns,
-    const Track* pTrack,
-    const CuePoint*& pCP,
-    const CuePoint::TrackPosition*& pTP) const
-{
-    assert(time_ns >= 0);
-    assert(pTrack);
+bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
+                const CuePoint::TrackPosition*& pTP) const {
+  assert(time_ns >= 0);
+  assert(pTrack);
 
 #if 0
     LoadCuePoint();  //establish invariant
@@ -2619,71 +2338,68 @@
     assert(pCP);
     assert(pCP->GetTime(m_pSegment) <= time_ns);
 #else
-    if (m_cue_points == NULL)
-        return false;
+  if (m_cue_points == NULL)
+    return false;
 
-    if (m_count == 0)
-        return false;
+  if (m_count == 0)
+    return false;
 
-    CuePoint** const ii = m_cue_points;
-    CuePoint** i = ii;
+  CuePoint** const ii = m_cue_points;
+  CuePoint** i = ii;
 
-    CuePoint** const jj = ii + m_count;
-    CuePoint** j = jj;
+  CuePoint** const jj = ii + m_count;
+  CuePoint** j = jj;
 
-    pCP = *i;
-    assert(pCP);
+  pCP = *i;
+  assert(pCP);
 
-    if (time_ns <= pCP->GetTime(m_pSegment))
-    {
-        pTP = pCP->Find(pTrack);
-        return (pTP != NULL);
-    }
-
-    while (i < j)
-    {
-        //INVARIANT:
-        //[ii, i) <= time_ns
-        //[i, j)  ?
-        //[j, jj) > time_ns
-
-        CuePoint** const k = i + (j - i) / 2;
-        assert(k < jj);
-
-        CuePoint* const pCP = *k;
-        assert(pCP);
-
-        const long long t = pCP->GetTime(m_pSegment);
-
-        if (t <= time_ns)
-            i = k + 1;
-        else
-            j = k;
-
-        assert(i <= j);
-    }
-
-    assert(i == j);
-    assert(i <= jj);
-    assert(i > ii);
-
-    pCP = *--i;
-    assert(pCP);
-    assert(pCP->GetTime(m_pSegment) <= time_ns);
-#endif
-
-    //TODO: here and elsewhere, it's probably not correct to search
-    //for the cue point with this time, and then search for a matching
-    //track.  In principle, the matching track could be on some earlier
-    //cue point, and with our current algorithm, we'd miss it.  To make
-    //this bullet-proof, we'd need to create a secondary structure,
-    //with a list of cue points that apply to a track, and then search
-    //that track-based structure for a matching cue point.
-
+  if (time_ns <= pCP->GetTime(m_pSegment)) {
     pTP = pCP->Find(pTrack);
     return (pTP != NULL);
-}
+  }
 
+  while (i < j) {
+    // INVARIANT:
+    //[ii, i) <= time_ns
+    //[i, j)  ?
+    //[j, jj) > time_ns
+
+    CuePoint** const k = i + (j - i) / 2;
+    assert(k < jj);
+
+    CuePoint* const pCP = *k;
+    assert(pCP);
+
+    const long long t = pCP->GetTime(m_pSegment);
+
+    if (t <= time_ns)
+      i = k + 1;
+    else
+      j = k;
+
+    assert(i <= j);
+  }
+
+  assert(i == j);
+  assert(i <= jj);
+  assert(i > ii);
+
+  pCP = *--i;
+  assert(pCP);
+  assert(pCP->GetTime(m_pSegment) <= time_ns);
+#endif
+
+  // TODO: here and elsewhere, it's probably not correct to search
+  // for the cue point with this time, and then search for a matching
+  // track.  In principle, the matching track could be on some earlier
+  // cue point, and with our current algorithm, we'd miss it.  To make
+  // this bullet-proof, we'd need to create a secondary structure,
+  // with a list of cue points that apply to a track, and then search
+  // that track-based structure for a matching cue point.
+
+  pTP = pCP->Find(pTrack);
+  return (pTP != NULL);
+}
 
 #if 0
 bool Cues::FindNext(
@@ -2744,14 +2460,12 @@
 }
 #endif
 
+const CuePoint* Cues::GetFirst() const {
+  if (m_cue_points == NULL)
+    return NULL;
 
-const CuePoint* Cues::GetFirst() const
-{
-    if (m_cue_points == NULL)
-        return NULL;
-
-    if (m_count == 0)
-        return NULL;
+  if (m_count == 0)
+    return NULL;
 
 #if 0
     LoadCuePoint();  //init cues
@@ -2762,24 +2476,22 @@
         return NULL;
 #endif
 
-    CuePoint* const* const pp = m_cue_points;
-    assert(pp);
+  CuePoint* const* const pp = m_cue_points;
+  assert(pp);
 
-    CuePoint* const pCP = pp[0];
-    assert(pCP);
-    assert(pCP->GetTimeCode() >= 0);
+  CuePoint* const pCP = pp[0];
+  assert(pCP);
+  assert(pCP->GetTimeCode() >= 0);
 
-    return pCP;
+  return pCP;
 }
 
+const CuePoint* Cues::GetLast() const {
+  if (m_cue_points == NULL)
+    return NULL;
 
-const CuePoint* Cues::GetLast() const
-{
-    if (m_cue_points == NULL)
-        return NULL;
-
-    if (m_count <= 0)
-        return NULL;
+  if (m_count <= 0)
+    return NULL;
 
 #if 0
     LoadCuePoint();  //init cues
@@ -2800,28 +2512,26 @@
     pCP->Load(m_pSegment->m_pReader);
     assert(pCP->GetTimeCode() >= 0);
 #else
-    const long index = m_count - 1;
+  const long index = m_count - 1;
 
-    CuePoint* const* const pp = m_cue_points;
-    assert(pp);
+  CuePoint* const* const pp = m_cue_points;
+  assert(pp);
 
-    CuePoint* const pCP = pp[index];
-    assert(pCP);
-    assert(pCP->GetTimeCode() >= 0);
+  CuePoint* const pCP = pp[index];
+  assert(pCP);
+  assert(pCP->GetTimeCode() >= 0);
 #endif
 
-    return pCP;
+  return pCP;
 }
 
+const CuePoint* Cues::GetNext(const CuePoint* pCurr) const {
+  if (pCurr == NULL)
+    return NULL;
 
-const CuePoint* Cues::GetNext(const CuePoint* pCurr) const
-{
-    if (pCurr == NULL)
-        return NULL;
-
-    assert(pCurr->GetTimeCode() >= 0);
-    assert(m_cue_points);
-    assert(m_count >= 1);
+  assert(pCurr->GetTimeCode() >= 0);
+  assert(m_cue_points);
+  assert(m_count >= 1);
 
 #if 0
     const size_t count = m_count + m_preload_count;
@@ -2843,386 +2553,347 @@
 
     pNext->Load(m_pSegment->m_pReader);
 #else
-    long index = pCurr->m_index;
-    assert(index < m_count);
+  long index = pCurr->m_index;
+  assert(index < m_count);
 
-    CuePoint* const* const pp = m_cue_points;
-    assert(pp);
-    assert(pp[index] == pCurr);
+  CuePoint* const* const pp = m_cue_points;
+  assert(pp);
+  assert(pp[index] == pCurr);
 
-    ++index;
+  ++index;
 
-    if (index >= m_count)
-        return NULL;
+  if (index >= m_count)
+    return NULL;
 
-    CuePoint* const pNext = pp[index];
-    assert(pNext);
-    assert(pNext->GetTimeCode() >= 0);
+  CuePoint* const pNext = pp[index];
+  assert(pNext);
+  assert(pNext->GetTimeCode() >= 0);
 #endif
 
-    return pNext;
+  return pNext;
 }
 
+const BlockEntry* Cues::GetBlock(const CuePoint* pCP,
+                                 const CuePoint::TrackPosition* pTP) const {
+  if (pCP == NULL)
+    return NULL;
 
-const BlockEntry* Cues::GetBlock(
-    const CuePoint* pCP,
-    const CuePoint::TrackPosition* pTP) const
-{
-    if (pCP == NULL)
-        return NULL;
+  if (pTP == NULL)
+    return NULL;
 
-    if (pTP == NULL)
-        return NULL;
-
-    return m_pSegment->GetBlock(*pCP, *pTP);
+  return m_pSegment->GetBlock(*pCP, *pTP);
 }
 
+const BlockEntry* Segment::GetBlock(const CuePoint& cp,
+                                    const CuePoint::TrackPosition& tp) {
+  Cluster** const ii = m_clusters;
+  Cluster** i = ii;
 
-const BlockEntry* Segment::GetBlock(
-    const CuePoint& cp,
-    const CuePoint::TrackPosition& tp)
-{
-    Cluster** const ii = m_clusters;
-    Cluster** i = ii;
+  const long count = m_clusterCount + m_clusterPreloadCount;
 
-    const long count = m_clusterCount + m_clusterPreloadCount;
+  Cluster** const jj = ii + count;
+  Cluster** j = jj;
 
-    Cluster** const jj = ii + count;
-    Cluster** j = jj;
+  while (i < j) {
+    // INVARIANT:
+    //[ii, i) < pTP->m_pos
+    //[i, j) ?
+    //[j, jj)  > pTP->m_pos
 
-    while (i < j)
-    {
-        //INVARIANT:
-        //[ii, i) < pTP->m_pos
-        //[i, j) ?
-        //[j, jj)  > pTP->m_pos
+    Cluster** const k = i + (j - i) / 2;
+    assert(k < jj);
 
-        Cluster** const k = i + (j - i) / 2;
-        assert(k < jj);
-
-        Cluster* const pCluster = *k;
-        assert(pCluster);
-
-        //const long long pos_ = pCluster->m_pos;
-        //assert(pos_);
-        //const long long pos = pos_ * ((pos_ < 0) ? -1 : 1);
-
-        const long long pos = pCluster->GetPosition();
-        assert(pos >= 0);
-
-        if (pos < tp.m_pos)
-            i = k + 1;
-        else if (pos > tp.m_pos)
-            j = k;
-        else
-            return pCluster->GetEntry(cp, tp);
-    }
-
-    assert(i == j);
-    //assert(Cluster::HasBlockEntries(this, tp.m_pos));
-
-    Cluster* const pCluster = Cluster::Create(this, -1, tp.m_pos); //, -1);
+    Cluster* const pCluster = *k;
     assert(pCluster);
 
-    const ptrdiff_t idx = i - m_clusters;
+    // const long long pos_ = pCluster->m_pos;
+    // assert(pos_);
+    // const long long pos = pos_ * ((pos_ < 0) ? -1 : 1);
 
-    PreloadCluster(pCluster, idx);
-    assert(m_clusters);
-    assert(m_clusterPreloadCount > 0);
-    assert(m_clusters[idx] == pCluster);
+    const long long pos = pCluster->GetPosition();
+    assert(pos >= 0);
 
-    return pCluster->GetEntry(cp, tp);
+    if (pos < tp.m_pos)
+      i = k + 1;
+    else if (pos > tp.m_pos)
+      j = k;
+    else
+      return pCluster->GetEntry(cp, tp);
+  }
+
+  assert(i == j);
+  // assert(Cluster::HasBlockEntries(this, tp.m_pos));
+
+  Cluster* const pCluster = Cluster::Create(this, -1, tp.m_pos);  //, -1);
+  assert(pCluster);
+
+  const ptrdiff_t idx = i - m_clusters;
+
+  PreloadCluster(pCluster, idx);
+  assert(m_clusters);
+  assert(m_clusterPreloadCount > 0);
+  assert(m_clusters[idx] == pCluster);
+
+  return pCluster->GetEntry(cp, tp);
 }
 
+const Cluster* Segment::FindOrPreloadCluster(long long requested_pos) {
+  if (requested_pos < 0)
+    return 0;
 
-const Cluster* Segment::FindOrPreloadCluster(long long requested_pos)
-{
-    if (requested_pos < 0)
-        return 0;
+  Cluster** const ii = m_clusters;
+  Cluster** i = ii;
 
-    Cluster** const ii = m_clusters;
-    Cluster** i = ii;
+  const long count = m_clusterCount + m_clusterPreloadCount;
 
-    const long count = m_clusterCount + m_clusterPreloadCount;
+  Cluster** const jj = ii + count;
+  Cluster** j = jj;
 
-    Cluster** const jj = ii + count;
-    Cluster** j = jj;
+  while (i < j) {
+    // INVARIANT:
+    //[ii, i) < pTP->m_pos
+    //[i, j) ?
+    //[j, jj)  > pTP->m_pos
 
-    while (i < j)
-    {
-        //INVARIANT:
-        //[ii, i) < pTP->m_pos
-        //[i, j) ?
-        //[j, jj)  > pTP->m_pos
+    Cluster** const k = i + (j - i) / 2;
+    assert(k < jj);
 
-        Cluster** const k = i + (j - i) / 2;
-        assert(k < jj);
-
-        Cluster* const pCluster = *k;
-        assert(pCluster);
-
-        //const long long pos_ = pCluster->m_pos;
-        //assert(pos_);
-        //const long long pos = pos_ * ((pos_ < 0) ? -1 : 1);
-
-        const long long pos = pCluster->GetPosition();
-        assert(pos >= 0);
-
-        if (pos < requested_pos)
-            i = k + 1;
-        else if (pos > requested_pos)
-            j = k;
-        else
-            return pCluster;
-    }
-
-    assert(i == j);
-    //assert(Cluster::HasBlockEntries(this, tp.m_pos));
-
-    Cluster* const pCluster = Cluster::Create(
-                                this,
-                                -1,
-                                requested_pos);
-                                //-1);
+    Cluster* const pCluster = *k;
     assert(pCluster);
 
-    const ptrdiff_t idx = i - m_clusters;
+    // const long long pos_ = pCluster->m_pos;
+    // assert(pos_);
+    // const long long pos = pos_ * ((pos_ < 0) ? -1 : 1);
 
-    PreloadCluster(pCluster, idx);
-    assert(m_clusters);
-    assert(m_clusterPreloadCount > 0);
-    assert(m_clusters[idx] == pCluster);
+    const long long pos = pCluster->GetPosition();
+    assert(pos >= 0);
 
-    return pCluster;
+    if (pos < requested_pos)
+      i = k + 1;
+    else if (pos > requested_pos)
+      j = k;
+    else
+      return pCluster;
+  }
+
+  assert(i == j);
+  // assert(Cluster::HasBlockEntries(this, tp.m_pos));
+
+  Cluster* const pCluster = Cluster::Create(this, -1, requested_pos);
+  //-1);
+  assert(pCluster);
+
+  const ptrdiff_t idx = i - m_clusters;
+
+  PreloadCluster(pCluster, idx);
+  assert(m_clusters);
+  assert(m_clusterPreloadCount > 0);
+  assert(m_clusters[idx] == pCluster);
+
+  return pCluster;
 }
 
-
-CuePoint::CuePoint(long idx, long long pos) :
-    m_element_start(0),
-    m_element_size(0),
-    m_index(idx),
-    m_timecode(-1 * pos),
-    m_track_positions(NULL),
-    m_track_positions_count(0)
-{
-    assert(pos > 0);
+CuePoint::CuePoint(long idx, long long pos)
+    : m_element_start(0),
+      m_element_size(0),
+      m_index(idx),
+      m_timecode(-1 * pos),
+      m_track_positions(NULL),
+      m_track_positions_count(0) {
+  assert(pos > 0);
 }
 
+CuePoint::~CuePoint() { delete[] m_track_positions; }
 
-CuePoint::~CuePoint()
-{
-    delete[] m_track_positions;
-}
+void CuePoint::Load(IMkvReader* pReader) {
+  // odbgstream os;
+  // os << "CuePoint::Load(begin): timecode=" << m_timecode << endl;
 
+  if (m_timecode >= 0)  // already loaded
+    return;
 
-void CuePoint::Load(IMkvReader* pReader)
-{
-    //odbgstream os;
-    //os << "CuePoint::Load(begin): timecode=" << m_timecode << endl;
+  assert(m_track_positions == NULL);
+  assert(m_track_positions_count == 0);
 
-    if (m_timecode >= 0)  //already loaded
-        return;
+  long long pos_ = -m_timecode;
+  const long long element_start = pos_;
 
-    assert(m_track_positions == NULL);
-    assert(m_track_positions_count == 0);
+  long long stop;
 
-    long long pos_ = -m_timecode;
-    const long long element_start = pos_;
+  {
+    long len;
 
-    long long stop;
+    const long long id = ReadUInt(pReader, pos_, len);
+    assert(id == 0x3B);  // CuePoint ID
+    if (id != 0x3B)
+      return;
 
-    {
-        long len;
+    pos_ += len;  // consume ID
 
-        const long long id = ReadUInt(pReader, pos_, len);
-        assert(id == 0x3B);  //CuePoint ID
-        if (id != 0x3B)
-            return;
+    const long long size = ReadUInt(pReader, pos_, len);
+    assert(size >= 0);
 
-        pos_ += len;  //consume ID
+    pos_ += len;  // consume Size field
+    // pos_ now points to start of payload
 
-        const long long size = ReadUInt(pReader, pos_, len);
-        assert(size >= 0);
+    stop = pos_ + size;
+  }
 
-        pos_ += len;  //consume Size field
-        //pos_ now points to start of payload
+  const long long element_size = stop - element_start;
 
-        stop = pos_ + size;
+  long long pos = pos_;
+
+  // First count number of track positions
+
+  while (pos < stop) {
+    long len;
+
+    const long long id = ReadUInt(pReader, pos, len);
+    assert(id >= 0);  // TODO
+    assert((pos + len) <= stop);
+
+    pos += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos, len);
+    assert(size >= 0);
+    assert((pos + len) <= stop);
+
+    pos += len;  // consume Size field
+    assert((pos + size) <= stop);
+
+    if (id == 0x33)  // CueTime ID
+      m_timecode = UnserializeUInt(pReader, pos, size);
+
+    else if (id == 0x37)  // CueTrackPosition(s) ID
+      ++m_track_positions_count;
+
+    pos += size;  // consume payload
+    assert(pos <= stop);
+  }
+
+  assert(m_timecode >= 0);
+  assert(m_track_positions_count > 0);
+
+  // os << "CuePoint::Load(cont'd): idpos=" << idpos
+  //   << " timecode=" << m_timecode
+  //   << endl;
+
+  m_track_positions = new TrackPosition[m_track_positions_count];
+
+  // Now parse track positions
+
+  TrackPosition* p = m_track_positions;
+  pos = pos_;
+
+  while (pos < stop) {
+    long len;
+
+    const long long id = ReadUInt(pReader, pos, len);
+    assert(id >= 0);  // TODO
+    assert((pos + len) <= stop);
+
+    pos += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos, len);
+    assert(size >= 0);
+    assert((pos + len) <= stop);
+
+    pos += len;  // consume Size field
+    assert((pos + size) <= stop);
+
+    if (id == 0x37) {  // CueTrackPosition(s) ID
+      TrackPosition& tp = *p++;
+      tp.Parse(pReader, pos, size);
     }
 
-    const long long element_size = stop - element_start;
+    pos += size;  // consume payload
+    assert(pos <= stop);
+  }
 
-    long long pos = pos_;
+  assert(size_t(p - m_track_positions) == m_track_positions_count);
 
-    //First count number of track positions
-
-    while (pos < stop)
-    {
-        long len;
-
-        const long long id = ReadUInt(pReader, pos, len);
-        assert(id >= 0);  //TODO
-        assert((pos + len) <= stop);
-
-        pos += len;  //consume ID
-
-        const long long size = ReadUInt(pReader, pos, len);
-        assert(size >= 0);
-        assert((pos + len) <= stop);
-
-        pos += len;  //consume Size field
-        assert((pos + size) <= stop);
-
-        if (id == 0x33)  //CueTime ID
-            m_timecode = UnserializeUInt(pReader, pos, size);
-
-        else if (id == 0x37) //CueTrackPosition(s) ID
-            ++m_track_positions_count;
-
-        pos += size;  //consume payload
-        assert(pos <= stop);
-    }
-
-    assert(m_timecode >= 0);
-    assert(m_track_positions_count > 0);
-
-    //os << "CuePoint::Load(cont'd): idpos=" << idpos
-    //   << " timecode=" << m_timecode
-    //   << endl;
-
-    m_track_positions = new TrackPosition[m_track_positions_count];
-
-    //Now parse track positions
-
-    TrackPosition* p = m_track_positions;
-    pos = pos_;
-
-    while (pos < stop)
-    {
-        long len;
-
-        const long long id = ReadUInt(pReader, pos, len);
-        assert(id >= 0);  //TODO
-        assert((pos + len) <= stop);
-
-        pos += len;  //consume ID
-
-        const long long size = ReadUInt(pReader, pos, len);
-        assert(size >= 0);
-        assert((pos + len) <= stop);
-
-        pos += len;  //consume Size field
-        assert((pos + size) <= stop);
-
-        if (id == 0x37) //CueTrackPosition(s) ID
-        {
-            TrackPosition& tp = *p++;
-            tp.Parse(pReader, pos, size);
-        }
-
-        pos += size;  //consume payload
-        assert(pos <= stop);
-    }
-
-    assert(size_t(p - m_track_positions) == m_track_positions_count);
-
-    m_element_start = element_start;
-    m_element_size = element_size;
+  m_element_start = element_start;
+  m_element_size = element_size;
 }
 
+void CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
+                                    long long size_) {
+  const long long stop = start_ + size_;
+  long long pos = start_;
 
+  m_track = -1;
+  m_pos = -1;
+  m_block = 1;  // default
 
-void CuePoint::TrackPosition::Parse(
-    IMkvReader* pReader,
-    long long start_,
-    long long size_)
-{
-    const long long stop = start_ + size_;
-    long long pos = start_;
+  while (pos < stop) {
+    long len;
 
-    m_track = -1;
-    m_pos = -1;
-    m_block = 1;  //default
+    const long long id = ReadUInt(pReader, pos, len);
+    assert(id >= 0);  // TODO
+    assert((pos + len) <= stop);
 
-    while (pos < stop)
-    {
-        long len;
+    pos += len;  // consume ID
 
-        const long long id = ReadUInt(pReader, pos, len);
-        assert(id >= 0);  //TODO
-        assert((pos + len) <= stop);
+    const long long size = ReadUInt(pReader, pos, len);
+    assert(size >= 0);
+    assert((pos + len) <= stop);
 
-        pos += len;  //consume ID
+    pos += len;  // consume Size field
+    assert((pos + size) <= stop);
 
-        const long long size = ReadUInt(pReader, pos, len);
-        assert(size >= 0);
-        assert((pos + len) <= stop);
+    if (id == 0x77)  // CueTrack ID
+      m_track = UnserializeUInt(pReader, pos, size);
 
-        pos += len;  //consume Size field
-        assert((pos + size) <= stop);
+    else if (id == 0x71)  // CueClusterPos ID
+      m_pos = UnserializeUInt(pReader, pos, size);
 
-        if (id == 0x77)  //CueTrack ID
-            m_track = UnserializeUInt(pReader, pos, size);
+    else if (id == 0x1378)  // CueBlockNumber
+      m_block = UnserializeUInt(pReader, pos, size);
 
-        else if (id == 0x71)  //CueClusterPos ID
-            m_pos = UnserializeUInt(pReader, pos, size);
+    pos += size;  // consume payload
+    assert(pos <= stop);
+  }
 
-        else if (id == 0x1378)  //CueBlockNumber
-            m_block = UnserializeUInt(pReader, pos, size);
-
-        pos += size;  //consume payload
-        assert(pos <= stop);
-    }
-
-    assert(m_pos >= 0);
-    assert(m_track > 0);
-    //assert(m_block > 0);
+  assert(m_pos >= 0);
+  assert(m_track > 0);
+  // assert(m_block > 0);
 }
 
+const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const {
+  assert(pTrack);
 
-const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const
-{
-    assert(pTrack);
+  const long long n = pTrack->GetNumber();
 
-    const long long n = pTrack->GetNumber();
+  const TrackPosition* i = m_track_positions;
+  const TrackPosition* const j = i + m_track_positions_count;
 
-    const TrackPosition* i = m_track_positions;
-    const TrackPosition* const j = i + m_track_positions_count;
+  while (i != j) {
+    const TrackPosition& p = *i++;
 
-    while (i != j)
-    {
-        const TrackPosition& p = *i++;
+    if (p.m_track == n)
+      return &p;
+  }
 
-        if (p.m_track == n)
-            return &p;
-    }
-
-    return NULL;  //no matching track number found
+  return NULL;  // no matching track number found
 }
 
+long long CuePoint::GetTimeCode() const { return m_timecode; }
 
-long long CuePoint::GetTimeCode() const
-{
-    return m_timecode;
+long long CuePoint::GetTime(const Segment* pSegment) const {
+  assert(pSegment);
+  assert(m_timecode >= 0);
+
+  const SegmentInfo* const pInfo = pSegment->GetInfo();
+  assert(pInfo);
+
+  const long long scale = pInfo->GetTimeCodeScale();
+  assert(scale >= 1);
+
+  const long long time = scale * m_timecode;
+
+  return time;
 }
 
-long long CuePoint::GetTime(const Segment* pSegment) const
-{
-    assert(pSegment);
-    assert(m_timecode >= 0);
-
-    const SegmentInfo* const pInfo = pSegment->GetInfo();
-    assert(pInfo);
-
-    const long long scale = pInfo->GetTimeCodeScale();
-    assert(scale >= 1);
-
-    const long long time = scale * m_timecode;
-
-    return time;
-}
-
-
 #if 0
 long long Segment::Unparsed() const
 {
@@ -3237,808 +2908,745 @@
     return result;
 }
 #else
-bool Segment::DoneParsing() const
-{
-    if (m_size < 0)
-    {
-        long long total, avail;
+bool Segment::DoneParsing() const {
+  if (m_size < 0) {
+    long long total, avail;
 
-        const int status = m_pReader->Length(&total, &avail);
+    const int status = m_pReader->Length(&total, &avail);
 
-        if (status < 0)  //error
-            return true;  //must assume done
+    if (status < 0)  // error
+      return true;  // must assume done
 
-        if (total < 0)
-            return false;  //assume live stream
+    if (total < 0)
+      return false;  // assume live stream
 
-        return (m_pos >= total);
-    }
+    return (m_pos >= total);
+  }
 
-    const long long stop = m_start + m_size;
+  const long long stop = m_start + m_size;
 
-    return (m_pos >= stop);
+  return (m_pos >= stop);
 }
 #endif
 
+const Cluster* Segment::GetFirst() const {
+  if ((m_clusters == NULL) || (m_clusterCount <= 0))
+    return &m_eos;
 
-const Cluster* Segment::GetFirst() const
-{
-    if ((m_clusters == NULL) || (m_clusterCount <= 0))
-       return &m_eos;
+  Cluster* const pCluster = m_clusters[0];
+  assert(pCluster);
 
-    Cluster* const pCluster = m_clusters[0];
-    assert(pCluster);
-
-    return pCluster;
+  return pCluster;
 }
 
+const Cluster* Segment::GetLast() const {
+  if ((m_clusters == NULL) || (m_clusterCount <= 0))
+    return &m_eos;
 
-const Cluster* Segment::GetLast() const
-{
-    if ((m_clusters == NULL) || (m_clusterCount <= 0))
-        return &m_eos;
+  const long idx = m_clusterCount - 1;
 
-    const long idx = m_clusterCount - 1;
+  Cluster* const pCluster = m_clusters[idx];
+  assert(pCluster);
 
-    Cluster* const pCluster = m_clusters[idx];
-    assert(pCluster);
-
-    return pCluster;
+  return pCluster;
 }
 
+unsigned long Segment::GetCount() const { return m_clusterCount; }
 
-unsigned long Segment::GetCount() const
-{
-    return m_clusterCount;
-}
+const Cluster* Segment::GetNext(const Cluster* pCurr) {
+  assert(pCurr);
+  assert(pCurr != &m_eos);
+  assert(m_clusters);
 
+  long idx = pCurr->m_index;
 
-const Cluster* Segment::GetNext(const Cluster* pCurr)
-{
-    assert(pCurr);
-    assert(pCurr != &m_eos);
-    assert(m_clusters);
+  if (idx >= 0) {
+    assert(m_clusterCount > 0);
+    assert(idx < m_clusterCount);
+    assert(pCurr == m_clusters[idx]);
 
-    long idx =  pCurr->m_index;
+    ++idx;
 
-    if (idx >= 0)
-    {
-        assert(m_clusterCount > 0);
-        assert(idx < m_clusterCount);
-        assert(pCurr == m_clusters[idx]);
+    if (idx >= m_clusterCount)
+      return &m_eos;  // caller will LoadCluster as desired
 
-        ++idx;
-
-        if (idx >= m_clusterCount)
-            return &m_eos;  //caller will LoadCluster as desired
-
-        Cluster* const pNext = m_clusters[idx];
-        assert(pNext);
-        assert(pNext->m_index >= 0);
-        assert(pNext->m_index == idx);
-
-        return pNext;
-    }
-
-    assert(m_clusterPreloadCount > 0);
-
-    long long pos = pCurr->m_element_start;
-
-    assert(m_size >= 0);  //TODO
-    const long long stop = m_start + m_size;  //end of segment
-
-    {
-        long len;
-
-        long long result = GetUIntLength(m_pReader, pos, len);
-        assert(result == 0);
-        assert((pos + len) <= stop);  //TODO
-        if (result != 0)
-            return NULL;
-
-        const long long id = ReadUInt(m_pReader, pos, len);
-        assert(id == 0x0F43B675);  //Cluster ID
-        if (id != 0x0F43B675)
-            return NULL;
-
-        pos += len;  //consume ID
-
-        //Read Size
-        result = GetUIntLength(m_pReader, pos, len);
-        assert(result == 0);  //TODO
-        assert((pos + len) <= stop);  //TODO
-
-        const long long size = ReadUInt(m_pReader, pos, len);
-        assert(size > 0);  //TODO
-        //assert((pCurr->m_size <= 0) || (pCurr->m_size == size));
-
-        pos += len;  //consume length of size of element
-        assert((pos + size) <= stop);  //TODO
-
-        //Pos now points to start of payload
-
-        pos += size;  //consume payload
-    }
-
-    long long off_next = 0;
-
-    while (pos < stop)
-    {
-        long len;
-
-        long long result = GetUIntLength(m_pReader, pos, len);
-        assert(result == 0);
-        assert((pos + len) <= stop);  //TODO
-        if (result != 0)
-            return NULL;
-
-        const long long idpos = pos;  //pos of next (potential) cluster
-
-        const long long id = ReadUInt(m_pReader, idpos, len);
-        assert(id > 0);  //TODO
-
-        pos += len;  //consume ID
-
-        //Read Size
-        result = GetUIntLength(m_pReader, pos, len);
-        assert(result == 0);  //TODO
-        assert((pos + len) <= stop);  //TODO
-
-        const long long size = ReadUInt(m_pReader, pos, len);
-        assert(size >= 0);  //TODO
-
-        pos += len;  //consume length of size of element
-        assert((pos + size) <= stop);  //TODO
-
-        //Pos now points to start of payload
-
-        if (size == 0)  //weird
-            continue;
-
-        if (id == 0x0F43B675)  //Cluster ID
-        {
-            const long long off_next_ = idpos - m_start;
-
-            long long pos_;
-            long len_;
-
-            const long status = Cluster::HasBlockEntries(
-                                    this,
-                                    off_next_,
-                                    pos_,
-                                    len_);
-
-            assert(status >= 0);
-
-            if (status > 0)
-            {
-                off_next = off_next_;
-                break;
-            }
-        }
-
-        pos += size;  //consume payload
-    }
-
-    if (off_next <= 0)
-        return 0;
-
-    Cluster** const ii = m_clusters + m_clusterCount;
-    Cluster** i = ii;
-
-    Cluster** const jj = ii + m_clusterPreloadCount;
-    Cluster** j = jj;
-
-    while (i < j)
-    {
-        //INVARIANT:
-        //[0, i) < pos_next
-        //[i, j) ?
-        //[j, jj)  > pos_next
-
-        Cluster** const k = i + (j - i) / 2;
-        assert(k < jj);
-
-        Cluster* const pNext = *k;
-        assert(pNext);
-        assert(pNext->m_index < 0);
-
-        //const long long pos_ = pNext->m_pos;
-        //assert(pos_);
-        //pos = pos_ * ((pos_ < 0) ? -1 : 1);
-
-        pos = pNext->GetPosition();
-
-        if (pos < off_next)
-            i = k + 1;
-        else if (pos > off_next)
-            j = k;
-        else
-            return pNext;
-    }
-
-    assert(i == j);
-
-    Cluster* const pNext = Cluster::Create(this,
-                                          -1,
-                                          off_next);
+    Cluster* const pNext = m_clusters[idx];
     assert(pNext);
-
-    const ptrdiff_t idx_next = i - m_clusters;  //insertion position
-
-    PreloadCluster(pNext, idx_next);
-    assert(m_clusters);
-    assert(idx_next < m_clusterSize);
-    assert(m_clusters[idx_next] == pNext);
+    assert(pNext->m_index >= 0);
+    assert(pNext->m_index == idx);
 
     return pNext;
-}
+  }
 
+  assert(m_clusterPreloadCount > 0);
 
-long Segment::ParseNext(
-    const Cluster* pCurr,
-    const Cluster*& pResult,
-    long long& pos,
-    long& len)
-{
-    assert(pCurr);
-    assert(!pCurr->EOS());
-    assert(m_clusters);
+  long long pos = pCurr->m_element_start;
 
-    pResult = 0;
+  assert(m_size >= 0);  // TODO
+  const long long stop = m_start + m_size;  // end of segment
 
-    if (pCurr->m_index >= 0)  //loaded (not merely preloaded)
-    {
-        assert(m_clusters[pCurr->m_index] == pCurr);
+  {
+    long len;
 
-        const long next_idx = pCurr->m_index + 1;
+    long long result = GetUIntLength(m_pReader, pos, len);
+    assert(result == 0);
+    assert((pos + len) <= stop);  // TODO
+    if (result != 0)
+      return NULL;
 
-        if (next_idx < m_clusterCount)
-        {
-            pResult = m_clusters[next_idx];
-            return 0;  //success
-        }
+    const long long id = ReadUInt(m_pReader, pos, len);
+    assert(id == 0x0F43B675);  // Cluster ID
+    if (id != 0x0F43B675)
+      return NULL;
 
-        //curr cluster is last among loaded
+    pos += len;  // consume ID
 
-        const long result = LoadCluster(pos, len);
+    // Read Size
+    result = GetUIntLength(m_pReader, pos, len);
+    assert(result == 0);  // TODO
+    assert((pos + len) <= stop);  // TODO
 
-        if (result < 0)  //error or underflow
-            return result;
+    const long long size = ReadUInt(m_pReader, pos, len);
+    assert(size > 0);  // TODO
+    // assert((pCurr->m_size <= 0) || (pCurr->m_size == size));
 
-        if (result > 0)  //no more clusters
-        {
-            //pResult = &m_eos;
-            return 1;
-        }
+    pos += len;  // consume length of size of element
+    assert((pos + size) <= stop);  // TODO
 
-        pResult = GetLast();
-        return 0;  //success
+    // Pos now points to start of payload
+
+    pos += size;  // consume payload
+  }
+
+  long long off_next = 0;
+
+  while (pos < stop) {
+    long len;
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+    assert(result == 0);
+    assert((pos + len) <= stop);  // TODO
+    if (result != 0)
+      return NULL;
+
+    const long long idpos = pos;  // pos of next (potential) cluster
+
+    const long long id = ReadUInt(m_pReader, idpos, len);
+    assert(id > 0);  // TODO
+
+    pos += len;  // consume ID
+
+    // Read Size
+    result = GetUIntLength(m_pReader, pos, len);
+    assert(result == 0);  // TODO
+    assert((pos + len) <= stop);  // TODO
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+    assert(size >= 0);  // TODO
+
+    pos += len;  // consume length of size of element
+    assert((pos + size) <= stop);  // TODO
+
+    // Pos now points to start of payload
+
+    if (size == 0)  // weird
+      continue;
+
+    if (id == 0x0F43B675) {  // Cluster ID
+      const long long off_next_ = idpos - m_start;
+
+      long long pos_;
+      long len_;
+
+      const long status = Cluster::HasBlockEntries(this, off_next_, pos_, len_);
+
+      assert(status >= 0);
+
+      if (status > 0) {
+        off_next = off_next_;
+        break;
+      }
     }
 
-    assert(m_pos > 0);
+    pos += size;  // consume payload
+  }
 
-    long long total, avail;
+  if (off_next <= 0)
+    return 0;
 
-    long status = m_pReader->Length(&total, &avail);
+  Cluster** const ii = m_clusters + m_clusterCount;
+  Cluster** i = ii;
 
-    if (status < 0)  //error
-        return status;
+  Cluster** const jj = ii + m_clusterPreloadCount;
+  Cluster** j = jj;
 
-    assert((total < 0) || (avail <= total));
+  while (i < j) {
+    // INVARIANT:
+    //[0, i) < pos_next
+    //[i, j) ?
+    //[j, jj)  > pos_next
 
-    const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+    Cluster** const k = i + (j - i) / 2;
+    assert(k < jj);
 
-    //interrogate curr cluster
+    Cluster* const pNext = *k;
+    assert(pNext);
+    assert(pNext->m_index < 0);
 
-    pos = pCurr->m_element_start;
+    // const long long pos_ = pNext->m_pos;
+    // assert(pos_);
+    // pos = pos_ * ((pos_ < 0) ? -1 : 1);
 
-    if (pCurr->m_element_size >= 0)
-        pos += pCurr->m_element_size;
+    pos = pNext->GetPosition();
+
+    if (pos < off_next)
+      i = k + 1;
+    else if (pos > off_next)
+      j = k;
     else
-    {
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
+      return pNext;
+  }
 
-        long long result = GetUIntLength(m_pReader, pos, len);
+  assert(i == j);
 
-        if (result < 0)  //error
-            return static_cast<long>(result);
+  Cluster* const pNext = Cluster::Create(this, -1, off_next);
+  assert(pNext);
 
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
+  const ptrdiff_t idx_next = i - m_clusters;  // insertion position
 
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
+  PreloadCluster(pNext, idx_next);
+  assert(m_clusters);
+  assert(idx_next < m_clusterSize);
+  assert(m_clusters[idx_next] == pNext);
 
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long id = ReadUInt(m_pReader, pos, len);
-
-        if (id != 0x0F43B675)  //weird: not Cluster ID
-            return -1;
-
-        pos += len;  //consume ID
-
-        //Read Size
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        result = GetUIntLength(m_pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
-
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long size = ReadUInt(m_pReader, pos, len);
-
-        if (size < 0) //error
-            return static_cast<long>(size);
-
-        pos += len;  //consume size field
-
-        const long long unknown_size = (1LL << (7 * len)) - 1;
-
-        if (size == unknown_size)          //TODO: should never happen
-            return E_FILE_FORMAT_INVALID;  //TODO: resolve this
-
-        //assert((pCurr->m_size <= 0) || (pCurr->m_size == size));
-
-        if ((segment_stop >= 0) && ((pos + size) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        //Pos now points to start of payload
-
-        pos += size;  //consume payload (that is, the current cluster)
-        assert((segment_stop < 0) || (pos <= segment_stop));
-
-        //By consuming the payload, we are assuming that the curr
-        //cluster isn't interesting.  That is, we don't bother checking
-        //whether the payload of the curr cluster is less than what
-        //happens to be available (obtained via IMkvReader::Length).
-        //Presumably the caller has already dispensed with the current
-        //cluster, and really does want the next cluster.
-    }
-
-    //pos now points to just beyond the last fully-loaded cluster
-
-    for (;;)
-    {
-        const long status = DoParseNext(pResult, pos, len);
-
-        if (status <= 1)
-            return status;
-    }
+  return pNext;
 }
 
+long Segment::ParseNext(const Cluster* pCurr, const Cluster*& pResult,
+                        long long& pos, long& len) {
+  assert(pCurr);
+  assert(!pCurr->EOS());
+  assert(m_clusters);
 
-long Segment::DoParseNext(
-    const Cluster*& pResult,
-    long long& pos,
-    long& len)
-{
-    long long total, avail;
+  pResult = 0;
 
-    long status = m_pReader->Length(&total, &avail);
+  if (pCurr->m_index >= 0) {  // loaded (not merely preloaded)
+    assert(m_clusters[pCurr->m_index] == pCurr);
 
-    if (status < 0)  //error
-        return status;
+    const long next_idx = pCurr->m_index + 1;
 
-    assert((total < 0) || (avail <= total));
+    if (next_idx < m_clusterCount) {
+      pResult = m_clusters[next_idx];
+      return 0;  // success
+    }
 
-    const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+    // curr cluster is last among loaded
 
-    //Parse next cluster.  This is strictly a parsing activity.
-    //Creation of a new cluster object happens later, after the
-    //parsing is done.
+    const long result = LoadCluster(pos, len);
 
-    long long off_next = 0;
-    long long cluster_size = -1;
+    if (result < 0)  // error or underflow
+      return result;
 
-    for (;;)
+    if (result > 0)  // no more clusters
     {
-        if ((total >= 0) && (pos >= total))
-            return 1;  //EOF
+      // pResult = &m_eos;
+      return 1;
+    }
 
-        if ((segment_stop >= 0) && (pos >= segment_stop))
-            return 1;  //EOF
+    pResult = GetLast();
+    return 0;  // success
+  }
 
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
+  assert(m_pos > 0);
 
-        long long result = GetUIntLength(m_pReader, pos, len);
+  long long total, avail;
 
-        if (result < 0)  //error
-            return static_cast<long>(result);
+  long status = m_pReader->Length(&total, &avail);
 
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
+  if (status < 0)  // error
+    return status;
 
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
+  assert((total < 0) || (avail <= total));
 
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
 
-        const long long idpos = pos;             //absolute
-        const long long idoff = pos - m_start;   //relative
+  // interrogate curr cluster
 
-        const long long id = ReadUInt(m_pReader, idpos, len);  //absolute
+  pos = pCurr->m_element_start;
 
-        if (id < 0)  //error
-            return static_cast<long>(id);
+  if (pCurr->m_element_size >= 0)
+    pos += pCurr->m_element_size;
+  else {
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
 
-        if (id == 0)  //weird
-            return -1;  //generic error
+    long long result = GetUIntLength(m_pReader, pos, len);
 
-        pos += len;  //consume ID
+    if (result < 0)  // error
+      return static_cast<long>(result);
 
-        //Read Size
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
 
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
 
-        result = GetUIntLength(m_pReader, pos, len);
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
 
-        if (result < 0)  //error
-            return static_cast<long>(result);
+    const long long id = ReadUInt(m_pReader, pos, len);
 
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
+    if (id != 0x0F43B675)  // weird: not Cluster ID
+      return -1;
 
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
+    pos += len;  // consume ID
 
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
+    // Read Size
 
-        const long long size = ReadUInt(m_pReader, pos, len);
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
 
-        if (size < 0)  //error
-            return static_cast<long>(size);
+    result = GetUIntLength(m_pReader, pos, len);
 
-        pos += len;  //consume length of size of element
+    if (result < 0)  // error
+      return static_cast<long>(result);
 
-        //Pos now points to start of payload
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
 
-        if (size == 0)  //weird
-            continue;
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
 
-        const long long unknown_size = (1LL << (7 * len)) - 1;
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
 
-        if ((segment_stop >= 0) &&
-            (size != unknown_size) &&
-            ((pos + size) > segment_stop))
-        {
-            return E_FILE_FORMAT_INVALID;
-        }
+    const long long size = ReadUInt(m_pReader, pos, len);
 
-        if (id == 0x0C53BB6B)  //Cues ID
-        {
-            if (size == unknown_size)
-                return E_FILE_FORMAT_INVALID;
+    if (size < 0)  // error
+      return static_cast<long>(size);
 
-            const long long element_stop = pos + size;
+    pos += len;  // consume size field
 
-            if ((segment_stop >= 0) && (element_stop > segment_stop))
-                return E_FILE_FORMAT_INVALID;
+    const long long unknown_size = (1LL << (7 * len)) - 1;
 
-            const long long element_start = idpos;
-            const long long element_size = element_stop - element_start;
+    if (size == unknown_size)  // TODO: should never happen
+      return E_FILE_FORMAT_INVALID;  // TODO: resolve this
 
-            if (m_pCues == NULL)
-            {
-                m_pCues = new Cues(this,
-                                    pos,
-                                    size,
-                                    element_start,
-                                    element_size);
-                assert(m_pCues);  //TODO
-            }
+    // assert((pCurr->m_size <= 0) || (pCurr->m_size == size));
 
-            pos += size;  //consume payload
-            assert((segment_stop < 0) || (pos <= segment_stop));
+    if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
 
-            continue;
-        }
+    // Pos now points to start of payload
 
-        if (id != 0x0F43B675)  //not a Cluster ID
-        {
-            if (size == unknown_size)
-                return E_FILE_FORMAT_INVALID;
+    pos += size;  // consume payload (that is, the current cluster)
+    assert((segment_stop < 0) || (pos <= segment_stop));
 
-            pos += size;  //consume payload
-            assert((segment_stop < 0) || (pos <= segment_stop));
+    // By consuming the payload, we are assuming that the curr
+    // cluster isn't interesting.  That is, we don't bother checking
+    // whether the payload of the curr cluster is less than what
+    // happens to be available (obtained via IMkvReader::Length).
+    // Presumably the caller has already dispensed with the current
+    // cluster, and really does want the next cluster.
+  }
 
-            continue;
-        }
+  // pos now points to just beyond the last fully-loaded cluster
 
-#if 0 //this is commented-out to support incremental cluster parsing
+  for (;;) {
+    const long status = DoParseNext(pResult, pos, len);
+
+    if (status <= 1)
+      return status;
+  }
+}
+
+long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  // Parse next cluster.  This is strictly a parsing activity.
+  // Creation of a new cluster object happens later, after the
+  // parsing is done.
+
+  long long off_next = 0;
+  long long cluster_size = -1;
+
+  for (;;) {
+    if ((total >= 0) && (pos >= total))
+      return 1;  // EOF
+
+    if ((segment_stop >= 0) && (pos >= segment_stop))
+      return 1;  // EOF
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long idpos = pos;  // absolute
+    const long long idoff = pos - m_start;  // relative
+
+    const long long id = ReadUInt(m_pReader, idpos, len);  // absolute
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id == 0)  // weird
+      return -1;  // generic error
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume length of size of element
+
+    // Pos now points to start of payload
+
+    if (size == 0)  // weird
+      continue;
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if ((segment_stop >= 0) && (size != unknown_size) &&
+        ((pos + size) > segment_stop)) {
+      return E_FILE_FORMAT_INVALID;
+    }
+
+    if (id == 0x0C53BB6B) {  // Cues ID
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      const long long element_stop = pos + size;
+
+      if ((segment_stop >= 0) && (element_stop > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      const long long element_start = idpos;
+      const long long element_size = element_stop - element_start;
+
+      if (m_pCues == NULL) {
+        m_pCues = new Cues(this, pos, size, element_start, element_size);
+        assert(m_pCues);  // TODO
+      }
+
+      pos += size;  // consume payload
+      assert((segment_stop < 0) || (pos <= segment_stop));
+
+      continue;
+    }
+
+    if (id != 0x0F43B675) {  // not a Cluster ID
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      pos += size;  // consume payload
+      assert((segment_stop < 0) || (pos <= segment_stop));
+
+      continue;
+    }
+
+#if 0  // this is commented-out to support incremental cluster parsing
         len = static_cast<long>(size);
 
         if (element_stop > avail)
             return E_BUFFER_NOT_FULL;
 #endif
 
-        //We have a cluster.
+    // We have a cluster.
 
-        off_next = idoff;
+    off_next = idoff;
 
-        if (size != unknown_size)
-            cluster_size = size;
+    if (size != unknown_size)
+      cluster_size = size;
 
+    break;
+  }
+
+  assert(off_next > 0);  // have cluster
+
+  // We have parsed the next cluster.
+  // We have not created a cluster object yet.  What we need
+  // to do now is determine whether it has already be preloaded
+  //(in which case, an object for this cluster has already been
+  // created), and if not, create a new cluster object.
+
+  Cluster** const ii = m_clusters + m_clusterCount;
+  Cluster** i = ii;
+
+  Cluster** const jj = ii + m_clusterPreloadCount;
+  Cluster** j = jj;
+
+  while (i < j) {
+    // INVARIANT:
+    //[0, i) < pos_next
+    //[i, j) ?
+    //[j, jj)  > pos_next
+
+    Cluster** const k = i + (j - i) / 2;
+    assert(k < jj);
+
+    const Cluster* const pNext = *k;
+    assert(pNext);
+    assert(pNext->m_index < 0);
+
+    pos = pNext->GetPosition();
+    assert(pos >= 0);
+
+    if (pos < off_next)
+      i = k + 1;
+    else if (pos > off_next)
+      j = k;
+    else {
+      pResult = pNext;
+      return 0;  // success
+    }
+  }
+
+  assert(i == j);
+
+  long long pos_;
+  long len_;
+
+  status = Cluster::HasBlockEntries(this, off_next, pos_, len_);
+
+  if (status < 0) {  // error or underflow
+    pos = pos_;
+    len = len_;
+
+    return status;
+  }
+
+  if (status > 0) {  // means "found at least one block entry"
+    Cluster* const pNext = Cluster::Create(this,
+                                           -1,  // preloaded
+                                           off_next);
+    // element_size);
+    assert(pNext);
+
+    const ptrdiff_t idx_next = i - m_clusters;  // insertion position
+
+    PreloadCluster(pNext, idx_next);
+    assert(m_clusters);
+    assert(idx_next < m_clusterSize);
+    assert(m_clusters[idx_next] == pNext);
+
+    pResult = pNext;
+    return 0;  // success
+  }
+
+  // status == 0 means "no block entries found"
+
+  if (cluster_size < 0) {  // unknown size
+    const long long payload_pos = pos;  // absolute pos of cluster payload
+
+    for (;;) {  // determine cluster size
+      if ((total >= 0) && (pos >= total))
         break;
-    }
 
-    assert(off_next > 0);  //have cluster
+      if ((segment_stop >= 0) && (pos >= segment_stop))
+        break;  // no more clusters
 
-    //We have parsed the next cluster.
-    //We have not created a cluster object yet.  What we need
-    //to do now is determine whether it has already be preloaded
-    //(in which case, an object for this cluster has already been
-    //created), and if not, create a new cluster object.
+      // Read ID
 
-    Cluster** const ii = m_clusters + m_clusterCount;
-    Cluster** i = ii;
+      if ((pos + 1) > avail) {
+        len = 1;
+        return E_BUFFER_NOT_FULL;
+      }
 
-    Cluster** const jj = ii + m_clusterPreloadCount;
-    Cluster** j = jj;
+      long long result = GetUIntLength(m_pReader, pos, len);
 
-    while (i < j)
-    {
-        //INVARIANT:
-        //[0, i) < pos_next
-        //[i, j) ?
-        //[j, jj)  > pos_next
+      if (result < 0)  // error
+        return static_cast<long>(result);
 
-        Cluster** const k = i + (j - i) / 2;
-        assert(k < jj);
+      if (result > 0)  // weird
+        return E_BUFFER_NOT_FULL;
 
-        const Cluster* const pNext = *k;
-        assert(pNext);
-        assert(pNext->m_index < 0);
+      if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+        return E_FILE_FORMAT_INVALID;
 
-        pos = pNext->GetPosition();
-        assert(pos >= 0);
+      if ((pos + len) > avail)
+        return E_BUFFER_NOT_FULL;
 
-        if (pos < off_next)
-            i = k + 1;
-        else if (pos > off_next)
-            j = k;
-        else
-        {
-            pResult = pNext;
-            return 0;  //success
-        }
-    }
+      const long long idpos = pos;
+      const long long id = ReadUInt(m_pReader, idpos, len);
 
-    assert(i == j);
+      if (id < 0)  // error (or underflow)
+        return static_cast<long>(id);
 
-    long long pos_;
-    long len_;
+      // This is the distinguished set of ID's we use to determine
+      // that we have exhausted the sub-element's inside the cluster
+      // whose ID we parsed earlier.
 
-    status = Cluster::HasBlockEntries(this, off_next, pos_, len_);
+      if (id == 0x0F43B675)  // Cluster ID
+        break;
 
-    if (status < 0)  //error or underflow
-    {
-        pos = pos_;
-        len = len_;
+      if (id == 0x0C53BB6B)  // Cues ID
+        break;
 
-        return status;
-    }
+      pos += len;  // consume ID (of sub-element)
 
-    if (status > 0)  //means "found at least one block entry"
-    {
-        Cluster* const pNext = Cluster::Create(this,
-                                                -1,   //preloaded
-                                                off_next);
-                                                //element_size);
-        assert(pNext);
+      // Read Size
 
-        const ptrdiff_t idx_next = i - m_clusters;  //insertion position
+      if ((pos + 1) > avail) {
+        len = 1;
+        return E_BUFFER_NOT_FULL;
+      }
 
-        PreloadCluster(pNext, idx_next);
-        assert(m_clusters);
-        assert(idx_next < m_clusterSize);
-        assert(m_clusters[idx_next] == pNext);
+      result = GetUIntLength(m_pReader, pos, len);
 
-        pResult = pNext;
-        return 0;  //success
-    }
+      if (result < 0)  // error
+        return static_cast<long>(result);
 
-    //status == 0 means "no block entries found"
+      if (result > 0)  // weird
+        return E_BUFFER_NOT_FULL;
 
-    if (cluster_size < 0)  //unknown size
-    {
-        const long long payload_pos = pos;  //absolute pos of cluster payload
+      if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+        return E_FILE_FORMAT_INVALID;
 
-        for (;;)  //determine cluster size
-        {
-            if ((total >= 0) && (pos >= total))
-                break;
+      if ((pos + len) > avail)
+        return E_BUFFER_NOT_FULL;
 
-            if ((segment_stop >= 0) && (pos >= segment_stop))
-                break;  //no more clusters
+      const long long size = ReadUInt(m_pReader, pos, len);
 
-            //Read ID
+      if (size < 0)  // error
+        return static_cast<long>(size);
 
-            if ((pos + 1) > avail)
-            {
-                len = 1;
-                return E_BUFFER_NOT_FULL;
-            }
+      pos += len;  // consume size field of element
 
-            long long result = GetUIntLength(m_pReader, pos, len);
+      // pos now points to start of sub-element's payload
 
-            if (result < 0)  //error
-                return static_cast<long>(result);
+      if (size == 0)  // weird
+        continue;
 
-            if (result > 0)  //weird
-                return E_BUFFER_NOT_FULL;
+      const long long unknown_size = (1LL << (7 * len)) - 1;
 
-            if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-                return E_FILE_FORMAT_INVALID;
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;  // not allowed for sub-elements
 
-            if ((pos + len) > avail)
-                return E_BUFFER_NOT_FULL;
+      if ((segment_stop >= 0) && ((pos + size) > segment_stop))  // weird
+        return E_FILE_FORMAT_INVALID;
 
-            const long long idpos = pos;
-            const long long id = ReadUInt(m_pReader, idpos, len);
+      pos += size;  // consume payload of sub-element
+      assert((segment_stop < 0) || (pos <= segment_stop));
+    }  // determine cluster size
 
-            if (id < 0)  //error (or underflow)
-                return static_cast<long>(id);
+    cluster_size = pos - payload_pos;
+    assert(cluster_size >= 0);  // TODO: handle cluster_size = 0
 
-            //This is the distinguished set of ID's we use to determine
-            //that we have exhausted the sub-element's inside the cluster
-            //whose ID we parsed earlier.
+    pos = payload_pos;  // reset and re-parse original cluster
+  }
 
-            if (id == 0x0F43B675)  //Cluster ID
-                break;
+  pos += cluster_size;  // consume payload
+  assert((segment_stop < 0) || (pos <= segment_stop));
 
-            if (id == 0x0C53BB6B)  //Cues ID
-                break;
-
-            pos += len;  //consume ID (of sub-element)
-
-            //Read Size
-
-            if ((pos + 1) > avail)
-            {
-                len = 1;
-                return E_BUFFER_NOT_FULL;
-            }
-
-            result = GetUIntLength(m_pReader, pos, len);
-
-            if (result < 0)  //error
-                return static_cast<long>(result);
-
-            if (result > 0)  //weird
-                return E_BUFFER_NOT_FULL;
-
-            if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-                return E_FILE_FORMAT_INVALID;
-
-            if ((pos + len) > avail)
-                return E_BUFFER_NOT_FULL;
-
-            const long long size = ReadUInt(m_pReader, pos, len);
-
-            if (size < 0)  //error
-                return static_cast<long>(size);
-
-            pos += len;  //consume size field of element
-
-            //pos now points to start of sub-element's payload
-
-            if (size == 0)  //weird
-                continue;
-
-            const long long unknown_size = (1LL << (7 * len)) - 1;
-
-            if (size == unknown_size)
-                return E_FILE_FORMAT_INVALID;  //not allowed for sub-elements
-
-            if ((segment_stop >= 0) && ((pos + size) > segment_stop))  //weird
-                return E_FILE_FORMAT_INVALID;
-
-            pos += size;  //consume payload of sub-element
-            assert((segment_stop < 0) || (pos <= segment_stop));
-        }  //determine cluster size
-
-        cluster_size = pos - payload_pos;
-        assert(cluster_size >= 0);  //TODO: handle cluster_size = 0
-
-        pos = payload_pos;  //reset and re-parse original cluster
-    }
-
-    pos += cluster_size;  //consume payload
-    assert((segment_stop < 0) || (pos <= segment_stop));
-
-    return 2;             //try to find a cluster that follows next
+  return 2;  // try to find a cluster that follows next
 }
 
+const Cluster* Segment::FindCluster(long long time_ns) const {
+  if ((m_clusters == NULL) || (m_clusterCount <= 0))
+    return &m_eos;
 
-const Cluster* Segment::FindCluster(long long time_ns) const
-{
-    if ((m_clusters == NULL) || (m_clusterCount <= 0))
-        return &m_eos;
+  {
+    Cluster* const pCluster = m_clusters[0];
+    assert(pCluster);
+    assert(pCluster->m_index == 0);
 
-    {
-        Cluster* const pCluster = m_clusters[0];
-        assert(pCluster);
-        assert(pCluster->m_index == 0);
+    if (time_ns <= pCluster->GetTime())
+      return pCluster;
+  }
 
-        if (time_ns <= pCluster->GetTime())
-            return pCluster;
-    }
+  // Binary search of cluster array
 
-    //Binary search of cluster array
+  long i = 0;
+  long j = m_clusterCount;
 
-    long i = 0;
-    long j = m_clusterCount;
+  while (i < j) {
+    // INVARIANT:
+    //[0, i) <= time_ns
+    //[i, j) ?
+    //[j, m_clusterCount)  > time_ns
 
-    while (i < j)
-    {
-        //INVARIANT:
-        //[0, i) <= time_ns
-        //[i, j) ?
-        //[j, m_clusterCount)  > time_ns
-
-        const long k = i + (j - i) / 2;
-        assert(k < m_clusterCount);
-
-        Cluster* const pCluster = m_clusters[k];
-        assert(pCluster);
-        assert(pCluster->m_index == k);
-
-        const long long t = pCluster->GetTime();
-
-        if (t <= time_ns)
-            i = k + 1;
-        else
-            j = k;
-
-        assert(i <= j);
-    }
-
-    assert(i == j);
-    assert(i > 0);
-    assert(i <= m_clusterCount);
-
-    const long k = i - 1;
+    const long k = i + (j - i) / 2;
+    assert(k < m_clusterCount);
 
     Cluster* const pCluster = m_clusters[k];
     assert(pCluster);
     assert(pCluster->m_index == k);
-    assert(pCluster->GetTime() <= time_ns);
 
-    return pCluster;
+    const long long t = pCluster->GetTime();
+
+    if (t <= time_ns)
+      i = k + 1;
+    else
+      j = k;
+
+    assert(i <= j);
+  }
+
+  assert(i == j);
+  assert(i > 0);
+  assert(i <= m_clusterCount);
+
+  const long k = i - 1;
+
+  Cluster* const pCluster = m_clusters[k];
+  assert(pCluster);
+  assert(pCluster->m_index == k);
+  assert(pCluster->GetTime() <= time_ns);
+
+  return pCluster;
 }
 
-
 #if 0
 const BlockEntry* Segment::Seek(
     long long time_ns,
@@ -4064,8 +3672,7 @@
 
     Cluster** const j = i + m_clusterCount;
 
-    if (pTrack->GetType() == 2)  //audio
-    {
+    if (pTrack->GetType() == 2) {  //audio
         //TODO: we could decide to use cues for this, as we do for video.
         //But we only use it for video because looking around for a keyframe
         //can get expensive.  Audio doesn't require anything special so a
@@ -4184,7 +3791,6 @@
 }
 #endif
 
-
 #if 0
 bool Segment::SearchCues(
     long long time_ns,
@@ -4215,844 +3821,592 @@
 }
 #endif
 
+const Tracks* Segment::GetTracks() const { return m_pTracks; }
 
-const Tracks* Segment::GetTracks() const
-{
-    return m_pTracks;
+const SegmentInfo* Segment::GetInfo() const { return m_pInfo; }
+
+const Cues* Segment::GetCues() const { return m_pCues; }
+
+const Chapters* Segment::GetChapters() const { return m_pChapters; }
+
+const SeekHead* Segment::GetSeekHead() const { return m_pSeekHead; }
+
+long long Segment::GetDuration() const {
+  assert(m_pInfo);
+  return m_pInfo->GetDuration();
 }
 
+Chapters::Chapters(Segment* pSegment, long long payload_start,
+                   long long payload_size, long long element_start,
+                   long long element_size)
+    : m_pSegment(pSegment),
+      m_start(payload_start),
+      m_size(payload_size),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_editions(NULL),
+      m_editions_size(0),
+      m_editions_count(0) {}
 
-const SegmentInfo* Segment::GetInfo() const
-{
-    return m_pInfo;
+Chapters::~Chapters() {
+  while (m_editions_count > 0) {
+    Edition& e = m_editions[--m_editions_count];
+    e.Clear();
+  }
 }
 
+long Chapters::Parse() {
+  IMkvReader* const pReader = m_pSegment->m_pReader;
 
-const Cues* Segment::GetCues() const
-{
-    return m_pCues;
-}
+  long long pos = m_start;  // payload start
+  const long long stop = pos + m_size;  // payload stop
 
+  while (pos < stop) {
+    long long id, size;
 
-const Chapters* Segment::GetChapters() const
-{
-  return m_pChapters;
-}
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
 
+    if (status < 0)  // error
+      return status;
 
-const SeekHead* Segment::GetSeekHead() const
-{
-    return m_pSeekHead;
-}
+    if (size == 0)  // weird
+      continue;
 
+    if (id == 0x05B9) {  // EditionEntry ID
+      status = ParseEdition(pos, size);
 
-long long Segment::GetDuration() const
-{
-    assert(m_pInfo);
-    return m_pInfo->GetDuration();
-}
-
-
-Chapters::Chapters(
-    Segment* pSegment,
-    long long payload_start,
-    long long payload_size,
-    long long element_start,
-    long long element_size) :
-    m_pSegment(pSegment),
-    m_start(payload_start),
-    m_size(payload_size),
-    m_element_start(element_start),
-    m_element_size(element_size),
-    m_editions(NULL),
-    m_editions_size(0),
-    m_editions_count(0)
-{
-}
-
-
-Chapters::~Chapters()
-{
-    while (m_editions_count > 0)
-    {
-        Edition& e = m_editions[--m_editions_count];
-        e.Clear();
-    }
-}
-
-
-long Chapters::Parse()
-{
-    IMkvReader* const pReader = m_pSegment->m_pReader;
-
-    long long pos = m_start;  // payload start
-    const long long stop = pos + m_size;  // payload stop
-
-    while (pos < stop)
-    {
-        long long id, size;
-
-        long status = ParseElementHeader(
-                        pReader,
-                        pos,
-                        stop,
-                        id,
-                        size);
-
-        if (status < 0)  // error
-            return status;
-
-        if (size == 0)  // weird
-            continue;
-
-        if (id == 0x05B9)  // EditionEntry ID
-        {
-            status = ParseEdition(pos, size);
-
-            if (status < 0)  // error
-                return status;
-        }
-
-        pos += size;
-        assert(pos <= stop);
+      if (status < 0)  // error
+        return status;
     }
 
-    assert(pos == stop);
-    return 0;
+    pos += size;
+    assert(pos <= stop);
+  }
+
+  assert(pos == stop);
+  return 0;
 }
 
+int Chapters::GetEditionCount() const { return m_editions_count; }
 
-int Chapters::GetEditionCount() const
-{
-    return m_editions_count;
+const Chapters::Edition* Chapters::GetEdition(int idx) const {
+  if (idx < 0)
+    return NULL;
+
+  if (idx >= m_editions_count)
+    return NULL;
+
+  return m_editions + idx;
 }
 
+bool Chapters::ExpandEditionsArray() {
+  if (m_editions_size > m_editions_count)
+    return true;  // nothing else to do
 
-const Chapters::Edition* Chapters::GetEdition(int idx) const
-{
-    if (idx < 0)
-        return NULL;
+  const int size = (m_editions_size == 0) ? 1 : 2 * m_editions_size;
 
-    if (idx >= m_editions_count)
-        return NULL;
+  Edition* const editions = new (std::nothrow) Edition[size];
 
-    return m_editions + idx;
+  if (editions == NULL)
+    return false;
+
+  for (int idx = 0; idx < m_editions_count; ++idx) {
+    m_editions[idx].ShallowCopy(editions[idx]);
+  }
+
+  delete[] m_editions;
+  m_editions = editions;
+
+  m_editions_size = size;
+  return true;
 }
 
+long Chapters::ParseEdition(long long pos, long long size) {
+  if (!ExpandEditionsArray())
+    return -1;
 
-bool Chapters::ExpandEditionsArray()
-{
-    if (m_editions_size > m_editions_count)
-        return true;  // nothing else to do
+  Edition& e = m_editions[m_editions_count++];
+  e.Init();
 
-    const int size = (m_editions_size == 0) ? 1 : 2 * m_editions_size;
+  return e.Parse(m_pSegment->m_pReader, pos, size);
+}
 
-    Edition* const editions = new (std::nothrow) Edition[size];
+Chapters::Edition::Edition() {}
 
-    if (editions == NULL)
-        return false;
+Chapters::Edition::~Edition() {}
 
-    for (int idx = 0; idx < m_editions_count; ++idx)
-    {
-        m_editions[idx].ShallowCopy(editions[idx]);
+int Chapters::Edition::GetAtomCount() const { return m_atoms_count; }
+
+const Chapters::Atom* Chapters::Edition::GetAtom(int index) const {
+  if (index < 0)
+    return NULL;
+
+  if (index >= m_atoms_count)
+    return NULL;
+
+  return m_atoms + index;
+}
+
+void Chapters::Edition::Init() {
+  m_atoms = NULL;
+  m_atoms_size = 0;
+  m_atoms_count = 0;
+}
+
+void Chapters::Edition::ShallowCopy(Edition& rhs) const {
+  rhs.m_atoms = m_atoms;
+  rhs.m_atoms_size = m_atoms_size;
+  rhs.m_atoms_count = m_atoms_count;
+}
+
+void Chapters::Edition::Clear() {
+  while (m_atoms_count > 0) {
+    Atom& a = m_atoms[--m_atoms_count];
+    a.Clear();
+  }
+
+  delete[] m_atoms;
+  m_atoms = NULL;
+
+  m_atoms_size = 0;
+}
+
+long Chapters::Edition::Parse(IMkvReader* pReader, long long pos,
+                              long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size == 0)  // weird
+      continue;
+
+    if (id == 0x36) {  // Atom ID
+      status = ParseAtom(pReader, pos, size);
+
+      if (status < 0)  // error
+        return status;
     }
 
-    delete[] m_editions;
-    m_editions = editions;
+    pos += size;
+    assert(pos <= stop);
+  }
 
-    m_editions_size = size;
-    return true;
+  assert(pos == stop);
+  return 0;
 }
 
+long Chapters::Edition::ParseAtom(IMkvReader* pReader, long long pos,
+                                  long long size) {
+  if (!ExpandAtomsArray())
+    return -1;
 
-long Chapters::ParseEdition(
-    long long pos,
-    long long size)
-{
-    if (!ExpandEditionsArray())
-        return -1;
+  Atom& a = m_atoms[m_atoms_count++];
+  a.Init();
 
-    Edition& e = m_editions[m_editions_count++];
-    e.Init();
-
-    return e.Parse(m_pSegment->m_pReader, pos, size);
+  return a.Parse(pReader, pos, size);
 }
 
+bool Chapters::Edition::ExpandAtomsArray() {
+  if (m_atoms_size > m_atoms_count)
+    return true;  // nothing else to do
 
-Chapters::Edition::Edition()
-{
+  const int size = (m_atoms_size == 0) ? 1 : 2 * m_atoms_size;
+
+  Atom* const atoms = new (std::nothrow) Atom[size];
+
+  if (atoms == NULL)
+    return false;
+
+  for (int idx = 0; idx < m_atoms_count; ++idx) {
+    m_atoms[idx].ShallowCopy(atoms[idx]);
+  }
+
+  delete[] m_atoms;
+  m_atoms = atoms;
+
+  m_atoms_size = size;
+  return true;
 }
 
+Chapters::Atom::Atom() {}
 
-Chapters::Edition::~Edition()
-{
+Chapters::Atom::~Atom() {}
+
+unsigned long long Chapters::Atom::GetUID() const { return m_uid; }
+
+const char* Chapters::Atom::GetStringUID() const { return m_string_uid; }
+
+long long Chapters::Atom::GetStartTimecode() const { return m_start_timecode; }
+
+long long Chapters::Atom::GetStopTimecode() const { return m_stop_timecode; }
+
+long long Chapters::Atom::GetStartTime(const Chapters* pChapters) const {
+  return GetTime(pChapters, m_start_timecode);
 }
 
-
-int Chapters::Edition::GetAtomCount() const
-{
-    return m_atoms_count;
+long long Chapters::Atom::GetStopTime(const Chapters* pChapters) const {
+  return GetTime(pChapters, m_stop_timecode);
 }
 
+int Chapters::Atom::GetDisplayCount() const { return m_displays_count; }
 
-const Chapters::Atom* Chapters::Edition::GetAtom(int index) const
-{
-    if (index < 0)
-        return NULL;
+const Chapters::Display* Chapters::Atom::GetDisplay(int index) const {
+  if (index < 0)
+    return NULL;
 
-    if (index >= m_atoms_count)
-        return NULL;
+  if (index >= m_displays_count)
+    return NULL;
 
-    return m_atoms + index;
+  return m_displays + index;
 }
 
+void Chapters::Atom::Init() {
+  m_string_uid = NULL;
+  m_uid = 0;
+  m_start_timecode = -1;
+  m_stop_timecode = -1;
 
-void Chapters::Edition::Init()
-{
-    m_atoms = NULL;
-    m_atoms_size = 0;
-    m_atoms_count = 0;
+  m_displays = NULL;
+  m_displays_size = 0;
+  m_displays_count = 0;
 }
 
+void Chapters::Atom::ShallowCopy(Atom& rhs) const {
+  rhs.m_string_uid = m_string_uid;
+  rhs.m_uid = m_uid;
+  rhs.m_start_timecode = m_start_timecode;
+  rhs.m_stop_timecode = m_stop_timecode;
 
-void Chapters::Edition::ShallowCopy(Edition& rhs) const
-{
-    rhs.m_atoms = m_atoms;
-    rhs.m_atoms_size = m_atoms_size;
-    rhs.m_atoms_count = m_atoms_count;
+  rhs.m_displays = m_displays;
+  rhs.m_displays_size = m_displays_size;
+  rhs.m_displays_count = m_displays_count;
 }
 
+void Chapters::Atom::Clear() {
+  delete[] m_string_uid;
+  m_string_uid = NULL;
 
-void Chapters::Edition::Clear()
-{
-    while (m_atoms_count > 0)
-    {
-        Atom& a = m_atoms[--m_atoms_count];
-        a.Clear();
+  while (m_displays_count > 0) {
+    Display& d = m_displays[--m_displays_count];
+    d.Clear();
+  }
+
+  delete[] m_displays;
+  m_displays = NULL;
+
+  m_displays_size = 0;
+}
+
+long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size == 0)  // weird
+      continue;
+
+    if (id == 0x00) {  // Display ID
+      status = ParseDisplay(pReader, pos, size);
+
+      if (status < 0)  // error
+        return status;
+    } else if (id == 0x1654) {  // StringUID ID
+      status = UnserializeString(pReader, pos, size, m_string_uid);
+
+      if (status < 0)  // error
+        return status;
+    } else if (id == 0x33C4) {  // UID ID
+      const long long val = UnserializeUInt(pReader, pos, size);
+
+      if (val < 0)  // error
+        return static_cast<long>(val);
+
+      m_uid = val;
+    } else if (id == 0x11) {  // TimeStart ID
+      const long long val = UnserializeUInt(pReader, pos, size);
+
+      if (val < 0)  // error
+        return static_cast<long>(val);
+
+      m_start_timecode = val;
+    } else if (id == 0x12) {  // TimeEnd ID
+      const long long val = UnserializeUInt(pReader, pos, size);
+
+      if (val < 0)  // error
+        return static_cast<long>(val);
+
+      m_stop_timecode = val;
     }
 
-    delete[] m_atoms;
-    m_atoms = NULL;
+    pos += size;
+    assert(pos <= stop);
+  }
 
-    m_atoms_size = 0;
+  assert(pos == stop);
+  return 0;
 }
 
+long long Chapters::Atom::GetTime(const Chapters* pChapters,
+                                  long long timecode) {
+  if (pChapters == NULL)
+    return -1;
 
-long Chapters::Edition::Parse(
-    IMkvReader* pReader,
-    long long pos,
-    long long size)
-{
-    const long long stop = pos + size;
+  Segment* const pSegment = pChapters->m_pSegment;
 
-    while (pos < stop)
-    {
-        long long id, size;
+  if (pSegment == NULL)  // weird
+    return -1;
 
-        long status = ParseElementHeader(
-                        pReader,
-                        pos,
-                        stop,
-                        id,
-                        size);
+  const SegmentInfo* const pInfo = pSegment->GetInfo();
 
-        if (status < 0)  // error
-            return status;
+  if (pInfo == NULL)
+    return -1;
 
-        if (size == 0)  // weird
-            continue;
+  const long long timecode_scale = pInfo->GetTimeCodeScale();
 
-        if (id == 0x36)  // Atom ID
-        {
-            status = ParseAtom(pReader, pos, size);
+  if (timecode_scale < 1)  // weird
+    return -1;
 
-            if (status < 0)  // error
-                return status;
-        }
+  if (timecode < 0)
+    return -1;
 
-        pos += size;
-        assert(pos <= stop);
+  const long long result = timecode_scale * timecode;
+
+  return result;
+}
+
+long Chapters::Atom::ParseDisplay(IMkvReader* pReader, long long pos,
+                                  long long size) {
+  if (!ExpandDisplaysArray())
+    return -1;
+
+  Display& d = m_displays[m_displays_count++];
+  d.Init();
+
+  return d.Parse(pReader, pos, size);
+}
+
+bool Chapters::Atom::ExpandDisplaysArray() {
+  if (m_displays_size > m_displays_count)
+    return true;  // nothing else to do
+
+  const int size = (m_displays_size == 0) ? 1 : 2 * m_displays_size;
+
+  Display* const displays = new (std::nothrow) Display[size];
+
+  if (displays == NULL)
+    return false;
+
+  for (int idx = 0; idx < m_displays_count; ++idx) {
+    m_displays[idx].ShallowCopy(displays[idx]);
+  }
+
+  delete[] m_displays;
+  m_displays = displays;
+
+  m_displays_size = size;
+  return true;
+}
+
+Chapters::Display::Display() {}
+
+Chapters::Display::~Display() {}
+
+const char* Chapters::Display::GetString() const { return m_string; }
+
+const char* Chapters::Display::GetLanguage() const { return m_language; }
+
+const char* Chapters::Display::GetCountry() const { return m_country; }
+
+void Chapters::Display::Init() {
+  m_string = NULL;
+  m_language = NULL;
+  m_country = NULL;
+}
+
+void Chapters::Display::ShallowCopy(Display& rhs) const {
+  rhs.m_string = m_string;
+  rhs.m_language = m_language;
+  rhs.m_country = m_country;
+}
+
+void Chapters::Display::Clear() {
+  delete[] m_string;
+  m_string = NULL;
+
+  delete[] m_language;
+  m_language = NULL;
+
+  delete[] m_country;
+  m_country = NULL;
+}
+
+long Chapters::Display::Parse(IMkvReader* pReader, long long pos,
+                              long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size == 0)  // weird
+      continue;
+
+    if (id == 0x05) {  // ChapterString ID
+      status = UnserializeString(pReader, pos, size, m_string);
+
+      if (status)
+        return status;
+    } else if (id == 0x037C) {  // ChapterLanguage ID
+      status = UnserializeString(pReader, pos, size, m_language);
+
+      if (status)
+        return status;
+    } else if (id == 0x037E) {  // ChapterCountry ID
+      status = UnserializeString(pReader, pos, size, m_country);
+
+      if (status)
+        return status;
     }
 
-    assert(pos == stop);
-    return 0;
+    pos += size;
+    assert(pos <= stop);
+  }
+
+  assert(pos == stop);
+  return 0;
 }
 
+SegmentInfo::SegmentInfo(Segment* pSegment, long long start, long long size_,
+                         long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_pMuxingAppAsUTF8(NULL),
+      m_pWritingAppAsUTF8(NULL),
+      m_pTitleAsUTF8(NULL) {}
 
-long Chapters::Edition::ParseAtom(
-    IMkvReader* pReader,
-    long long pos,
-    long long size)
-{
-    if (!ExpandAtomsArray())
-        return -1;
+SegmentInfo::~SegmentInfo() {
+  delete[] m_pMuxingAppAsUTF8;
+  m_pMuxingAppAsUTF8 = NULL;
 
-    Atom& a = m_atoms[m_atoms_count++];
-    a.Init();
+  delete[] m_pWritingAppAsUTF8;
+  m_pWritingAppAsUTF8 = NULL;
 
-    return a.Parse(pReader, pos, size);
+  delete[] m_pTitleAsUTF8;
+  m_pTitleAsUTF8 = NULL;
 }
 
+long SegmentInfo::Parse() {
+  assert(m_pMuxingAppAsUTF8 == NULL);
+  assert(m_pWritingAppAsUTF8 == NULL);
+  assert(m_pTitleAsUTF8 == NULL);
 
-bool Chapters::Edition::ExpandAtomsArray()
-{
-    if (m_atoms_size > m_atoms_count)
-        return true;  // nothing else to do
+  IMkvReader* const pReader = m_pSegment->m_pReader;
 
-    const int size = (m_atoms_size == 0) ? 1 : 2 * m_atoms_size;
+  long long pos = m_start;
+  const long long stop = m_start + m_size;
 
-    Atom* const atoms = new (std::nothrow) Atom[size];
+  m_timecodeScale = 1000000;
+  m_duration = -1;
 
-    if (atoms == NULL)
-        return false;
+  while (pos < stop) {
+    long long id, size;
 
-    for (int idx = 0; idx < m_atoms_count; ++idx)
-    {
-        m_atoms[idx].ShallowCopy(atoms[idx]);
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == 0x0AD7B1) {  // Timecode Scale
+      m_timecodeScale = UnserializeUInt(pReader, pos, size);
+
+      if (m_timecodeScale <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x0489) {  // Segment duration
+      const long status = UnserializeFloat(pReader, pos, size, m_duration);
+
+      if (status < 0)
+        return status;
+
+      if (m_duration < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x0D80) {  // MuxingApp
+      const long status =
+          UnserializeString(pReader, pos, size, m_pMuxingAppAsUTF8);
+
+      if (status)
+        return status;
+    } else if (id == 0x1741) {  // WritingApp
+      const long status =
+          UnserializeString(pReader, pos, size, m_pWritingAppAsUTF8);
+
+      if (status)
+        return status;
+    } else if (id == 0x3BA9) {  // Title
+      const long status = UnserializeString(pReader, pos, size, m_pTitleAsUTF8);
+
+      if (status)
+        return status;
     }
 
-    delete[] m_atoms;
-    m_atoms = atoms;
+    pos += size;
+    assert(pos <= stop);
+  }
 
-    m_atoms_size = size;
-    return true;
+  assert(pos == stop);
+
+  return 0;
 }
 
+long long SegmentInfo::GetTimeCodeScale() const { return m_timecodeScale; }
 
-Chapters::Atom::Atom()
-{
+long long SegmentInfo::GetDuration() const {
+  if (m_duration < 0)
+    return -1;
+
+  assert(m_timecodeScale >= 1);
+
+  const double dd = double(m_duration) * double(m_timecodeScale);
+  const long long d = static_cast<long long>(dd);
+
+  return d;
 }
 
-
-Chapters::Atom::~Atom()
-{
+const char* SegmentInfo::GetMuxingAppAsUTF8() const {
+  return m_pMuxingAppAsUTF8;
 }
 
-
-unsigned long long Chapters::Atom::GetUID() const
-{
-    return m_uid;
+const char* SegmentInfo::GetWritingAppAsUTF8() const {
+  return m_pWritingAppAsUTF8;
 }
 
-
-const char* Chapters::Atom::GetStringUID() const
-{
-    return m_string_uid;
-}
-
-
-long long Chapters::Atom::GetStartTimecode() const
-{
-    return m_start_timecode;
-}
-
-
-long long Chapters::Atom::GetStopTimecode() const
-{
-    return m_stop_timecode;
-}
-
-
-long long Chapters::Atom::GetStartTime(const Chapters* pChapters) const
-{
-    return GetTime(pChapters, m_start_timecode);
-}
-
-
-long long Chapters::Atom::GetStopTime(const Chapters* pChapters) const
-{
-    return GetTime(pChapters, m_stop_timecode);
-}
-
-
-int Chapters::Atom::GetDisplayCount() const
-{
-    return m_displays_count;
-}
-
-
-const Chapters::Display* Chapters::Atom::GetDisplay(int index) const
-{
-    if (index < 0)
-        return NULL;
-
-    if (index >= m_displays_count)
-        return NULL;
-
-    return m_displays + index;
-}
-
-
-void Chapters::Atom::Init()
-{
-    m_string_uid = NULL;
-    m_uid = 0;
-    m_start_timecode = -1;
-    m_stop_timecode = -1;
-
-    m_displays = NULL;
-    m_displays_size = 0;
-    m_displays_count = 0;
-}
-
-
-void Chapters::Atom::ShallowCopy(Atom& rhs) const
-{
-    rhs.m_string_uid = m_string_uid;
-    rhs.m_uid = m_uid;
-    rhs.m_start_timecode = m_start_timecode;
-    rhs.m_stop_timecode = m_stop_timecode;
-
-    rhs.m_displays = m_displays;
-    rhs.m_displays_size = m_displays_size;
-    rhs.m_displays_count = m_displays_count;
-}
-
-
-void Chapters::Atom::Clear()
-{
-    delete[] m_string_uid;
-    m_string_uid = NULL;
-
-    while (m_displays_count > 0)
-    {
-        Display& d = m_displays[--m_displays_count];
-        d.Clear();
-    }
-
-    delete[] m_displays;
-    m_displays = NULL;
-
-    m_displays_size = 0;
-}
-
-
-long Chapters::Atom::Parse(
-    IMkvReader* pReader,
-    long long pos,
-    long long size)
-{
-    const long long stop = pos + size;
-
-    while (pos < stop)
-    {
-        long long id, size;
-
-        long status = ParseElementHeader(
-                        pReader,
-                        pos,
-                        stop,
-                        id,
-                        size);
-
-        if (status < 0)  // error
-            return status;
-
-        if (size == 0)  // weird
-            continue;
-
-        if (id == 0x00)  // Display ID
-        {
-            status = ParseDisplay(pReader, pos, size);
-
-            if (status < 0)  // error
-                return status;
-        }
-        else if (id == 0x1654)  // StringUID ID
-        {
-            status = UnserializeString(pReader, pos, size, m_string_uid);
-
-            if (status < 0)  // error
-                return status;
-        }
-        else if (id == 0x33C4)  // UID ID
-        {
-            const long long val = UnserializeUInt(pReader, pos, size);
-
-            if (val < 0)  // error
-                return static_cast<long>(val);
-
-            m_uid = val;
-        }
-        else if (id == 0x11)  // TimeStart ID
-        {
-            const long long val = UnserializeUInt(pReader, pos, size);
-
-            if (val < 0)  // error
-                return static_cast<long>(val);
-
-            m_start_timecode = val;
-        }
-        else if (id == 0x12)  // TimeEnd ID
-        {
-            const long long val = UnserializeUInt(pReader, pos, size);
-
-            if (val < 0)  // error
-                return static_cast<long>(val);
-
-            m_stop_timecode = val;
-        }
-
-        pos += size;
-        assert(pos <= stop);
-    }
-
-    assert(pos == stop);
-    return 0;
-}
-
-
-long long Chapters::Atom::GetTime(
-    const Chapters* pChapters,
-    long long timecode)
-{
-    if (pChapters == NULL)
-        return -1;
-
-    Segment* const pSegment = pChapters->m_pSegment;
-
-    if (pSegment == NULL)  // weird
-        return -1;
-
-    const SegmentInfo* const pInfo = pSegment->GetInfo();
-
-    if (pInfo == NULL)
-        return -1;
-
-    const long long timecode_scale = pInfo->GetTimeCodeScale();
-
-    if (timecode_scale < 1)  // weird
-        return -1;
-
-    if (timecode < 0)
-        return -1;
-
-    const long long result = timecode_scale * timecode;
-
-    return result;
-}
-
-
-long Chapters::Atom::ParseDisplay(
-    IMkvReader* pReader,
-    long long pos,
-    long long size)
-{
-    if (!ExpandDisplaysArray())
-        return -1;
-
-    Display& d = m_displays[m_displays_count++];
-    d.Init();
-
-    return d.Parse(pReader, pos, size);
-}
-
-
-bool Chapters::Atom::ExpandDisplaysArray()
-{
-    if (m_displays_size > m_displays_count)
-        return true;  // nothing else to do
-
-    const int size = (m_displays_size == 0) ? 1 : 2 * m_displays_size;
-
-    Display* const displays = new (std::nothrow) Display[size];
-
-    if (displays == NULL)
-        return false;
-
-    for (int idx = 0; idx < m_displays_count; ++idx)
-    {
-        m_displays[idx].ShallowCopy(displays[idx]);
-    }
-
-    delete[] m_displays;
-    m_displays = displays;
-
-    m_displays_size = size;
-    return true;
-}
-
-
-Chapters::Display::Display()
-{
-}
-
-
-Chapters::Display::~Display()
-{
-}
-
-
-const char* Chapters::Display::GetString() const
-{
-    return m_string;
-}
-
-
-const char* Chapters::Display::GetLanguage() const
-{
-    return m_language;
-}
-
-
-const char* Chapters::Display::GetCountry() const
-{
-    return m_country;
-}
-
-
-void Chapters::Display::Init()
-{
-    m_string = NULL;
-    m_language = NULL;
-    m_country = NULL;
-}
-
-
-void Chapters::Display::ShallowCopy(Display& rhs) const
-{
-    rhs.m_string = m_string;
-    rhs.m_language = m_language;
-    rhs.m_country = m_country;
-}
-
-
-void Chapters::Display::Clear()
-{
-    delete[] m_string;
-    m_string = NULL;
-
-    delete[] m_language;
-    m_language = NULL;
-
-    delete[] m_country;
-    m_country = NULL;
-}
-
-
-long Chapters::Display::Parse(
-    IMkvReader* pReader,
-    long long pos,
-    long long size)
-{
-    const long long stop = pos + size;
-
-    while (pos < stop)
-    {
-        long long id, size;
-
-        long status = ParseElementHeader(
-                        pReader,
-                        pos,
-                        stop,
-                        id,
-                        size);
-
-        if (status < 0)  // error
-            return status;
-
-        if (size == 0)  // weird
-            continue;
-
-        if (id == 0x05)  // ChapterString ID
-        {
-            status = UnserializeString(pReader, pos, size, m_string);
-
-            if (status)
-              return status;
-        }
-        else if (id == 0x037C)  // ChapterLanguage ID
-        {
-            status = UnserializeString(pReader, pos, size, m_language);
-
-            if (status)
-              return status;
-        }
-        else if (id == 0x037E)  // ChapterCountry ID
-        {
-            status = UnserializeString(pReader, pos, size, m_country);
-
-            if (status)
-              return status;
-        }
-
-        pos += size;
-        assert(pos <= stop);
-    }
-
-    assert(pos == stop);
-    return 0;
-}
-
-
-SegmentInfo::SegmentInfo(
-    Segment* pSegment,
-    long long start,
-    long long size_,
-    long long element_start,
-    long long element_size) :
-    m_pSegment(pSegment),
-    m_start(start),
-    m_size(size_),
-    m_element_start(element_start),
-    m_element_size(element_size),
-    m_pMuxingAppAsUTF8(NULL),
-    m_pWritingAppAsUTF8(NULL),
-    m_pTitleAsUTF8(NULL)
-{
-}
-
-SegmentInfo::~SegmentInfo()
-{
-    delete[] m_pMuxingAppAsUTF8;
-    m_pMuxingAppAsUTF8 = NULL;
-
-    delete[] m_pWritingAppAsUTF8;
-    m_pWritingAppAsUTF8 = NULL;
-
-    delete[] m_pTitleAsUTF8;
-    m_pTitleAsUTF8 = NULL;
-}
-
-
-long SegmentInfo::Parse()
-{
-    assert(m_pMuxingAppAsUTF8 == NULL);
-    assert(m_pWritingAppAsUTF8 == NULL);
-    assert(m_pTitleAsUTF8 == NULL);
-
-    IMkvReader* const pReader = m_pSegment->m_pReader;
-
-    long long pos = m_start;
-    const long long stop = m_start + m_size;
-
-    m_timecodeScale = 1000000;
-    m_duration = -1;
-
-    while (pos < stop)
-    {
-        long long id, size;
-
-        const long status = ParseElementHeader(
-                                pReader,
-                                pos,
-                                stop,
-                                id,
-                                size);
-
-        if (status < 0)  //error
-            return status;
-
-        if (id == 0x0AD7B1)  //Timecode Scale
-        {
-            m_timecodeScale = UnserializeUInt(pReader, pos, size);
-
-            if (m_timecodeScale <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x0489)  //Segment duration
-        {
-            const long status = UnserializeFloat(
-                                    pReader,
-                                    pos,
-                                    size,
-                                    m_duration);
-
-            if (status < 0)
-                return status;
-
-            if (m_duration < 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x0D80)  //MuxingApp
-        {
-            const long status = UnserializeString(
-                                    pReader,
-                                    pos,
-                                    size,
-                                    m_pMuxingAppAsUTF8);
-
-            if (status)
-                return status;
-        }
-        else if (id == 0x1741)  //WritingApp
-        {
-            const long status = UnserializeString(
-                                    pReader,
-                                    pos,
-                                    size,
-                                    m_pWritingAppAsUTF8);
-
-            if (status)
-                return status;
-        }
-        else if (id == 0x3BA9)  //Title
-        {
-            const long status = UnserializeString(
-                                    pReader,
-                                    pos,
-                                    size,
-                                    m_pTitleAsUTF8);
-
-            if (status)
-                return status;
-        }
-
-        pos += size;
-        assert(pos <= stop);
-    }
-
-    assert(pos == stop);
-
-    return 0;
-}
-
-
-long long SegmentInfo::GetTimeCodeScale() const
-{
-    return m_timecodeScale;
-}
-
-
-long long SegmentInfo::GetDuration() const
-{
-    if (m_duration < 0)
-        return -1;
-
-    assert(m_timecodeScale >= 1);
-
-    const double dd = double(m_duration) * double(m_timecodeScale);
-    const long long d = static_cast<long long>(dd);
-
-    return d;
-}
-
-const char* SegmentInfo::GetMuxingAppAsUTF8() const
-{
-    return m_pMuxingAppAsUTF8;
-}
-
-
-const char* SegmentInfo::GetWritingAppAsUTF8() const
-{
-    return m_pWritingAppAsUTF8;
-}
-
-const char* SegmentInfo::GetTitleAsUTF8() const
-{
-    return m_pTitleAsUTF8;
-}
+const char* SegmentInfo::GetTitleAsUTF8() const { return m_pTitleAsUTF8; }
 
 ///////////////////////////////////////////////////////////////
 // ContentEncoding element
 ContentEncoding::ContentCompression::ContentCompression()
-    : algo(0),
-      settings(NULL),
-      settings_len(0) {
-}
+    : algo(0), settings(NULL), settings_len(0) {}
 
 ContentEncoding::ContentCompression::~ContentCompression() {
-  delete [] settings;
+  delete[] settings;
 }
 
 ContentEncoding::ContentEncryption::ContentEncryption()
@@ -5064,13 +4418,12 @@
       sig_key_id(NULL),
       sig_key_id_len(0),
       sig_algo(0),
-      sig_hash_algo(0) {
-}
+      sig_hash_algo(0) {}
 
 ContentEncoding::ContentEncryption::~ContentEncryption() {
-  delete [] key_id;
-  delete [] signature;
-  delete [] sig_key_id;
+  delete[] key_id;
+  delete[] signature;
+  delete[] sig_key_id;
 }
 
 ContentEncoding::ContentEncoding()
@@ -5080,8 +4433,7 @@
       encryption_entries_end_(NULL),
       encoding_order_(0),
       encoding_scope_(1),
-      encoding_type_(0) {
-}
+      encoding_type_(0) {}
 
 ContentEncoding::~ContentEncoding() {
   ContentCompression** comp_i = compression_entries_;
@@ -5092,7 +4444,7 @@
     delete comp;
   }
 
-  delete [] compression_entries_;
+  delete[] compression_entries_;
 
   ContentEncryption** enc_i = encryption_entries_;
   ContentEncryption** const enc_j = encryption_entries_end_;
@@ -5102,10 +4454,9 @@
     delete enc;
   }
 
-  delete [] encryption_entries_;
+  delete[] encryption_entries_;
 }
 
-
 const ContentEncoding::ContentCompression*
 ContentEncoding::GetCompressionByIndex(unsigned long idx) const {
   const ptrdiff_t count = compression_entries_end_ - compression_entries_;
@@ -5124,8 +4475,8 @@
   return static_cast<unsigned long>(count);
 }
 
-const ContentEncoding::ContentEncryption*
-ContentEncoding::GetEncryptionByIndex(unsigned long idx) const {
+const ContentEncoding::ContentEncryption* ContentEncoding::GetEncryptionByIndex(
+    unsigned long idx) const {
   const ptrdiff_t count = encryption_entries_end_ - encryption_entries_;
   assert(count >= 0);
 
@@ -5143,9 +4494,7 @@
 }
 
 long ContentEncoding::ParseContentEncAESSettingsEntry(
-    long long start,
-    long long size,
-    IMkvReader* pReader,
+    long long start, long long size, IMkvReader* pReader,
     ContentEncAESSettings* aes) {
   assert(pReader);
   assert(aes);
@@ -5155,12 +4504,8 @@
 
   while (pos < stop) {
     long long id, size;
-    const long status = ParseElementHeader(pReader,
-                                           pos,
-                                           stop,
-                                           id,
-                                           size);
-    if (status < 0)  //error
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
       return status;
 
     if (id == 0x7E8) {
@@ -5170,15 +4515,14 @@
         return E_FILE_FORMAT_INVALID;
     }
 
-    pos += size;  //consume payload
+    pos += size;  // consume payload
     assert(pos <= stop);
   }
 
   return 0;
 }
 
-long ContentEncoding::ParseContentEncodingEntry(long long start,
-                                                long long size,
+long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
                                                 IMkvReader* pReader) {
   assert(pReader);
 
@@ -5191,12 +4535,8 @@
 
   while (pos < stop) {
     long long id, size;
-    const long status = ParseElementHeader(pReader,
-                                           pos,
-                                           stop,
-                                           id,
-                                           size);
-    if (status < 0)  //error
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
       return status;
 
     if (id == 0x1034)  // ContentCompression ID
@@ -5205,7 +4545,7 @@
     if (id == 0x1035)  // ContentEncryption ID
       ++encryption_count;
 
-    pos += size;  //consume payload
+    pos += size;  // consume payload
     assert(pos <= stop);
   }
 
@@ -5214,7 +4554,7 @@
 
   if (compression_count > 0) {
     compression_entries_ =
-        new (std::nothrow) ContentCompression*[compression_count];
+        new (std::nothrow) ContentCompression* [compression_count];
     if (!compression_entries_)
       return -1;
     compression_entries_end_ = compression_entries_;
@@ -5222,9 +4562,9 @@
 
   if (encryption_count > 0) {
     encryption_entries_ =
-        new (std::nothrow) ContentEncryption*[encryption_count];
+        new (std::nothrow) ContentEncryption* [encryption_count];
     if (!encryption_entries_) {
-      delete [] compression_entries_;
+      delete[] compression_entries_;
       return -1;
     }
     encryption_entries_end_ = encryption_entries_;
@@ -5233,12 +4573,8 @@
   pos = start;
   while (pos < stop) {
     long long id, size;
-    long status = ParseElementHeader(pReader,
-                                     pos,
-                                     stop,
-                                     id,
-                                     size);
-    if (status < 0)  //error
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
       return status;
 
     if (id == 0x1031) {
@@ -5255,7 +4591,7 @@
     } else if (id == 0x1034) {
       // ContentCompression ID
       ContentCompression* const compression =
-        new (std::nothrow) ContentCompression();
+          new (std::nothrow) ContentCompression();
       if (!compression)
         return -1;
 
@@ -5280,7 +4616,7 @@
       *encryption_entries_end_++ = encryption;
     }
 
-    pos += size;  //consume payload
+    pos += size;  // consume payload
     assert(pos <= stop);
   }
 
@@ -5288,11 +4624,9 @@
   return 0;
 }
 
-long ContentEncoding::ParseCompressionEntry(
-    long long start,
-    long long size,
-    IMkvReader* pReader,
-    ContentCompression* compression) {
+long ContentEncoding::ParseCompressionEntry(long long start, long long size,
+                                            IMkvReader* pReader,
+                                            ContentCompression* compression) {
   assert(pReader);
   assert(compression);
 
@@ -5303,12 +4637,8 @@
 
   while (pos < stop) {
     long long id, size;
-    const long status = ParseElementHeader(pReader,
-                                           pos,
-                                           stop,
-                                           id,
-                                           size);
-    if (status < 0)  //error
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
       return status;
 
     if (id == 0x254) {
@@ -5329,9 +4659,10 @@
       if (buf == NULL)
         return -1;
 
-      const int read_status = pReader->Read(pos, buflen, buf);
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
       if (read_status) {
-        delete [] buf;
+        delete[] buf;
         return status;
       }
 
@@ -5339,7 +4670,7 @@
       compression->settings_len = buflen;
     }
 
-    pos += size;  //consume payload
+    pos += size;  // consume payload
     assert(pos <= stop);
   }
 
@@ -5350,11 +4681,9 @@
   return 0;
 }
 
-long ContentEncoding::ParseEncryptionEntry(
-    long long start,
-    long long size,
-    IMkvReader* pReader,
-    ContentEncryption* encryption) {
+long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
+                                           IMkvReader* pReader,
+                                           ContentEncryption* encryption) {
   assert(pReader);
   assert(encryption);
 
@@ -5363,12 +4692,8 @@
 
   while (pos < stop) {
     long long id, size;
-    const long status = ParseElementHeader(pReader,
-                                           pos,
-                                           stop,
-                                           id,
-                                           size);
-    if (status < 0)  //error
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
       return status;
 
     if (id == 0x7E1) {
@@ -5378,7 +4703,7 @@
         return E_FILE_FORMAT_INVALID;
     } else if (id == 0x7E2) {
       // ContentEncKeyID
-      delete[] encryption->key_id;
+      delete[] encryption -> key_id;
       encryption->key_id = NULL;
       encryption->key_id_len = 0;
 
@@ -5391,9 +4716,10 @@
       if (buf == NULL)
         return -1;
 
-      const int read_status = pReader->Read(pos, buflen, buf);
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
       if (read_status) {
-        delete [] buf;
+        delete[] buf;
         return status;
       }
 
@@ -5401,7 +4727,7 @@
       encryption->key_id_len = buflen;
     } else if (id == 0x7E3) {
       // ContentSignature
-      delete[] encryption->signature;
+      delete[] encryption -> signature;
       encryption->signature = NULL;
       encryption->signature_len = 0;
 
@@ -5414,9 +4740,10 @@
       if (buf == NULL)
         return -1;
 
-      const int read_status = pReader->Read(pos, buflen, buf);
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
       if (read_status) {
-        delete [] buf;
+        delete[] buf;
         return status;
       }
 
@@ -5424,7 +4751,7 @@
       encryption->signature_len = buflen;
     } else if (id == 0x7E4) {
       // ContentSigKeyID
-      delete[] encryption->sig_key_id;
+      delete[] encryption -> sig_key_id;
       encryption->sig_key_id = NULL;
       encryption->sig_key_id_len = 0;
 
@@ -5437,9 +4764,10 @@
       if (buf == NULL)
         return -1;
 
-      const int read_status = pReader->Read(pos, buflen, buf);
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
       if (read_status) {
-        delete [] buf;
+        delete[] buf;
         return status;
       }
 
@@ -5454,400 +4782,322 @@
     } else if (id == 0x7E7) {
       // ContentEncAESSettings
       const long status = ParseContentEncAESSettingsEntry(
-          pos,
-          size,
-          pReader,
-          &encryption->aes_settings);
+          pos, size, pReader, &encryption->aes_settings);
       if (status)
         return status;
     }
 
-    pos += size;  //consume payload
+    pos += size;  // consume payload
     assert(pos <= stop);
   }
 
   return 0;
 }
 
-Track::Track(
-    Segment* pSegment,
-    long long element_start,
-    long long element_size) :
-    m_pSegment(pSegment),
-    m_element_start(element_start),
-    m_element_size(element_size),
-    content_encoding_entries_(NULL),
-    content_encoding_entries_end_(NULL)
-{
+Track::Track(Segment* pSegment, long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      content_encoding_entries_(NULL),
+      content_encoding_entries_end_(NULL) {}
+
+Track::~Track() {
+  Info& info = const_cast<Info&>(m_info);
+  info.Clear();
+
+  ContentEncoding** i = content_encoding_entries_;
+  ContentEncoding** const j = content_encoding_entries_end_;
+
+  while (i != j) {
+    ContentEncoding* const encoding = *i++;
+    delete encoding;
+  }
+
+  delete[] content_encoding_entries_;
 }
 
-Track::~Track()
-{
-    Info& info = const_cast<Info&>(m_info);
-    info.Clear();
+long Track::Create(Segment* pSegment, const Info& info, long long element_start,
+                   long long element_size, Track*& pResult) {
+  if (pResult)
+    return -1;
 
-    ContentEncoding** i = content_encoding_entries_;
-    ContentEncoding** const j = content_encoding_entries_end_;
+  Track* const pTrack =
+      new (std::nothrow) Track(pSegment, element_start, element_size);
 
-    while (i != j) {
-        ContentEncoding* const encoding = *i++;
-        delete encoding;
-    }
+  if (pTrack == NULL)
+    return -1;  // generic error
 
-    delete [] content_encoding_entries_;
+  const int status = info.Copy(pTrack->m_info);
+
+  if (status) {  // error
+    delete pTrack;
+    return status;
+  }
+
+  pResult = pTrack;
+  return 0;  // success
 }
 
-long Track::Create(
-    Segment* pSegment,
-    const Info& info,
-    long long element_start,
-    long long element_size,
-    Track*& pResult)
-{
-    if (pResult)
-        return -1;
+Track::Info::Info()
+    : uid(0),
+      defaultDuration(0),
+      codecDelay(0),
+      seekPreRoll(0),
+      nameAsUTF8(NULL),
+      language(NULL),
+      codecId(NULL),
+      codecNameAsUTF8(NULL),
+      codecPrivate(NULL),
+      codecPrivateSize(0),
+      lacing(false) {}
 
-    Track* const pTrack = new (std::nothrow) Track(pSegment,
-                                                   element_start,
-                                                   element_size);
+Track::Info::~Info() { Clear(); }
 
-    if (pTrack == NULL)
-        return -1;  //generic error
+void Track::Info::Clear() {
+  delete[] nameAsUTF8;
+  nameAsUTF8 = NULL;
 
-    const int status = info.Copy(pTrack->m_info);
+  delete[] language;
+  language = NULL;
 
-    if (status)  // error
-    {
-        delete pTrack;
-        return status;
-    }
+  delete[] codecId;
+  codecId = NULL;
 
-    pResult = pTrack;
-    return 0;  //success
+  delete[] codecPrivate;
+  codecPrivate = NULL;
+  codecPrivateSize = 0;
+
+  delete[] codecNameAsUTF8;
+  codecNameAsUTF8 = NULL;
 }
 
-Track::Info::Info():
-    uid(0),
-    defaultDuration(0),
-    codecDelay(0),
-    seekPreRoll(0),
-    nameAsUTF8(NULL),
-    language(NULL),
-    codecId(NULL),
-    codecNameAsUTF8(NULL),
-    codecPrivate(NULL),
-    codecPrivateSize(0),
-    lacing(false)
-{
-}
+int Track::Info::CopyStr(char* Info::*str, Info& dst_) const {
+  if (str == static_cast<char * Info::*>(NULL))
+    return -1;
 
-Track::Info::~Info()
-{
-    Clear();
-}
+  char*& dst = dst_.*str;
 
-void Track::Info::Clear()
-{
-    delete[] nameAsUTF8;
-    nameAsUTF8 = NULL;
+  if (dst)  // should be NULL already
+    return -1;
 
-    delete[] language;
-    language = NULL;
+  const char* const src = this->*str;
 
-    delete[] codecId;
-    codecId = NULL;
-
-    delete[] codecPrivate;
-    codecPrivate = NULL;
-    codecPrivateSize = 0;
-
-    delete[] codecNameAsUTF8;
-    codecNameAsUTF8 = NULL;
-}
-
-int Track::Info::CopyStr(char* Info::*str, Info& dst_) const
-{
-    if (str == static_cast<char* Info::*>(NULL))
-        return -1;
-
-    char*& dst = dst_.*str;
-
-    if (dst)  //should be NULL already
-        return -1;
-
-    const char* const src = this->*str;
-
-    if (src == NULL)
-        return 0;
-
-    const size_t len = strlen(src);
-
-    dst = new (std::nothrow) char[len+1];
-
-    if (dst == NULL)
-        return -1;
-
-    strcpy(dst, src);
-
+  if (src == NULL)
     return 0;
+
+  const size_t len = strlen(src);
+
+  dst = new (std::nothrow) char[len + 1];
+
+  if (dst == NULL)
+    return -1;
+
+  strcpy(dst, src);
+
+  return 0;
 }
 
+int Track::Info::Copy(Info& dst) const {
+  if (&dst == this)
+    return 0;
 
-int Track::Info::Copy(Info& dst) const
-{
-    if (&dst == this)
-        return 0;
+  dst.type = type;
+  dst.number = number;
+  dst.defaultDuration = defaultDuration;
+  dst.codecDelay = codecDelay;
+  dst.seekPreRoll = seekPreRoll;
+  dst.uid = uid;
+  dst.lacing = lacing;
+  dst.settings = settings;
 
-    dst.type = type;
-    dst.number = number;
-    dst.defaultDuration = defaultDuration;
-    dst.codecDelay = codecDelay;
-    dst.seekPreRoll = seekPreRoll;
-    dst.uid = uid;
-    dst.lacing = lacing;
-    dst.settings = settings;
+  // We now copy the string member variables from src to dst.
+  // This involves memory allocation so in principle the operation
+  // can fail (indeed, that's why we have Info::Copy), so we must
+  // report this to the caller.  An error return from this function
+  // therefore implies that the copy was only partially successful.
 
-    //We now copy the string member variables from src to dst.
-    //This involves memory allocation so in principle the operation
-    //can fail (indeed, that's why we have Info::Copy), so we must
-    //report this to the caller.  An error return from this function
-    //therefore implies that the copy was only partially successful.
+  if (int status = CopyStr(&Info::nameAsUTF8, dst))
+    return status;
 
-    if (int status = CopyStr(&Info::nameAsUTF8, dst))
-        return status;
+  if (int status = CopyStr(&Info::language, dst))
+    return status;
 
-    if (int status = CopyStr(&Info::language, dst))
-        return status;
+  if (int status = CopyStr(&Info::codecId, dst))
+    return status;
 
-    if (int status = CopyStr(&Info::codecId, dst))
-        return status;
+  if (int status = CopyStr(&Info::codecNameAsUTF8, dst))
+    return status;
 
-    if (int status = CopyStr(&Info::codecNameAsUTF8, dst))
-        return status;
+  if (codecPrivateSize > 0) {
+    if (codecPrivate == NULL)
+      return -1;
 
-    if (codecPrivateSize > 0)
-    {
-        if (codecPrivate == NULL)
-            return -1;
+    if (dst.codecPrivate)
+      return -1;
 
-        if (dst.codecPrivate)
-            return -1;
+    if (dst.codecPrivateSize != 0)
+      return -1;
 
-        if (dst.codecPrivateSize != 0)
-            return -1;
+    dst.codecPrivate = new (std::nothrow) unsigned char[codecPrivateSize];
 
-        dst.codecPrivate = new (std::nothrow) unsigned char[codecPrivateSize];
+    if (dst.codecPrivate == NULL)
+      return -1;
 
-        if (dst.codecPrivate == NULL)
-            return -1;
+    memcpy(dst.codecPrivate, codecPrivate, codecPrivateSize);
+    dst.codecPrivateSize = codecPrivateSize;
+  }
 
-        memcpy(dst.codecPrivate, codecPrivate, codecPrivateSize);
-        dst.codecPrivateSize = codecPrivateSize;
+  return 0;
+}
+
+const BlockEntry* Track::GetEOS() const { return &m_eos; }
+
+long Track::GetType() const { return m_info.type; }
+
+long Track::GetNumber() const { return m_info.number; }
+
+unsigned long long Track::GetUid() const { return m_info.uid; }
+
+const char* Track::GetNameAsUTF8() const { return m_info.nameAsUTF8; }
+
+const char* Track::GetLanguage() const { return m_info.language; }
+
+const char* Track::GetCodecNameAsUTF8() const { return m_info.codecNameAsUTF8; }
+
+const char* Track::GetCodecId() const { return m_info.codecId; }
+
+const unsigned char* Track::GetCodecPrivate(size_t& size) const {
+  size = m_info.codecPrivateSize;
+  return m_info.codecPrivate;
+}
+
+bool Track::GetLacing() const { return m_info.lacing; }
+
+unsigned long long Track::GetDefaultDuration() const {
+  return m_info.defaultDuration;
+}
+
+unsigned long long Track::GetCodecDelay() const { return m_info.codecDelay; }
+
+unsigned long long Track::GetSeekPreRoll() const { return m_info.seekPreRoll; }
+
+long Track::GetFirst(const BlockEntry*& pBlockEntry) const {
+  const Cluster* pCluster = m_pSegment->GetFirst();
+
+  for (int i = 0;;) {
+    if (pCluster == NULL) {
+      pBlockEntry = GetEOS();
+      return 1;
     }
 
-    return 0;
-}
-
-const BlockEntry* Track::GetEOS() const
-{
-    return &m_eos;
-}
-
-long Track::GetType() const
-{
-    return m_info.type;
-}
-
-long Track::GetNumber() const
-{
-    return m_info.number;
-}
-
-unsigned long long Track::GetUid() const
-{
-    return m_info.uid;
-}
-
-const char* Track::GetNameAsUTF8() const
-{
-    return m_info.nameAsUTF8;
-}
-
-const char* Track::GetLanguage() const
-{
-    return m_info.language;
-}
-
-const char* Track::GetCodecNameAsUTF8() const
-{
-    return m_info.codecNameAsUTF8;
-}
-
-
-const char* Track::GetCodecId() const
-{
-    return m_info.codecId;
-}
-
-const unsigned char* Track::GetCodecPrivate(size_t& size) const
-{
-    size = m_info.codecPrivateSize;
-    return m_info.codecPrivate;
-}
-
-
-bool Track::GetLacing() const
-{
-    return m_info.lacing;
-}
-
-unsigned long long Track::GetDefaultDuration() const
-{
-    return m_info.defaultDuration;
-}
-
-unsigned long long Track::GetCodecDelay() const
-{
-    return m_info.codecDelay;
-}
-
-unsigned long long Track::GetSeekPreRoll() const
-{
-    return m_info.seekPreRoll;
-}
-
-long Track::GetFirst(const BlockEntry*& pBlockEntry) const
-{
-    const Cluster* pCluster = m_pSegment->GetFirst();
-
-    for (int i = 0; ; )
-    {
-        if (pCluster == NULL)
-        {
-            pBlockEntry = GetEOS();
-            return 1;
-        }
-
-        if (pCluster->EOS())
-        {
+    if (pCluster->EOS()) {
 #if 0
-            if (m_pSegment->Unparsed() <= 0)  //all clusters have been loaded
-            {
+            if (m_pSegment->Unparsed() <= 0) {  //all clusters have been loaded
                 pBlockEntry = GetEOS();
                 return 1;
             }
 #else
-            if (m_pSegment->DoneParsing())
-            {
-                pBlockEntry = GetEOS();
-                return 1;
-            }
+      if (m_pSegment->DoneParsing()) {
+        pBlockEntry = GetEOS();
+        return 1;
+      }
 #endif
 
-            pBlockEntry = 0;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        long status = pCluster->GetFirst(pBlockEntry);
-
-        if (status < 0)  //error
-            return status;
-
-        if (pBlockEntry == 0)  //empty cluster
-        {
-            pCluster = m_pSegment->GetNext(pCluster);
-            continue;
-        }
-
-        for (;;)
-        {
-            const Block* const pBlock = pBlockEntry->GetBlock();
-            assert(pBlock);
-
-            const long long tn = pBlock->GetTrackNumber();
-
-            if ((tn == m_info.number) && VetEntry(pBlockEntry))
-                return 0;
-
-            const BlockEntry* pNextEntry;
-
-            status = pCluster->GetNext(pBlockEntry, pNextEntry);
-
-            if (status < 0)  //error
-                return status;
-
-            if (pNextEntry == 0)
-                break;
-
-            pBlockEntry = pNextEntry;
-        }
-
-        ++i;
-
-        if (i >= 100)
-            break;
-
-        pCluster = m_pSegment->GetNext(pCluster);
+      pBlockEntry = 0;
+      return E_BUFFER_NOT_FULL;
     }
 
-    //NOTE: if we get here, it means that we didn't find a block with
-    //a matching track number.  We interpret that as an error (which
-    //might be too conservative).
+    long status = pCluster->GetFirst(pBlockEntry);
 
-    pBlockEntry = GetEOS();  //so we can return a non-NULL value
-    return 1;
-}
+    if (status < 0)  // error
+      return status;
 
+    if (pBlockEntry == 0) {  // empty cluster
+      pCluster = m_pSegment->GetNext(pCluster);
+      continue;
+    }
 
-long Track::GetNext(
-    const BlockEntry* pCurrEntry,
-    const BlockEntry*& pNextEntry) const
-{
-    assert(pCurrEntry);
-    assert(!pCurrEntry->EOS());  //?
+    for (;;) {
+      const Block* const pBlock = pBlockEntry->GetBlock();
+      assert(pBlock);
 
-    const Block* const pCurrBlock = pCurrEntry->GetBlock();
-    assert(pCurrBlock && pCurrBlock->GetTrackNumber() == m_info.number);
-    if (!pCurrBlock || pCurrBlock->GetTrackNumber() != m_info.number)
-        return -1;
+      const long long tn = pBlock->GetTrackNumber();
 
-    const Cluster* pCluster = pCurrEntry->GetCluster();
-    assert(pCluster);
-    assert(!pCluster->EOS());
+      if ((tn == m_info.number) && VetEntry(pBlockEntry))
+        return 0;
 
-    long status = pCluster->GetNext(pCurrEntry, pNextEntry);
+      const BlockEntry* pNextEntry;
 
-    if (status < 0)  //error
+      status = pCluster->GetNext(pBlockEntry, pNextEntry);
+
+      if (status < 0)  // error
         return status;
 
-    for (int i = 0; ; )
-    {
-        while (pNextEntry)
-        {
-            const Block* const pNextBlock = pNextEntry->GetBlock();
-            assert(pNextBlock);
+      if (pNextEntry == 0)
+        break;
 
-            if (pNextBlock->GetTrackNumber() == m_info.number)
-                return 0;
+      pBlockEntry = pNextEntry;
+    }
 
-            pCurrEntry = pNextEntry;
+    ++i;
 
-            status = pCluster->GetNext(pCurrEntry, pNextEntry);
+    if (i >= 100)
+      break;
 
-            if (status < 0) //error
-                return status;
-        }
+    pCluster = m_pSegment->GetNext(pCluster);
+  }
 
-        pCluster = m_pSegment->GetNext(pCluster);
+  // NOTE: if we get here, it means that we didn't find a block with
+  // a matching track number.  We interpret that as an error (which
+  // might be too conservative).
 
-        if (pCluster == NULL)
-        {
-            pNextEntry = GetEOS();
-            return 1;
-        }
+  pBlockEntry = GetEOS();  // so we can return a non-NULL value
+  return 1;
+}
 
-        if (pCluster->EOS())
-        {
+long Track::GetNext(const BlockEntry* pCurrEntry,
+                    const BlockEntry*& pNextEntry) const {
+  assert(pCurrEntry);
+  assert(!pCurrEntry->EOS());  //?
+
+  const Block* const pCurrBlock = pCurrEntry->GetBlock();
+  assert(pCurrBlock && pCurrBlock->GetTrackNumber() == m_info.number);
+  if (!pCurrBlock || pCurrBlock->GetTrackNumber() != m_info.number)
+    return -1;
+
+  const Cluster* pCluster = pCurrEntry->GetCluster();
+  assert(pCluster);
+  assert(!pCluster->EOS());
+
+  long status = pCluster->GetNext(pCurrEntry, pNextEntry);
+
+  if (status < 0)  // error
+    return status;
+
+  for (int i = 0;;) {
+    while (pNextEntry) {
+      const Block* const pNextBlock = pNextEntry->GetBlock();
+      assert(pNextBlock);
+
+      if (pNextBlock->GetTrackNumber() == m_info.number)
+        return 0;
+
+      pCurrEntry = pNextEntry;
+
+      status = pCluster->GetNext(pCurrEntry, pNextEntry);
+
+      if (status < 0)  // error
+        return status;
+    }
+
+    pCluster = m_pSegment->GetNext(pCluster);
+
+    if (pCluster == NULL) {
+      pNextEntry = GetEOS();
+      return 1;
+    }
+
+    if (pCluster->EOS()) {
 #if 0
             if (m_pSegment->Unparsed() <= 0)   //all clusters have been loaded
             {
@@ -5855,155 +5105,148 @@
                 return 1;
             }
 #else
-            if (m_pSegment->DoneParsing())
-            {
-                pNextEntry = GetEOS();
-                return 1;
-            }
+      if (m_pSegment->DoneParsing()) {
+        pNextEntry = GetEOS();
+        return 1;
+      }
 #endif
 
-            //TODO: there is a potential O(n^2) problem here: we tell the
-            //caller to (pre)load another cluster, which he does, but then he
-            //calls GetNext again, which repeats the same search.  This is
-            //a pathological case, since the only way it can happen is if
-            //there exists a long sequence of clusters none of which contain a
-            // block from this track.  One way around this problem is for the
-            //caller to be smarter when he loads another cluster: don't call
-            //us back until you have a cluster that contains a block from this
-            //track. (Of course, that's not cheap either, since our caller
-            //would have to scan the each cluster as it's loaded, so that
-            //would just push back the problem.)
+      // TODO: there is a potential O(n^2) problem here: we tell the
+      // caller to (pre)load another cluster, which he does, but then he
+      // calls GetNext again, which repeats the same search.  This is
+      // a pathological case, since the only way it can happen is if
+      // there exists a long sequence of clusters none of which contain a
+      // block from this track.  One way around this problem is for the
+      // caller to be smarter when he loads another cluster: don't call
+      // us back until you have a cluster that contains a block from this
+      // track. (Of course, that's not cheap either, since our caller
+      // would have to scan the each cluster as it's loaded, so that
+      // would just push back the problem.)
 
-            pNextEntry = NULL;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        status = pCluster->GetFirst(pNextEntry);
-
-        if (status < 0)  //error
-            return status;
-
-        if (pNextEntry == NULL)  //empty cluster
-            continue;
-
-        ++i;
-
-        if (i >= 100)
-            break;
+      pNextEntry = NULL;
+      return E_BUFFER_NOT_FULL;
     }
 
-    //NOTE: if we get here, it means that we didn't find a block with
-    //a matching track number after lots of searching, so we give
-    //up trying.
+    status = pCluster->GetFirst(pNextEntry);
 
-    pNextEntry = GetEOS();  //so we can return a non-NULL value
-    return 1;
+    if (status < 0)  // error
+      return status;
+
+    if (pNextEntry == NULL)  // empty cluster
+      continue;
+
+    ++i;
+
+    if (i >= 100)
+      break;
+  }
+
+  // NOTE: if we get here, it means that we didn't find a block with
+  // a matching track number after lots of searching, so we give
+  // up trying.
+
+  pNextEntry = GetEOS();  // so we can return a non-NULL value
+  return 1;
 }
 
-bool Track::VetEntry(const BlockEntry* pBlockEntry) const
-{
-    assert(pBlockEntry);
-    const Block* const pBlock = pBlockEntry->GetBlock();
-    assert(pBlock);
-    assert(pBlock->GetTrackNumber() == m_info.number);
-    if (!pBlock || pBlock->GetTrackNumber() != m_info.number)
-        return false;
+bool Track::VetEntry(const BlockEntry* pBlockEntry) const {
+  assert(pBlockEntry);
+  const Block* const pBlock = pBlockEntry->GetBlock();
+  assert(pBlock);
+  assert(pBlock->GetTrackNumber() == m_info.number);
+  if (!pBlock || pBlock->GetTrackNumber() != m_info.number)
+    return false;
 
-    // This function is used during a seek to determine whether the
-    // frame is a valid seek target.  This default function simply
-    // returns true, which means all frames are valid seek targets.
-    // It gets overridden by the VideoTrack class, because only video
-    // keyframes can be used as seek target.
+  // This function is used during a seek to determine whether the
+  // frame is a valid seek target.  This default function simply
+  // returns true, which means all frames are valid seek targets.
+  // It gets overridden by the VideoTrack class, because only video
+  // keyframes can be used as seek target.
 
-    return true;
+  return true;
 }
 
-long Track::Seek(
-    long long time_ns,
-    const BlockEntry*& pResult) const
-{
-    const long status = GetFirst(pResult);
+long Track::Seek(long long time_ns, const BlockEntry*& pResult) const {
+  const long status = GetFirst(pResult);
 
-    if (status < 0)  //buffer underflow, etc
-        return status;
+  if (status < 0)  // buffer underflow, etc
+    return status;
 
-    assert(pResult);
+  assert(pResult);
 
-    if (pResult->EOS())
-        return 0;
+  if (pResult->EOS())
+    return 0;
 
-    const Cluster* pCluster = pResult->GetCluster();
+  const Cluster* pCluster = pResult->GetCluster();
+  assert(pCluster);
+  assert(pCluster->GetIndex() >= 0);
+
+  if (time_ns <= pResult->GetBlock()->GetTime(pCluster))
+    return 0;
+
+  Cluster** const clusters = m_pSegment->m_clusters;
+  assert(clusters);
+
+  const long count = m_pSegment->GetCount();  // loaded only, not preloaded
+  assert(count > 0);
+
+  Cluster** const i = clusters + pCluster->GetIndex();
+  assert(i);
+  assert(*i == pCluster);
+  assert(pCluster->GetTime() <= time_ns);
+
+  Cluster** const j = clusters + count;
+
+  Cluster** lo = i;
+  Cluster** hi = j;
+
+  while (lo < hi) {
+    // INVARIANT:
+    //[i, lo) <= time_ns
+    //[lo, hi) ?
+    //[hi, j)  > time_ns
+
+    Cluster** const mid = lo + (hi - lo) / 2;
+    assert(mid < hi);
+
+    pCluster = *mid;
     assert(pCluster);
     assert(pCluster->GetIndex() >= 0);
+    assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters));
 
-    if (time_ns <= pResult->GetBlock()->GetTime(pCluster))
-        return 0;
+    const long long t = pCluster->GetTime();
 
-    Cluster** const clusters = m_pSegment->m_clusters;
-    assert(clusters);
+    if (t <= time_ns)
+      lo = mid + 1;
+    else
+      hi = mid;
 
-    const long count = m_pSegment->GetCount();  //loaded only, not preloaded
-    assert(count > 0);
+    assert(lo <= hi);
+  }
 
-    Cluster** const i = clusters + pCluster->GetIndex();
-    assert(i);
-    assert(*i == pCluster);
+  assert(lo == hi);
+  assert(lo > i);
+  assert(lo <= j);
+
+  while (lo > i) {
+    pCluster = *--lo;
+    assert(pCluster);
     assert(pCluster->GetTime() <= time_ns);
 
-    Cluster** const j = clusters + count;
+    pResult = pCluster->GetEntry(this);
 
-    Cluster** lo = i;
-    Cluster** hi = j;
+    if ((pResult != 0) && !pResult->EOS())
+      return 0;
 
-    while (lo < hi)
-    {
-        //INVARIANT:
-        //[i, lo) <= time_ns
-        //[lo, hi) ?
-        //[hi, j)  > time_ns
+    // landed on empty cluster (no entries)
+  }
 
-        Cluster** const mid = lo + (hi - lo) / 2;
-        assert(mid < hi);
-
-        pCluster = *mid;
-        assert(pCluster);
-        assert(pCluster->GetIndex() >= 0);
-        assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters));
-
-        const long long t = pCluster->GetTime();
-
-        if (t <= time_ns)
-            lo = mid + 1;
-        else
-            hi = mid;
-
-        assert(lo <= hi);
-    }
-
-    assert(lo == hi);
-    assert(lo > i);
-    assert(lo <= j);
-
-    while (lo > i)
-    {
-        pCluster = *--lo;
-        assert(pCluster);
-        assert(pCluster->GetTime() <= time_ns);
-
-        pResult = pCluster->GetEntry(this);
-
-        if ((pResult != 0) && !pResult->EOS())
-            return 0;
-
-        //landed on empty cluster (no entries)
-    }
-
-    pResult = GetEOS();  //weird
-    return 0;
+  pResult = GetEOS();  // weird
+  return 0;
 }
 
-const ContentEncoding*
-Track::GetContentEncodingByIndex(unsigned long idx) const {
+const ContentEncoding* Track::GetContentEncodingByIndex(
+    unsigned long idx) const {
   const ptrdiff_t count =
       content_encoding_entries_end_ - content_encoding_entries_;
   assert(count >= 0);
@@ -6033,27 +5276,22 @@
   int count = 0;
   while (pos < stop) {
     long long id, size;
-    const long status = ParseElementHeader(pReader,
-                                           pos,
-                                           stop,
-                                           id,
-                                           size);
-    if (status < 0)  //error
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
       return status;
 
-
-    //pos now designates start of element
+    // pos now designates start of element
     if (id == 0x2240)  // ContentEncoding ID
       ++count;
 
-    pos += size;  //consume payload
+    pos += size;  // consume payload
     assert(pos <= stop);
   }
 
   if (count <= 0)
     return -1;
 
-  content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count];
+  content_encoding_entries_ = new (std::nothrow) ContentEncoding* [count];
   if (!content_encoding_entries_)
     return -1;
 
@@ -6062,24 +5300,18 @@
   pos = start;
   while (pos < stop) {
     long long id, size;
-    long status = ParseElementHeader(pReader,
-                                     pos,
-                                     stop,
-                                     id,
-                                     size);
-    if (status < 0)  //error
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
       return status;
 
-    //pos now designates start of element
-    if (id == 0x2240) { // ContentEncoding ID
+    // pos now designates start of element
+    if (id == 0x2240) {  // ContentEncoding ID
       ContentEncoding* const content_encoding =
           new (std::nothrow) ContentEncoding();
       if (!content_encoding)
         return -1;
 
-      status = content_encoding->ParseContentEncodingEntry(pos,
-                                                           size,
-                                                           pReader);
+      status = content_encoding->ParseContentEncodingEntry(pos, size, pReader);
       if (status) {
         delete content_encoding;
         return status;
@@ -6088,7 +5320,7 @@
       *content_encoding_entries_end_++ = content_encoding;
     }
 
-    pos += size;  //consume payload
+    pos += size;  // consume payload
     assert(pos <= stop);
   }
 
@@ -6097,219 +5329,175 @@
   return 0;
 }
 
-Track::EOSBlock::EOSBlock() :
-    BlockEntry(NULL, LONG_MIN)
-{
-}
+Track::EOSBlock::EOSBlock() : BlockEntry(NULL, LONG_MIN) {}
 
-BlockEntry::Kind Track::EOSBlock::GetKind() const
-{
-    return kBlockEOS;
-}
+BlockEntry::Kind Track::EOSBlock::GetKind() const { return kBlockEOS; }
 
+const Block* Track::EOSBlock::GetBlock() const { return NULL; }
 
-const Block* Track::EOSBlock::GetBlock() const
-{
-    return NULL;
-}
+VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
+                       long long element_size)
+    : Track(pSegment, element_start, element_size) {}
 
+long VideoTrack::Parse(Segment* pSegment, const Info& info,
+                       long long element_start, long long element_size,
+                       VideoTrack*& pResult) {
+  if (pResult)
+    return -1;
 
-VideoTrack::VideoTrack(
-    Segment* pSegment,
-    long long element_start,
-    long long element_size) :
-    Track(pSegment, element_start, element_size)
-{
-}
+  if (info.type != Track::kVideo)
+    return -1;
 
+  long long width = 0;
+  long long height = 0;
+  double rate = 0.0;
 
-long VideoTrack::Parse(
-    Segment* pSegment,
-    const Info& info,
-    long long element_start,
-    long long element_size,
-    VideoTrack*& pResult)
-{
-    if (pResult)
-        return -1;
+  IMkvReader* const pReader = pSegment->m_pReader;
 
-    if (info.type != Track::kVideo)
-        return -1;
+  const Settings& s = info.settings;
+  assert(s.start >= 0);
+  assert(s.size >= 0);
 
-    long long width = 0;
-    long long height = 0;
-    double rate = 0.0;
+  long long pos = s.start;
+  assert(pos >= 0);
 
-    IMkvReader* const pReader = pSegment->m_pReader;
+  const long long stop = pos + s.size;
 
-    const Settings& s = info.settings;
-    assert(s.start >= 0);
-    assert(s.size >= 0);
+  while (pos < stop) {
+    long long id, size;
 
-    long long pos = s.start;
-    assert(pos >= 0);
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
 
-    const long long stop = pos + s.size;
+    if (status < 0)  // error
+      return status;
 
-    while (pos < stop)
-    {
-        long long id, size;
+    if (id == 0x30) {  // pixel width
+      width = UnserializeUInt(pReader, pos, size);
 
-        const long status = ParseElementHeader(
-                                pReader,
-                                pos,
-                                stop,
-                                id,
-                                size);
+      if (width <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x3A) {  // pixel height
+      height = UnserializeUInt(pReader, pos, size);
 
-        if (status < 0)  //error
-            return status;
+      if (height <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x0383E3) {  // frame rate
+      const long status = UnserializeFloat(pReader, pos, size, rate);
 
-        if (id == 0x30)  //pixel width
-        {
-            width = UnserializeUInt(pReader, pos, size);
-
-            if (width <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x3A)  //pixel height
-        {
-            height = UnserializeUInt(pReader, pos, size);
-
-            if (height <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x0383E3)  //frame rate
-        {
-            const long status = UnserializeFloat(
-                                    pReader,
-                                    pos,
-                                    size,
-                                    rate);
-
-            if (status < 0)
-                return status;
-
-            if (rate <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-
-        pos += size;  //consume payload
-        assert(pos <= stop);
-    }
-
-    assert(pos == stop);
-
-    VideoTrack* const pTrack = new (std::nothrow) VideoTrack(pSegment,
-                                                             element_start,
-                                                             element_size);
-
-    if (pTrack == NULL)
-        return -1;  //generic error
-
-    const int status = info.Copy(pTrack->m_info);
-
-    if (status)  // error
-    {
-        delete pTrack;
-        return status;
-    }
-
-    pTrack->m_width = width;
-    pTrack->m_height = height;
-    pTrack->m_rate = rate;
-
-    pResult = pTrack;
-    return 0;  //success
-}
-
-
-bool VideoTrack::VetEntry(const BlockEntry* pBlockEntry) const
-{
-    return Track::VetEntry(pBlockEntry) && pBlockEntry->GetBlock()->IsKey();
-}
-
-long VideoTrack::Seek(
-    long long time_ns,
-    const BlockEntry*& pResult) const
-{
-    const long status = GetFirst(pResult);
-
-    if (status < 0)  //buffer underflow, etc
+      if (status < 0)
         return status;
 
-    assert(pResult);
+      if (rate <= 0)
+        return E_FILE_FORMAT_INVALID;
+    }
 
-    if (pResult->EOS())
-        return 0;
+    pos += size;  // consume payload
+    assert(pos <= stop);
+  }
 
-    const Cluster* pCluster = pResult->GetCluster();
+  assert(pos == stop);
+
+  VideoTrack* const pTrack =
+      new (std::nothrow) VideoTrack(pSegment, element_start, element_size);
+
+  if (pTrack == NULL)
+    return -1;  // generic error
+
+  const int status = info.Copy(pTrack->m_info);
+
+  if (status) {  // error
+    delete pTrack;
+    return status;
+  }
+
+  pTrack->m_width = width;
+  pTrack->m_height = height;
+  pTrack->m_rate = rate;
+
+  pResult = pTrack;
+  return 0;  // success
+}
+
+bool VideoTrack::VetEntry(const BlockEntry* pBlockEntry) const {
+  return Track::VetEntry(pBlockEntry) && pBlockEntry->GetBlock()->IsKey();
+}
+
+long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const {
+  const long status = GetFirst(pResult);
+
+  if (status < 0)  // buffer underflow, etc
+    return status;
+
+  assert(pResult);
+
+  if (pResult->EOS())
+    return 0;
+
+  const Cluster* pCluster = pResult->GetCluster();
+  assert(pCluster);
+  assert(pCluster->GetIndex() >= 0);
+
+  if (time_ns <= pResult->GetBlock()->GetTime(pCluster))
+    return 0;
+
+  Cluster** const clusters = m_pSegment->m_clusters;
+  assert(clusters);
+
+  const long count = m_pSegment->GetCount();  // loaded only, not pre-loaded
+  assert(count > 0);
+
+  Cluster** const i = clusters + pCluster->GetIndex();
+  assert(i);
+  assert(*i == pCluster);
+  assert(pCluster->GetTime() <= time_ns);
+
+  Cluster** const j = clusters + count;
+
+  Cluster** lo = i;
+  Cluster** hi = j;
+
+  while (lo < hi) {
+    // INVARIANT:
+    //[i, lo) <= time_ns
+    //[lo, hi) ?
+    //[hi, j)  > time_ns
+
+    Cluster** const mid = lo + (hi - lo) / 2;
+    assert(mid < hi);
+
+    pCluster = *mid;
     assert(pCluster);
     assert(pCluster->GetIndex() >= 0);
+    assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters));
 
-    if (time_ns <= pResult->GetBlock()->GetTime(pCluster))
-        return 0;
+    const long long t = pCluster->GetTime();
 
-    Cluster** const clusters = m_pSegment->m_clusters;
-    assert(clusters);
+    if (t <= time_ns)
+      lo = mid + 1;
+    else
+      hi = mid;
 
-    const long count = m_pSegment->GetCount();  //loaded only, not pre-loaded
-    assert(count > 0);
+    assert(lo <= hi);
+  }
 
-    Cluster** const i = clusters + pCluster->GetIndex();
-    assert(i);
-    assert(*i == pCluster);
-    assert(pCluster->GetTime() <= time_ns);
+  assert(lo == hi);
+  assert(lo > i);
+  assert(lo <= j);
 
-    Cluster** const j = clusters + count;
+  pCluster = *--lo;
+  assert(pCluster);
+  assert(pCluster->GetTime() <= time_ns);
 
-    Cluster** lo = i;
-    Cluster** hi = j;
+  pResult = pCluster->GetEntry(this, time_ns);
 
-    while (lo < hi)
-    {
-        //INVARIANT:
-        //[i, lo) <= time_ns
-        //[lo, hi) ?
-        //[hi, j)  > time_ns
+  if ((pResult != 0) && !pResult->EOS())  // found a keyframe
+    return 0;
 
-        Cluster** const mid = lo + (hi - lo) / 2;
-        assert(mid < hi);
-
-        pCluster = *mid;
-        assert(pCluster);
-        assert(pCluster->GetIndex() >= 0);
-        assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters));
-
-        const long long t = pCluster->GetTime();
-
-        if (t <= time_ns)
-            lo = mid + 1;
-        else
-            hi = mid;
-
-        assert(lo <= hi);
-    }
-
-    assert(lo == hi);
-    assert(lo > i);
-    assert(lo <= j);
-
+  while (lo != i) {
     pCluster = *--lo;
     assert(pCluster);
     assert(pCluster->GetTime() <= time_ns);
 
-    pResult = pCluster->GetEntry(this, time_ns);
-
-    if ((pResult != 0) && !pResult->EOS())  //found a keyframe
-        return 0;
-
-    while (lo != i)
-    {
-        pCluster = *--lo;
-        assert(pCluster);
-        assert(pCluster->GetTime() <= time_ns);
-
 #if 0
         //TODO:
         //We need to handle the case when a cluster
@@ -6318,651 +5506,501 @@
         //good enough.
         pResult = pCluster->GetMaxKey(this);
 #else
-        pResult = pCluster->GetEntry(this, time_ns);
+    pResult = pCluster->GetEntry(this, time_ns);
 #endif
 
-        if ((pResult != 0) && !pResult->EOS())
-            return 0;
+    if ((pResult != 0) && !pResult->EOS())
+      return 0;
+  }
+
+  // weird: we're on the first cluster, but no keyframe found
+  // should never happen but we must return something anyway
+
+  pResult = GetEOS();
+  return 0;
+}
+
+long long VideoTrack::GetWidth() const { return m_width; }
+
+long long VideoTrack::GetHeight() const { return m_height; }
+
+double VideoTrack::GetFrameRate() const { return m_rate; }
+
+AudioTrack::AudioTrack(Segment* pSegment, long long element_start,
+                       long long element_size)
+    : Track(pSegment, element_start, element_size) {}
+
+long AudioTrack::Parse(Segment* pSegment, const Info& info,
+                       long long element_start, long long element_size,
+                       AudioTrack*& pResult) {
+  if (pResult)
+    return -1;
+
+  if (info.type != Track::kAudio)
+    return -1;
+
+  IMkvReader* const pReader = pSegment->m_pReader;
+
+  const Settings& s = info.settings;
+  assert(s.start >= 0);
+  assert(s.size >= 0);
+
+  long long pos = s.start;
+  assert(pos >= 0);
+
+  const long long stop = pos + s.size;
+
+  double rate = 8000.0;  // MKV default
+  long long channels = 1;
+  long long bit_depth = 0;
+
+  while (pos < stop) {
+    long long id, size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == 0x35) {  // Sample Rate
+      status = UnserializeFloat(pReader, pos, size, rate);
+
+      if (status < 0)
+        return status;
+
+      if (rate <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x1F) {  // Channel Count
+      channels = UnserializeUInt(pReader, pos, size);
+
+      if (channels <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x2264) {  // Bit Depth
+      bit_depth = UnserializeUInt(pReader, pos, size);
+
+      if (bit_depth <= 0)
+        return E_FILE_FORMAT_INVALID;
     }
 
-    //weird: we're on the first cluster, but no keyframe found
-    //should never happen but we must return something anyway
+    pos += size;  // consume payload
+    assert(pos <= stop);
+  }
 
-    pResult = GetEOS();
-    return 0;
+  assert(pos == stop);
+
+  AudioTrack* const pTrack =
+      new (std::nothrow) AudioTrack(pSegment, element_start, element_size);
+
+  if (pTrack == NULL)
+    return -1;  // generic error
+
+  const int status = info.Copy(pTrack->m_info);
+
+  if (status) {
+    delete pTrack;
+    return status;
+  }
+
+  pTrack->m_rate = rate;
+  pTrack->m_channels = channels;
+  pTrack->m_bitDepth = bit_depth;
+
+  pResult = pTrack;
+  return 0;  // success
 }
 
+double AudioTrack::GetSamplingRate() const { return m_rate; }
 
-long long VideoTrack::GetWidth() const
-{
-    return m_width;
-}
+long long AudioTrack::GetChannels() const { return m_channels; }
 
+long long AudioTrack::GetBitDepth() const { return m_bitDepth; }
 
-long long VideoTrack::GetHeight() const
-{
-    return m_height;
-}
+Tracks::Tracks(Segment* pSegment, long long start, long long size_,
+               long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_trackEntries(NULL),
+      m_trackEntriesEnd(NULL) {}
 
+long Tracks::Parse() {
+  assert(m_trackEntries == NULL);
+  assert(m_trackEntriesEnd == NULL);
 
-double VideoTrack::GetFrameRate() const
-{
-    return m_rate;
-}
+  const long long stop = m_start + m_size;
+  IMkvReader* const pReader = m_pSegment->m_pReader;
 
+  int count = 0;
+  long long pos = m_start;
 
-AudioTrack::AudioTrack(
-    Segment* pSegment,
-    long long element_start,
-    long long element_size) :
-    Track(pSegment, element_start, element_size)
-{
-}
+  while (pos < stop) {
+    long long id, size;
 
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
 
-long AudioTrack::Parse(
-    Segment* pSegment,
-    const Info& info,
-    long long element_start,
-    long long element_size,
-    AudioTrack*& pResult)
-{
-    if (pResult)
-        return -1;
+    if (status < 0)  // error
+      return status;
 
-    if (info.type != Track::kAudio)
-        return -1;
+    if (size == 0)  // weird
+      continue;
 
-    IMkvReader* const pReader = pSegment->m_pReader;
+    if (id == 0x2E)  // TrackEntry ID
+      ++count;
 
-    const Settings& s = info.settings;
-    assert(s.start >= 0);
-    assert(s.size >= 0);
+    pos += size;  // consume payload
+    assert(pos <= stop);
+  }
 
-    long long pos = s.start;
-    assert(pos >= 0);
+  assert(pos == stop);
 
-    const long long stop = pos + s.size;
+  if (count <= 0)
+    return 0;  // success
 
-    double rate = 8000.0;  // MKV default
-    long long channels = 1;
-    long long bit_depth = 0;
+  m_trackEntries = new (std::nothrow) Track* [count];
 
-    while (pos < stop)
-    {
-        long long id, size;
+  if (m_trackEntries == NULL)
+    return -1;
 
-        long status = ParseElementHeader(
-                                pReader,
-                                pos,
-                                stop,
-                                id,
-                                size);
+  m_trackEntriesEnd = m_trackEntries;
 
-        if (status < 0)  //error
-            return status;
+  pos = m_start;
 
-        if (id == 0x35)  //Sample Rate
-        {
-            status = UnserializeFloat(pReader, pos, size, rate);
+  while (pos < stop) {
+    const long long element_start = pos;
 
-            if (status < 0)
-                return status;
+    long long id, payload_size;
 
-            if (rate <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x1F)  //Channel Count
-        {
-            channels = UnserializeUInt(pReader, pos, size);
+    const long status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
 
-            if (channels <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x2264)  //Bit Depth
-        {
-            bit_depth = UnserializeUInt(pReader, pos, size);
+    if (status < 0)  // error
+      return status;
 
-            if (bit_depth <= 0)
-                return E_FILE_FORMAT_INVALID;
-        }
+    if (payload_size == 0)  // weird
+      continue;
 
-        pos += size;  //consume payload
-        assert(pos <= stop);
+    const long long payload_stop = pos + payload_size;
+    assert(payload_stop <= stop);  // checked in ParseElement
+
+    const long long element_size = payload_stop - element_start;
+
+    if (id == 0x2E) {  // TrackEntry ID
+      Track*& pTrack = *m_trackEntriesEnd;
+      pTrack = NULL;
+
+      const long status = ParseTrackEntry(pos, payload_size, element_start,
+                                          element_size, pTrack);
+
+      if (status)
+        return status;
+
+      if (pTrack)
+        ++m_trackEntriesEnd;
     }
 
-    assert(pos == stop);
+    pos = payload_stop;
+    assert(pos <= stop);
+  }
 
-    AudioTrack* const pTrack = new (std::nothrow) AudioTrack(pSegment,
-                                                             element_start,
-                                                             element_size);
+  assert(pos == stop);
 
-    if (pTrack == NULL)
-        return -1;  //generic error
+  return 0;  // success
+}
 
-    const int status = info.Copy(pTrack->m_info);
+unsigned long Tracks::GetTracksCount() const {
+  const ptrdiff_t result = m_trackEntriesEnd - m_trackEntries;
+  assert(result >= 0);
+
+  return static_cast<unsigned long>(result);
+}
+
+long Tracks::ParseTrackEntry(long long track_start, long long track_size,
+                             long long element_start, long long element_size,
+                             Track*& pResult) const {
+  if (pResult)
+    return -1;
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = track_start;
+  const long long track_stop = track_start + track_size;
+
+  Track::Info info;
+
+  info.type = 0;
+  info.number = 0;
+  info.uid = 0;
+  info.defaultDuration = 0;
+
+  Track::Settings v;
+  v.start = -1;
+  v.size = -1;
+
+  Track::Settings a;
+  a.start = -1;
+  a.size = -1;
+
+  Track::Settings e;  // content_encodings_settings;
+  e.start = -1;
+  e.size = -1;
+
+  long long lacing = 1;  // default is true
+
+  while (pos < track_stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, track_stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long start = pos;
+
+    if (id == 0x60) {  // VideoSettings ID
+      v.start = start;
+      v.size = size;
+    } else if (id == 0x61) {  // AudioSettings ID
+      a.start = start;
+      a.size = size;
+    } else if (id == 0x2D80) {  // ContentEncodings ID
+      e.start = start;
+      e.size = size;
+    } else if (id == 0x33C5) {  // Track UID
+      if (size > 8)
+        return E_FILE_FORMAT_INVALID;
+
+      info.uid = 0;
+
+      long long pos_ = start;
+      const long long pos_end = start + size;
+
+      while (pos_ != pos_end) {
+        unsigned char b;
+
+        const int status = pReader->Read(pos_, 1, &b);
+
+        if (status)
+          return status;
+
+        info.uid <<= 8;
+        info.uid |= b;
+
+        ++pos_;
+      }
+    } else if (id == 0x57) {  // Track Number
+      const long long num = UnserializeUInt(pReader, pos, size);
+
+      if ((num <= 0) || (num > 127))
+        return E_FILE_FORMAT_INVALID;
+
+      info.number = static_cast<long>(num);
+    } else if (id == 0x03) {  // Track Type
+      const long long type = UnserializeUInt(pReader, pos, size);
+
+      if ((type <= 0) || (type > 254))
+        return E_FILE_FORMAT_INVALID;
+
+      info.type = static_cast<long>(type);
+    } else if (id == 0x136E) {  // Track Name
+      const long status =
+          UnserializeString(pReader, pos, size, info.nameAsUTF8);
+
+      if (status)
+        return status;
+    } else if (id == 0x02B59C) {  // Track Language
+      const long status = UnserializeString(pReader, pos, size, info.language);
+
+      if (status)
+        return status;
+    } else if (id == 0x03E383) {  // Default Duration
+      const long long duration = UnserializeUInt(pReader, pos, size);
+
+      if (duration < 0)
+        return E_FILE_FORMAT_INVALID;
+
+      info.defaultDuration = static_cast<unsigned long long>(duration);
+    } else if (id == 0x06) {  // CodecID
+      const long status = UnserializeString(pReader, pos, size, info.codecId);
+
+      if (status)
+        return status;
+    } else if (id == 0x1C) {  // lacing
+      lacing = UnserializeUInt(pReader, pos, size);
+
+      if ((lacing < 0) || (lacing > 1))
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == 0x23A2) {  // Codec Private
+      delete[] info.codecPrivate;
+      info.codecPrivate = NULL;
+      info.codecPrivateSize = 0;
+
+      const size_t buflen = static_cast<size_t>(size);
+
+      if (buflen) {
+        typedef unsigned char* buf_t;
+
+        const buf_t buf = new (std::nothrow) unsigned char[buflen];
+
+        if (buf == NULL)
+          return -1;
+
+        const int status = pReader->Read(pos, static_cast<long>(buflen), buf);
+
+        if (status) {
+          delete[] buf;
+          return status;
+        }
+
+        info.codecPrivate = buf;
+        info.codecPrivateSize = buflen;
+      }
+    } else if (id == 0x058688) {  // Codec Name
+      const long status =
+          UnserializeString(pReader, pos, size, info.codecNameAsUTF8);
+
+      if (status)
+        return status;
+    } else if (id == 0x16AA) {  // Codec Delay
+      info.codecDelay = UnserializeUInt(pReader, pos, size);
+    } else if (id == 0x16BB) {  // Seek Pre Roll
+      info.seekPreRoll = UnserializeUInt(pReader, pos, size);
+    }
+
+    pos += size;  // consume payload
+    assert(pos <= track_stop);
+  }
+
+  assert(pos == track_stop);
+
+  if (info.number <= 0)  // not specified
+    return E_FILE_FORMAT_INVALID;
+
+  if (GetTrackByNumber(info.number))
+    return E_FILE_FORMAT_INVALID;
+
+  if (info.type <= 0)  // not specified
+    return E_FILE_FORMAT_INVALID;
+
+  info.lacing = (lacing > 0) ? true : false;
+
+  if (info.type == Track::kVideo) {
+    if (v.start < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (a.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings = v;
+
+    VideoTrack* pTrack = NULL;
+
+    const long status = VideoTrack::Parse(m_pSegment, info, element_start,
+                                          element_size, pTrack);
 
     if (status)
-    {
-        delete pTrack;
-        return status;
-    }
-
-    pTrack->m_rate = rate;
-    pTrack->m_channels = channels;
-    pTrack->m_bitDepth = bit_depth;
+      return status;
 
     pResult = pTrack;
-    return 0;  //success
+    assert(pResult);
+
+    if (e.start >= 0)
+      pResult->ParseContentEncodingsEntry(e.start, e.size);
+  } else if (info.type == Track::kAudio) {
+    if (a.start < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (v.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings = a;
+
+    AudioTrack* pTrack = NULL;
+
+    const long status = AudioTrack::Parse(m_pSegment, info, element_start,
+                                          element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+
+    if (e.start >= 0)
+      pResult->ParseContentEncodingsEntry(e.start, e.size);
+  } else {
+    // neither video nor audio - probably metadata or subtitles
+
+    if (a.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (v.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (e.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings.start = -1;
+    info.settings.size = 0;
+
+    Track* pTrack = NULL;
+
+    const long status =
+        Track::Create(m_pSegment, info, element_start, element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+  }
+
+  return 0;  // success
 }
 
+Tracks::~Tracks() {
+  Track** i = m_trackEntries;
+  Track** const j = m_trackEntriesEnd;
 
-double AudioTrack::GetSamplingRate() const
-{
-    return m_rate;
+  while (i != j) {
+    Track* const pTrack = *i++;
+    delete pTrack;
+  }
+
+  delete[] m_trackEntries;
 }
 
+const Track* Tracks::GetTrackByNumber(long tn) const {
+  if (tn < 0)
+    return NULL;
 
-long long AudioTrack::GetChannels() const
-{
-    return m_channels;
+  Track** i = m_trackEntries;
+  Track** const j = m_trackEntriesEnd;
+
+  while (i != j) {
+    Track* const pTrack = *i++;
+
+    if (pTrack == NULL)
+      continue;
+
+    if (tn == pTrack->GetNumber())
+      return pTrack;
+  }
+
+  return NULL;  // not found
 }
 
-long long AudioTrack::GetBitDepth() const
-{
-    return m_bitDepth;
-}
+const Track* Tracks::GetTrackByIndex(unsigned long idx) const {
+  const ptrdiff_t count = m_trackEntriesEnd - m_trackEntries;
 
-Tracks::Tracks(
-    Segment* pSegment,
-    long long start,
-    long long size_,
-    long long element_start,
-    long long element_size) :
-    m_pSegment(pSegment),
-    m_start(start),
-    m_size(size_),
-    m_element_start(element_start),
-    m_element_size(element_size),
-    m_trackEntries(NULL),
-    m_trackEntriesEnd(NULL)
-{
-}
+  if (idx >= static_cast<unsigned long>(count))
+    return NULL;
 
-
-long Tracks::Parse()
-{
-    assert(m_trackEntries == NULL);
-    assert(m_trackEntriesEnd == NULL);
-
-    const long long stop = m_start + m_size;
-    IMkvReader* const pReader = m_pSegment->m_pReader;
-
-    int count = 0;
-    long long pos = m_start;
-
-    while (pos < stop)
-    {
-        long long id, size;
-
-        const long status = ParseElementHeader(
-                                pReader,
-                                pos,
-                                stop,
-                                id,
-                                size);
-
-        if (status < 0)  //error
-            return status;
-
-        if (size == 0)  //weird
-            continue;
-
-        if (id == 0x2E)  //TrackEntry ID
-            ++count;
-
-        pos += size;  //consume payload
-        assert(pos <= stop);
-    }
-
-    assert(pos == stop);
-
-    if (count <= 0)
-        return 0;  //success
-
-    m_trackEntries = new (std::nothrow) Track*[count];
-
-    if (m_trackEntries == NULL)
-        return -1;
-
-    m_trackEntriesEnd = m_trackEntries;
-
-    pos = m_start;
-
-    while (pos < stop)
-    {
-        const long long element_start = pos;
-
-        long long id, payload_size;
-
-        const long status = ParseElementHeader(
-                                pReader,
-                                pos,
-                                stop,
-                                id,
-                                payload_size);
-
-        if (status < 0)  //error
-            return status;
-
-        if (payload_size == 0)  //weird
-            continue;
-
-        const long long payload_stop = pos + payload_size;
-        assert(payload_stop <= stop);  //checked in ParseElement
-
-        const long long element_size = payload_stop - element_start;
-
-        if (id == 0x2E)  //TrackEntry ID
-        {
-            Track*& pTrack = *m_trackEntriesEnd;
-            pTrack = NULL;
-
-            const long status = ParseTrackEntry(
-                                    pos,
-                                    payload_size,
-                                    element_start,
-                                    element_size,
-                                    pTrack);
-
-            if (status)
-                return status;
-
-            if (pTrack)
-                ++m_trackEntriesEnd;
-        }
-
-        pos = payload_stop;
-        assert(pos <= stop);
-    }
-
-    assert(pos == stop);
-
-    return 0;  //success
-}
-
-
-unsigned long Tracks::GetTracksCount() const
-{
-    const ptrdiff_t result = m_trackEntriesEnd - m_trackEntries;
-    assert(result >= 0);
-
-    return static_cast<unsigned long>(result);
-}
-
-long Tracks::ParseTrackEntry(
-    long long track_start,
-    long long track_size,
-    long long element_start,
-    long long element_size,
-    Track*& pResult) const
-{
-    if (pResult)
-        return -1;
-
-    IMkvReader* const pReader = m_pSegment->m_pReader;
-
-    long long pos = track_start;
-    const long long track_stop = track_start + track_size;
-
-    Track::Info info;
-
-    info.type = 0;
-    info.number = 0;
-    info.uid = 0;
-    info.defaultDuration = 0;
-
-    Track::Settings v;
-    v.start = -1;
-    v.size = -1;
-
-    Track::Settings a;
-    a.start = -1;
-    a.size = -1;
-
-    Track::Settings e;  //content_encodings_settings;
-    e.start = -1;
-    e.size = -1;
-
-    long long lacing = 1;  //default is true
-
-    while (pos < track_stop)
-    {
-        long long id, size;
-
-        const long status = ParseElementHeader(
-                                pReader,
-                                pos,
-                                track_stop,
-                                id,
-                                size);
-
-        if (status < 0)  //error
-            return status;
-
-        if (size < 0)
-            return E_FILE_FORMAT_INVALID;
-
-        const long long start = pos;
-
-        if (id == 0x60)  // VideoSettings ID
-        {
-            v.start = start;
-            v.size = size;
-        }
-        else if (id == 0x61)  // AudioSettings ID
-        {
-            a.start = start;
-            a.size = size;
-        }
-        else if (id == 0x2D80) // ContentEncodings ID
-        {
-            e.start = start;
-            e.size = size;
-        }
-        else if (id == 0x33C5)  //Track UID
-        {
-            if (size > 8)
-                return E_FILE_FORMAT_INVALID;
-
-            info.uid = 0;
-
-            long long pos_ = start;
-            const long long pos_end = start + size;
-
-            while (pos_ != pos_end)
-            {
-                unsigned char b;
-
-                const int status = pReader->Read(pos_, 1, &b);
-
-                if (status)
-                    return status;
-
-                info.uid <<= 8;
-                info.uid |= b;
-
-                ++pos_;
-            }
-        }
-        else if (id == 0x57)  //Track Number
-        {
-            const long long num = UnserializeUInt(pReader, pos, size);
-
-            if ((num <= 0) || (num > 127))
-                return E_FILE_FORMAT_INVALID;
-
-            info.number = static_cast<long>(num);
-        }
-        else if (id == 0x03)  //Track Type
-        {
-            const long long type = UnserializeUInt(pReader, pos, size);
-
-            if ((type <= 0) || (type > 254))
-                return E_FILE_FORMAT_INVALID;
-
-            info.type = static_cast<long>(type);
-        }
-        else if (id == 0x136E)  //Track Name
-        {
-            const long status = UnserializeString(
-                                    pReader,
-                                    pos,
-                                    size,
-                                    info.nameAsUTF8);
-
-            if (status)
-                return status;
-        }
-        else if (id == 0x02B59C)  //Track Language
-        {
-            const long status = UnserializeString(
-                                    pReader,
-                                    pos,
-                                    size,
-                                    info.language);
-
-            if (status)
-                return status;
-        }
-        else if (id == 0x03E383)  //Default Duration
-        {
-            const long long duration = UnserializeUInt(pReader, pos, size);
-
-            if (duration < 0)
-                return E_FILE_FORMAT_INVALID;
-
-            info.defaultDuration = static_cast<unsigned long long>(duration);
-        }
-        else if (id == 0x06)  //CodecID
-        {
-            const long status = UnserializeString(
-                                    pReader,
-                                    pos,
-                                    size,
-                                    info.codecId);
-
-            if (status)
-                return status;
-        }
-        else if (id == 0x1C)  //lacing
-        {
-            lacing = UnserializeUInt(pReader, pos, size);
-
-            if ((lacing < 0) || (lacing > 1))
-                return E_FILE_FORMAT_INVALID;
-        }
-        else if (id == 0x23A2)  //Codec Private
-        {
-            delete[] info.codecPrivate;
-            info.codecPrivate = NULL;
-            info.codecPrivateSize = 0;
-
-            const size_t buflen = static_cast<size_t>(size);
-
-            if (buflen)
-            {
-                typedef unsigned char* buf_t;
-
-                const buf_t buf = new (std::nothrow) unsigned char[buflen];
-
-                if (buf == NULL)
-                    return -1;
-
-                const int status = pReader->Read(pos, buflen, buf);
-
-                if (status)
-                {
-                    delete[] buf;
-                    return status;
-                }
-
-                info.codecPrivate = buf;
-                info.codecPrivateSize = buflen;
-            }
-        }
-        else if (id == 0x058688)  //Codec Name
-        {
-            const long status = UnserializeString(
-                                    pReader,
-                                    pos,
-                                    size,
-                                    info.codecNameAsUTF8);
-
-            if (status)
-                return status;
-        }
-        else if (id == 0x16AA)  //Codec Delay
-        {
-            info.codecDelay = UnserializeUInt(pReader, pos, size);
-
-        }
-        else if (id == 0x16BB) //Seek Pre Roll
-        {
-            info.seekPreRoll = UnserializeUInt(pReader, pos, size);
-        }
-
-        pos += size;  //consume payload
-        assert(pos <= track_stop);
-    }
-
-    assert(pos == track_stop);
-
-    if (info.number <= 0)  //not specified
-        return E_FILE_FORMAT_INVALID;
-
-    if (GetTrackByNumber(info.number))
-        return E_FILE_FORMAT_INVALID;
-
-    if (info.type <= 0)  //not specified
-        return E_FILE_FORMAT_INVALID;
-
-    info.lacing = (lacing > 0) ? true : false;
-
-    if (info.type == Track::kVideo)
-    {
-        if (v.start < 0)
-            return E_FILE_FORMAT_INVALID;
-
-        if (a.start >= 0)
-            return E_FILE_FORMAT_INVALID;
-
-        info.settings = v;
-
-        VideoTrack* pTrack = NULL;
-
-        const long status = VideoTrack::Parse(m_pSegment,
-                                              info,
-                                              element_start,
-                                              element_size,
-                                              pTrack);
-
-        if (status)
-            return status;
-
-        pResult = pTrack;
-        assert(pResult);
-
-        if (e.start >= 0)
-            pResult->ParseContentEncodingsEntry(e.start, e.size);
-    }
-    else if (info.type == Track::kAudio)
-    {
-        if (a.start < 0)
-            return E_FILE_FORMAT_INVALID;
-
-        if (v.start >= 0)
-            return E_FILE_FORMAT_INVALID;
-
-        info.settings = a;
-
-        AudioTrack* pTrack = NULL;
-
-        const long status = AudioTrack::Parse(m_pSegment,
-                                              info,
-                                              element_start,
-                                              element_size,
-                                              pTrack);
-
-        if (status)
-            return status;
-
-        pResult = pTrack;
-        assert(pResult);
-
-        if (e.start >= 0)
-            pResult->ParseContentEncodingsEntry(e.start, e.size);
-    }
-    else
-    {
-        // neither video nor audio - probably metadata or subtitles
-
-        if (a.start >= 0)
-            return E_FILE_FORMAT_INVALID;
-
-        if (v.start >= 0)
-            return E_FILE_FORMAT_INVALID;
-
-        if (e.start >= 0)
-            return E_FILE_FORMAT_INVALID;
-
-        info.settings.start = -1;
-        info.settings.size = 0;
-
-        Track* pTrack = NULL;
-
-        const long status = Track::Create(m_pSegment,
-                                          info,
-                                          element_start,
-                                          element_size,
-                                          pTrack);
-
-        if (status)
-            return status;
-
-        pResult = pTrack;
-        assert(pResult);
-    }
-
-    return 0;  //success
-}
-
-
-Tracks::~Tracks()
-{
-    Track** i = m_trackEntries;
-    Track** const j = m_trackEntriesEnd;
-
-    while (i != j)
-    {
-        Track* const pTrack = *i++;
-        delete pTrack;
-    }
-
-    delete[] m_trackEntries;
-}
-
-const Track* Tracks::GetTrackByNumber(long tn) const
-{
-    if (tn < 0)
-        return NULL;
-
-    Track** i = m_trackEntries;
-    Track** const j = m_trackEntriesEnd;
-
-    while (i != j)
-    {
-        Track* const pTrack = *i++;
-
-        if (pTrack == NULL)
-            continue;
-
-        if (tn == pTrack->GetNumber())
-            return pTrack;
-    }
-
-    return NULL;  //not found
-}
-
-
-const Track* Tracks::GetTrackByIndex(unsigned long idx) const
-{
-    const ptrdiff_t count = m_trackEntriesEnd - m_trackEntries;
-
-    if (idx >= static_cast<unsigned long>(count))
-         return NULL;
-
-    return m_trackEntries[idx];
+  return m_trackEntries[idx];
 }
 
 #if 0
@@ -6984,104 +6022,100 @@
 }
 #endif
 
+long Cluster::Load(long long& pos, long& len) const {
+  assert(m_pSegment);
+  assert(m_pos >= m_element_start);
 
-long Cluster::Load(long long& pos, long& len) const
-{
-    assert(m_pSegment);
-    assert(m_pos >= m_element_start);
+  if (m_timecode >= 0)  // at least partially loaded
+    return 0;
 
-    if (m_timecode >= 0)  //at least partially loaded
-        return 0;
+  assert(m_pos == m_element_start);
+  assert(m_element_size < 0);
 
-    assert(m_pos == m_element_start);
-    assert(m_element_size < 0);
+  IMkvReader* const pReader = m_pSegment->m_pReader;
 
-    IMkvReader* const pReader = m_pSegment->m_pReader;
+  long long total, avail;
 
-    long long total, avail;
+  const int status = pReader->Length(&total, &avail);
 
-    const int status = pReader->Length(&total, &avail);
+  if (status < 0)  // error
+    return status;
 
-    if (status < 0)  //error
-        return status;
+  assert((total < 0) || (avail <= total));
+  assert((total < 0) || (m_pos <= total));  // TODO: verify this
 
-    assert((total < 0) || (avail <= total));
-    assert((total < 0) || (m_pos <= total));  //TODO: verify this
+  pos = m_pos;
 
-    pos = m_pos;
+  long long cluster_size = -1;
 
-    long long cluster_size = -1;
-
-    {
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        long long result = GetUIntLength(pReader, pos, len);
-
-        if (result < 0)  //error or underflow
-            return static_cast<long>(result);
-
-        if (result > 0)  //underflow (weird)
-            return E_BUFFER_NOT_FULL;
-
-        //if ((pos + len) > segment_stop)
-        //    return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long id_ = ReadUInt(pReader, pos, len);
-
-        if (id_ < 0)  //error
-            return static_cast<long>(id_);
-
-        if (id_ != 0x0F43B675)  //Cluster ID
-            return E_FILE_FORMAT_INVALID;
-
-        pos += len;  //consume id
-
-        //read cluster size
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        result = GetUIntLength(pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
-
-        //if ((pos + len) > segment_stop)
-        //    return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long size = ReadUInt(pReader, pos, len);
-
-        if (size < 0)  //error
-            return static_cast<long>(cluster_size);
-
-        if (size == 0)
-            return E_FILE_FORMAT_INVALID;  //TODO: verify this
-
-        pos += len;  //consume length of size of element
-
-        const long long unknown_size = (1LL << (7 * len)) - 1;
-
-        if (size != unknown_size)
-            cluster_size = size;
+  {
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
     }
 
-    //pos points to start of payload
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error or underflow
+      return static_cast<long>(result);
+
+    if (result > 0)  // underflow (weird)
+      return E_BUFFER_NOT_FULL;
+
+    // if ((pos + len) > segment_stop)
+    //    return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id_ = ReadUInt(pReader, pos, len);
+
+    if (id_ < 0)  // error
+      return static_cast<long>(id_);
+
+    if (id_ != 0x0F43B675)  // Cluster ID
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume id
+
+    // read cluster size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    // if ((pos + len) > segment_stop)
+    //    return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(cluster_size);
+
+    if (size == 0)
+      return E_FILE_FORMAT_INVALID;  // TODO: verify this
+
+    pos += len;  // consume length of size of element
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size != unknown_size)
+      cluster_size = size;
+  }
+
+// pos points to start of payload
 
 #if 0
     len = static_cast<long>(size_);
@@ -7090,403 +6124,376 @@
         return E_BUFFER_NOT_FULL;
 #endif
 
-    long long timecode = -1;
-    long long new_pos = -1;
-    bool bBlock = false;
+  long long timecode = -1;
+  long long new_pos = -1;
+  bool bBlock = false;
 
-    long long cluster_stop = (cluster_size < 0) ? -1 : pos + cluster_size;
+  long long cluster_stop = (cluster_size < 0) ? -1 : pos + cluster_size;
 
-    for (;;)
-    {
-        if ((cluster_stop >= 0) && (pos >= cluster_stop))
-            break;
+  for (;;) {
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      break;
 
-        //Parse ID
+    // Parse ID
 
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        long long result = GetUIntLength(pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
-
-        if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long id = ReadUInt(pReader, pos, len);
-
-        if (id < 0) //error
-            return static_cast<long>(id);
-
-        if (id == 0)
-            return E_FILE_FORMAT_INVALID;
-
-        //This is the distinguished set of ID's we use to determine
-        //that we have exhausted the sub-element's inside the cluster
-        //whose ID we parsed earlier.
-
-        if (id == 0x0F43B675)  //Cluster ID
-            break;
-
-        if (id == 0x0C53BB6B)  //Cues ID
-            break;
-
-        pos += len;  //consume ID field
-
-        //Parse Size
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        result = GetUIntLength(pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
-
-        if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long size = ReadUInt(pReader, pos, len);
-
-        if (size < 0)  //error
-            return static_cast<long>(size);
-
-        const long long unknown_size = (1LL << (7 * len)) - 1;
-
-        if (size == unknown_size)
-            return E_FILE_FORMAT_INVALID;
-
-        pos += len;  //consume size field
-
-        if ((cluster_stop >= 0) && (pos > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        //pos now points to start of payload
-
-        if (size == 0)  //weird
-            continue;
-
-        if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if (id == 0x67)  //TimeCode ID
-        {
-            len = static_cast<long>(size);
-
-            if ((pos + size) > avail)
-                return E_BUFFER_NOT_FULL;
-
-            timecode = UnserializeUInt(pReader, pos, size);
-
-            if (timecode < 0)  //error (or underflow)
-                return static_cast<long>(timecode);
-
-            new_pos = pos + size;
-
-            if (bBlock)
-                break;
-        }
-        else if (id == 0x20)  //BlockGroup ID
-        {
-            bBlock = true;
-            break;
-        }
-        else if (id == 0x23)  //SimpleBlock ID
-        {
-            bBlock = true;
-            break;
-        }
-
-        pos += size;  //consume payload
-        assert((cluster_stop < 0) || (pos <= cluster_stop));
-    }
-
-    assert((cluster_stop < 0) || (pos <= cluster_stop));
-
-    if (timecode < 0)  //no timecode found
-        return E_FILE_FORMAT_INVALID;
-
-    if (!bBlock)
-        return E_FILE_FORMAT_INVALID;
-
-    m_pos = new_pos;  //designates position just beyond timecode payload
-    m_timecode = timecode;  // m_timecode >= 0 means we're partially loaded
-
-    if (cluster_size >= 0)
-        m_element_size = cluster_stop - m_element_start;
-
-    return 0;
-}
-
-
-long Cluster::Parse(long long& pos, long& len) const
-{
-    long status = Load(pos, len);
-
-    if (status < 0)
-        return status;
-
-    assert(m_pos >= m_element_start);
-    assert(m_timecode >= 0);
-    //assert(m_size > 0);
-    //assert(m_element_size > m_size);
-
-    const long long cluster_stop =
-        (m_element_size < 0) ? -1 : m_element_start + m_element_size;
-
-    if ((cluster_stop >= 0) && (m_pos >= cluster_stop))
-        return 1;  //nothing else to do
-
-    IMkvReader* const pReader = m_pSegment->m_pReader;
-
-    long long total, avail;
-
-    status = pReader->Length(&total, &avail);
-
-    if (status < 0)  //error
-        return status;
-
-    assert((total < 0) || (avail <= total));
-
-    pos = m_pos;
-
-    for (;;)
-    {
-        if ((cluster_stop >= 0) && (pos >= cluster_stop))
-            break;
-
-        if ((total >= 0) && (pos >= total))
-        {
-            if (m_element_size < 0)
-                m_element_size = pos - m_element_start;
-
-            break;
-        }
-
-        //Parse ID
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        long long result = GetUIntLength(pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
-
-        if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long id = ReadUInt(pReader, pos, len);
-
-        if (id < 0) //error
-            return static_cast<long>(id);
-
-        if (id == 0)  //weird
-            return E_FILE_FORMAT_INVALID;
-
-        //This is the distinguished set of ID's we use to determine
-        //that we have exhausted the sub-element's inside the cluster
-        //whose ID we parsed earlier.
-
-        if ((id == 0x0F43B675) || (id == 0x0C53BB6B)) //Cluster or Cues ID
-        {
-            if (m_element_size < 0)
-                m_element_size = pos - m_element_start;
-
-            break;
-        }
-
-        pos += len;  //consume ID field
-
-        //Parse Size
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        result = GetUIntLength(pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
-
-        if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long size = ReadUInt(pReader, pos, len);
-
-        if (size < 0)  //error
-            return static_cast<long>(size);
-
-        const long long unknown_size = (1LL << (7 * len)) - 1;
-
-        if (size == unknown_size)
-            return E_FILE_FORMAT_INVALID;
-
-        pos += len;  //consume size field
-
-        if ((cluster_stop >= 0) && (pos > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        //pos now points to start of payload
-
-        if (size == 0)  //weird
-            continue;
-
-        //const long long block_start = pos;
-        const long long block_stop = pos + size;
-
-        if (cluster_stop >= 0)
-        {
-            if (block_stop > cluster_stop)
-            {
-                if ((id == 0x20) || (id == 0x23))
-                    return E_FILE_FORMAT_INVALID;
-
-                pos = cluster_stop;
-                break;
-            }
-        }
-        else if ((total >= 0) && (block_stop > total))
-        {
-            m_element_size = total - m_element_start;
-            pos = total;
-            break;
-        }
-        else if (block_stop > avail)
-        {
-            len = static_cast<long>(size);
-            return E_BUFFER_NOT_FULL;
-        }
-
-        Cluster* const this_ = const_cast<Cluster*>(this);
-
-        if (id == 0x20)  //BlockGroup
-            return this_->ParseBlockGroup(size, pos, len);
-
-        if (id == 0x23)  //SimpleBlock
-            return this_->ParseSimpleBlock(size, pos, len);
-
-        pos += size;  //consume payload
-        assert((cluster_stop < 0) || (pos <= cluster_stop));
-    }
-
-    assert(m_element_size > 0);
-
-    m_pos = pos;
-    assert((cluster_stop < 0) || (m_pos <= cluster_stop));
-
-    if (m_entries_count > 0)
-    {
-        const long idx = m_entries_count - 1;
-
-        const BlockEntry* const pLast = m_entries[idx];
-        assert(pLast);
-
-        const Block* const pBlock = pLast->GetBlock();
-        assert(pBlock);
-
-        const long long start = pBlock->m_start;
-
-        if ((total >= 0) && (start > total))
-            return -1;  //defend against trucated stream
-
-        const long long size = pBlock->m_size;
-
-        const long long stop = start + size;
-        assert((cluster_stop < 0) || (stop <= cluster_stop));
-
-        if ((total >= 0) && (stop > total))
-            return -1;  //defend against trucated stream
-    }
-
-    return 1;  //no more entries
-}
-
-
-long Cluster::ParseSimpleBlock(
-    long long block_size,
-    long long& pos,
-    long& len)
-{
-    const long long block_start = pos;
-    const long long block_stop = pos + block_size;
-
-    IMkvReader* const pReader = m_pSegment->m_pReader;
-
-    long long total, avail;
-
-    long status = pReader->Length(&total, &avail);
-
-    if (status < 0)  //error
-        return status;
-
-    assert((total < 0) || (avail <= total));
-
-    //parse track number
-
-    if ((pos + 1) > avail)
-    {
-        len = 1;
-        return E_BUFFER_NOT_FULL;
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
     }
 
     long long result = GetUIntLength(pReader, pos, len);
 
-    if (result < 0)  //error
-        return static_cast<long>(result);
+    if (result < 0)  // error
+      return static_cast<long>(result);
 
-    if (result > 0)  //weird
-        return E_BUFFER_NOT_FULL;
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
 
-    if ((pos + len) > block_stop)
-        return E_FILE_FORMAT_INVALID;
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
 
     if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadUInt(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id == 0)
+      return E_FILE_FORMAT_INVALID;
+
+    // This is the distinguished set of ID's we use to determine
+    // that we have exhausted the sub-element's inside the cluster
+    // whose ID we parsed earlier.
+
+    if (id == 0x0F43B675)  // Cluster ID
+      break;
+
+    if (id == 0x0C53BB6B)  // Cues ID
+      break;
+
+    pos += len;  // consume ID field
+
+    // Parse Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size == unknown_size)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume size field
+
+    if ((cluster_stop >= 0) && (pos > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // pos now points to start of payload
+
+    if (size == 0)  // weird
+      continue;
+
+    if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if (id == 0x67) {  // TimeCode ID
+      len = static_cast<long>(size);
+
+      if ((pos + size) > avail)
         return E_BUFFER_NOT_FULL;
 
-    const long long track = ReadUInt(pReader, pos, len);
+      timecode = UnserializeUInt(pReader, pos, size);
 
-    if (track < 0) //error
-        return static_cast<long>(track);
+      if (timecode < 0)  // error (or underflow)
+        return static_cast<long>(timecode);
 
-    if (track == 0)
-        return E_FILE_FORMAT_INVALID;
+      new_pos = pos + size;
+
+      if (bBlock)
+        break;
+    } else if (id == 0x20) {  // BlockGroup ID
+      bBlock = true;
+      break;
+    } else if (id == 0x23) {  // SimpleBlock ID
+      bBlock = true;
+      break;
+    }
+
+    pos += size;  // consume payload
+    assert((cluster_stop < 0) || (pos <= cluster_stop));
+  }
+
+  assert((cluster_stop < 0) || (pos <= cluster_stop));
+
+  if (timecode < 0)  // no timecode found
+    return E_FILE_FORMAT_INVALID;
+
+  if (!bBlock)
+    return E_FILE_FORMAT_INVALID;
+
+  m_pos = new_pos;  // designates position just beyond timecode payload
+  m_timecode = timecode;  // m_timecode >= 0 means we're partially loaded
+
+  if (cluster_size >= 0)
+    m_element_size = cluster_stop - m_element_start;
+
+  return 0;
+}
+
+long Cluster::Parse(long long& pos, long& len) const {
+  long status = Load(pos, len);
+
+  if (status < 0)
+    return status;
+
+  assert(m_pos >= m_element_start);
+  assert(m_timecode >= 0);
+  // assert(m_size > 0);
+  // assert(m_element_size > m_size);
+
+  const long long cluster_stop =
+      (m_element_size < 0) ? -1 : m_element_start + m_element_size;
+
+  if ((cluster_stop >= 0) && (m_pos >= cluster_stop))
+    return 1;  // nothing else to do
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long total, avail;
+
+  status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  pos = m_pos;
+
+  for (;;) {
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      break;
+
+    if ((total >= 0) && (pos >= total)) {
+      if (m_element_size < 0)
+        m_element_size = pos - m_element_start;
+
+      break;
+    }
+
+    // Parse ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadUInt(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id == 0)  // weird
+      return E_FILE_FORMAT_INVALID;
+
+    // This is the distinguished set of ID's we use to determine
+    // that we have exhausted the sub-element's inside the cluster
+    // whose ID we parsed earlier.
+
+    if ((id == 0x0F43B675) || (id == 0x0C53BB6B)) {  // Cluster or Cues ID
+      if (m_element_size < 0)
+        m_element_size = pos - m_element_start;
+
+      break;
+    }
+
+    pos += len;  // consume ID field
+
+    // Parse Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size == unknown_size)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume size field
+
+    if ((cluster_stop >= 0) && (pos > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // pos now points to start of payload
+
+    if (size == 0)  // weird
+      continue;
+
+    // const long long block_start = pos;
+    const long long block_stop = pos + size;
+
+    if (cluster_stop >= 0) {
+      if (block_stop > cluster_stop) {
+        if ((id == 0x20) || (id == 0x23))
+          return E_FILE_FORMAT_INVALID;
+
+        pos = cluster_stop;
+        break;
+      }
+    } else if ((total >= 0) && (block_stop > total)) {
+      m_element_size = total - m_element_start;
+      pos = total;
+      break;
+    } else if (block_stop > avail) {
+      len = static_cast<long>(size);
+      return E_BUFFER_NOT_FULL;
+    }
+
+    Cluster* const this_ = const_cast<Cluster*>(this);
+
+    if (id == 0x20)  // BlockGroup
+      return this_->ParseBlockGroup(size, pos, len);
+
+    if (id == 0x23)  // SimpleBlock
+      return this_->ParseSimpleBlock(size, pos, len);
+
+    pos += size;  // consume payload
+    assert((cluster_stop < 0) || (pos <= cluster_stop));
+  }
+
+  assert(m_element_size > 0);
+
+  m_pos = pos;
+  assert((cluster_stop < 0) || (m_pos <= cluster_stop));
+
+  if (m_entries_count > 0) {
+    const long idx = m_entries_count - 1;
+
+    const BlockEntry* const pLast = m_entries[idx];
+    assert(pLast);
+
+    const Block* const pBlock = pLast->GetBlock();
+    assert(pBlock);
+
+    const long long start = pBlock->m_start;
+
+    if ((total >= 0) && (start > total))
+      return -1;  // defend against trucated stream
+
+    const long long size = pBlock->m_size;
+
+    const long long stop = start + size;
+    assert((cluster_stop < 0) || (stop <= cluster_stop));
+
+    if ((total >= 0) && (stop > total))
+      return -1;  // defend against trucated stream
+  }
+
+  return 1;  // no more entries
+}
+
+long Cluster::ParseSimpleBlock(long long block_size, long long& pos,
+                               long& len) {
+  const long long block_start = pos;
+  const long long block_stop = pos + block_size;
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long total, avail;
+
+  long status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  // parse track number
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // weird
+    return E_BUFFER_NOT_FULL;
+
+  if ((pos + len) > block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long track = ReadUInt(pReader, pos, len);
+
+  if (track < 0)  // error
+    return static_cast<long>(track);
+
+  if (track == 0)
+    return E_FILE_FORMAT_INVALID;
 
 #if 0
     //TODO(matthewjheaney)
@@ -7518,228 +6525,213 @@
         return E_FILE_FORMAT_INVALID;
 #endif
 
-    pos += len;  //consume track number
+  pos += len;  // consume track number
 
-    if ((pos + 2) > block_stop)
-        return E_FILE_FORMAT_INVALID;
+  if ((pos + 2) > block_stop)
+    return E_FILE_FORMAT_INVALID;
 
-    if ((pos + 2) > avail)
-    {
-        len = 2;
-        return E_BUFFER_NOT_FULL;
-    }
+  if ((pos + 2) > avail) {
+    len = 2;
+    return E_BUFFER_NOT_FULL;
+  }
 
-    pos += 2;  //consume timecode
+  pos += 2;  // consume timecode
 
-    if ((pos + 1) > block_stop)
-        return E_FILE_FORMAT_INVALID;
+  if ((pos + 1) > block_stop)
+    return E_FILE_FORMAT_INVALID;
 
-    if ((pos + 1) > avail)
-    {
-        len = 1;
-        return E_BUFFER_NOT_FULL;
-    }
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
 
-    unsigned char flags;
+  unsigned char flags;
 
-    status = pReader->Read(pos, 1, &flags);
+  status = pReader->Read(pos, 1, &flags);
 
-    if (status < 0)  //error or underflow
-    {
-        len = 1;
-        return status;
-    }
+  if (status < 0) {  // error or underflow
+    len = 1;
+    return status;
+  }
 
-    ++pos;  //consume flags byte
-    assert(pos <= avail);
+  ++pos;  // consume flags byte
+  assert(pos <= avail);
 
-    if (pos >= block_stop)
-        return E_FILE_FORMAT_INVALID;
+  if (pos >= block_stop)
+    return E_FILE_FORMAT_INVALID;
 
-    const int lacing = int(flags & 0x06) >> 1;
+  const int lacing = int(flags & 0x06) >> 1;
 
-    if ((lacing != 0) && (block_stop > avail))
-    {
-        len = static_cast<long>(block_stop - pos);
-        return E_BUFFER_NOT_FULL;
-    }
+  if ((lacing != 0) && (block_stop > avail)) {
+    len = static_cast<long>(block_stop - pos);
+    return E_BUFFER_NOT_FULL;
+  }
 
-    status = CreateBlock(0x23,  //simple block id
-                         block_start, block_size,
-                         0);  //DiscardPadding
+  status = CreateBlock(0x23,  // simple block id
+                       block_start, block_size,
+                       0);  // DiscardPadding
 
-    if (status != 0)
-        return status;
+  if (status != 0)
+    return status;
 
-    m_pos = block_stop;
+  m_pos = block_stop;
 
-    return 0;  //success
+  return 0;  // success
 }
 
+long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
+                              long& len) {
+  const long long payload_start = pos;
+  const long long payload_stop = pos + payload_size;
 
-long Cluster::ParseBlockGroup(
-    long long payload_size,
-    long long& pos,
-    long& len)
-{
-    const long long payload_start = pos;
-    const long long payload_stop = pos + payload_size;
+  IMkvReader* const pReader = m_pSegment->m_pReader;
 
-    IMkvReader* const pReader = m_pSegment->m_pReader;
+  long long total, avail;
 
-    long long total, avail;
+  long status = pReader->Length(&total, &avail);
 
-    long status = pReader->Length(&total, &avail);
+  if (status < 0)  // error
+    return status;
 
-    if (status < 0)  //error
-        return status;
+  assert((total < 0) || (avail <= total));
 
-    assert((total < 0) || (avail <= total));
+  if ((total >= 0) && (payload_stop > total))
+    return E_FILE_FORMAT_INVALID;
 
-    if ((total >= 0) && (payload_stop > total))
-        return E_FILE_FORMAT_INVALID;
+  if (payload_stop > avail) {
+    len = static_cast<long>(payload_size);
+    return E_BUFFER_NOT_FULL;
+  }
 
-    if (payload_stop > avail)
-    {
-         len = static_cast<long>(payload_size);
-         return E_BUFFER_NOT_FULL;
+  long long discard_padding = 0;
+
+  while (pos < payload_stop) {
+    // parse sub-block element ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
     }
 
-    long long discard_padding = 0;
+    long long result = GetUIntLength(pReader, pos, len);
 
-    while (pos < payload_stop)
-    {
-        //parse sub-block element ID
+    if (result < 0)  // error
+      return static_cast<long>(result);
 
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
 
-        long long result = GetUIntLength(pReader, pos, len);
+    if ((pos + len) > payload_stop)
+      return E_FILE_FORMAT_INVALID;
 
-        if (result < 0)  //error
-            return static_cast<long>(result);
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
 
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
+    const long long id = ReadUInt(pReader, pos, len);
 
-        if ((pos + len) > payload_stop)
-            return E_FILE_FORMAT_INVALID;
+    if (id < 0)  // error
+      return static_cast<long>(id);
 
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
+    if (id == 0)  // not a value ID
+      return E_FILE_FORMAT_INVALID;
 
-        const long long id = ReadUInt(pReader, pos, len);
+    pos += len;  // consume ID field
 
-        if (id < 0) //error
-            return static_cast<long>(id);
+    // Parse Size
 
-        if (id == 0)  //not a value ID
-            return E_FILE_FORMAT_INVALID;
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
 
-        pos += len;  //consume ID field
+    result = GetUIntLength(pReader, pos, len);
 
-        //Parse Size
+    if (result < 0)  // error
+      return static_cast<long>(result);
 
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
 
-        result = GetUIntLength(pReader, pos, len);
+    if ((pos + len) > payload_stop)
+      return E_FILE_FORMAT_INVALID;
 
-        if (result < 0)  //error
-            return static_cast<long>(result);
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
 
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
+    const long long size = ReadUInt(pReader, pos, len);
 
-        if ((pos + len) > payload_stop)
-            return E_FILE_FORMAT_INVALID;
+    if (size < 0)  // error
+      return static_cast<long>(size);
 
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
+    pos += len;  // consume size field
 
-        const long long size = ReadUInt(pReader, pos, len);
+    // pos now points to start of sub-block group payload
 
-        if (size < 0)  //error
-            return static_cast<long>(size);
+    if (pos > payload_stop)
+      return E_FILE_FORMAT_INVALID;
 
-        pos += len;  //consume size field
+    if (size == 0)  // weird
+      continue;
 
-        //pos now points to start of sub-block group payload
+    const long long unknown_size = (1LL << (7 * len)) - 1;
 
-        if (pos > payload_stop)
-            return E_FILE_FORMAT_INVALID;
+    if (size == unknown_size)
+      return E_FILE_FORMAT_INVALID;
 
-        if (size == 0)  //weird
-            continue;
+    if (id == 0x35A2) {  // DiscardPadding
+      result = GetUIntLength(pReader, pos, len);
 
-        const long long unknown_size = (1LL << (7 * len)) - 1;
+      if (result < 0)  // error
+        return static_cast<long>(result);
 
-        if (size == unknown_size)
-            return E_FILE_FORMAT_INVALID;
+      status = UnserializeInt(pReader, pos, len, discard_padding);
 
-        if (id == 0x35A2)  //DiscardPadding
-        {
-            result = GetUIntLength(pReader, pos, len);
+      if (status < 0)  // error
+        return status;
+    }
 
-            if (result < 0)  //error
-                return static_cast<long>(result);
+    if (id != 0x21) {  // sub-part of BlockGroup is not a Block
+      pos += size;  // consume sub-part of block group
 
-            status = UnserializeInt(pReader, pos, len, discard_padding);
+      if (pos > payload_stop)
+        return E_FILE_FORMAT_INVALID;
 
-            if (status < 0)  //error
-                return status;
-        }
+      continue;
+    }
 
-        if (id != 0x21)  //sub-part of BlockGroup is not a Block
-        {
-            pos += size;  //consume sub-part of block group
+    const long long block_stop = pos + size;
 
-            if (pos > payload_stop)
-                return E_FILE_FORMAT_INVALID;
+    if (block_stop > payload_stop)
+      return E_FILE_FORMAT_INVALID;
 
-            continue;
-        }
+    // parse track number
 
-        const long long block_stop = pos + size;
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
 
-        if (block_stop > payload_stop)
-            return E_FILE_FORMAT_INVALID;
+    result = GetUIntLength(pReader, pos, len);
 
-        //parse track number
+    if (result < 0)  // error
+      return static_cast<long>(result);
 
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
 
-        result = GetUIntLength(pReader, pos, len);
+    if ((pos + len) > block_stop)
+      return E_FILE_FORMAT_INVALID;
 
-        if (result < 0)  //error
-            return static_cast<long>(result);
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
 
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
+    const long long track = ReadUInt(pReader, pos, len);
 
-        if ((pos + len) > block_stop)
-            return E_FILE_FORMAT_INVALID;
+    if (track < 0)  // error
+      return static_cast<long>(track);
 
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long track = ReadUInt(pReader, pos, len);
-
-        if (track < 0) //error
-            return static_cast<long>(track);
-
-        if (track == 0)
-            return E_FILE_FORMAT_INVALID;
+    if (track == 0)
+      return E_FILE_FORMAT_INVALID;
 
 #if 0
         //TODO(matthewjheaney)
@@ -7771,213 +6763,173 @@
             return E_FILE_FORMAT_INVALID;
 #endif
 
-        pos += len;  //consume track number
+    pos += len;  // consume track number
 
-        if ((pos + 2) > block_stop)
-            return E_FILE_FORMAT_INVALID;
+    if ((pos + 2) > block_stop)
+      return E_FILE_FORMAT_INVALID;
 
-        if ((pos + 2) > avail)
-        {
-            len = 2;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        pos += 2;  //consume timecode
-
-        if ((pos + 1) > block_stop)
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        unsigned char flags;
-
-        status = pReader->Read(pos, 1, &flags);
-
-        if (status < 0)  //error or underflow
-        {
-            len = 1;
-            return status;
-        }
-
-        ++pos;  //consume flags byte
-        assert(pos <= avail);
-
-        if (pos >= block_stop)
-            return E_FILE_FORMAT_INVALID;
-
-        const int lacing = int(flags & 0x06) >> 1;
-
-        if ((lacing != 0) && (block_stop > avail))
-        {
-            len = static_cast<long>(block_stop - pos);
-            return E_BUFFER_NOT_FULL;
-        }
-
-        pos = block_stop;  //consume block-part of block group
-        assert(pos <= payload_stop);
+    if ((pos + 2) > avail) {
+      len = 2;
+      return E_BUFFER_NOT_FULL;
     }
 
-    assert(pos == payload_stop);
+    pos += 2;  // consume timecode
 
-    status = CreateBlock(0x20,  //BlockGroup ID
-                         payload_start, payload_size,
-                         discard_padding);
-    if (status != 0)
-        return status;
+    if ((pos + 1) > block_stop)
+      return E_FILE_FORMAT_INVALID;
 
-    m_pos = payload_stop;
-
-    return 0;  //success
-}
-
-
-long Cluster::GetEntry(long index, const mkvparser::BlockEntry*& pEntry) const
-{
-    assert(m_pos >= m_element_start);
-
-    pEntry = NULL;
-
-    if (index < 0)
-        return -1;  //generic error
-
-    if (m_entries_count < 0)
-        return E_BUFFER_NOT_FULL;
-
-    assert(m_entries);
-    assert(m_entries_size > 0);
-    assert(m_entries_count <= m_entries_size);
-
-    if (index < m_entries_count)
-    {
-        pEntry = m_entries[index];
-        assert(pEntry);
-
-        return 1;  //found entry
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
     }
 
-    if (m_element_size < 0)        //we don't know cluster end yet
-        return E_BUFFER_NOT_FULL;  //underflow
+    unsigned char flags;
 
-    const long long element_stop = m_element_start + m_element_size;
+    status = pReader->Read(pos, 1, &flags);
 
-    if (m_pos >= element_stop)
-        return 0;  //nothing left to parse
-
-    return E_BUFFER_NOT_FULL;  //underflow, since more remains to be parsed
-}
-
-
-Cluster* Cluster::Create(
-    Segment* pSegment,
-    long idx,
-    long long off)
-    //long long element_size)
-{
-    assert(pSegment);
-    assert(off >= 0);
-
-    const long long element_start = pSegment->m_start + off;
-
-    Cluster* const pCluster = new Cluster(pSegment,
-                                          idx,
-                                          element_start);
-                                          //element_size);
-    assert(pCluster);
-
-    return pCluster;
-}
-
-
-Cluster::Cluster() :
-    m_pSegment(NULL),
-    m_element_start(0),
-    m_index(0),
-    m_pos(0),
-    m_element_size(0),
-    m_timecode(0),
-    m_entries(NULL),
-    m_entries_size(0),
-    m_entries_count(0)  //means "no entries"
-{
-}
-
-
-Cluster::Cluster(
-    Segment* pSegment,
-    long idx,
-    long long element_start
-    /* long long element_size */ ) :
-    m_pSegment(pSegment),
-    m_element_start(element_start),
-    m_index(idx),
-    m_pos(element_start),
-    m_element_size(-1 /* element_size */ ),
-    m_timecode(-1),
-    m_entries(NULL),
-    m_entries_size(0),
-    m_entries_count(-1)  //means "has not been parsed yet"
-{
-}
-
-
-Cluster::~Cluster()
-{
-    if (m_entries_count <= 0)
-        return;
-
-    BlockEntry** i = m_entries;
-    BlockEntry** const j = m_entries + m_entries_count;
-
-    while (i != j)
-    {
-         BlockEntry* p = *i++;
-         assert(p);
-
-         delete p;
+    if (status < 0) {  // error or underflow
+      len = 1;
+      return status;
     }
 
-    delete[] m_entries;
+    ++pos;  // consume flags byte
+    assert(pos <= avail);
+
+    if (pos >= block_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    const int lacing = int(flags & 0x06) >> 1;
+
+    if ((lacing != 0) && (block_stop > avail)) {
+      len = static_cast<long>(block_stop - pos);
+      return E_BUFFER_NOT_FULL;
+    }
+
+    pos = block_stop;  // consume block-part of block group
+    assert(pos <= payload_stop);
+  }
+
+  assert(pos == payload_stop);
+
+  status = CreateBlock(0x20,  // BlockGroup ID
+                       payload_start, payload_size, discard_padding);
+  if (status != 0)
+    return status;
+
+  m_pos = payload_stop;
+
+  return 0;  // success
 }
 
+long Cluster::GetEntry(long index, const mkvparser::BlockEntry*& pEntry) const {
+  assert(m_pos >= m_element_start);
 
-bool Cluster::EOS() const
+  pEntry = NULL;
+
+  if (index < 0)
+    return -1;  // generic error
+
+  if (m_entries_count < 0)
+    return E_BUFFER_NOT_FULL;
+
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count <= m_entries_size);
+
+  if (index < m_entries_count) {
+    pEntry = m_entries[index];
+    assert(pEntry);
+
+    return 1;  // found entry
+  }
+
+  if (m_element_size < 0)  // we don't know cluster end yet
+    return E_BUFFER_NOT_FULL;  // underflow
+
+  const long long element_stop = m_element_start + m_element_size;
+
+  if (m_pos >= element_stop)
+    return 0;  // nothing left to parse
+
+  return E_BUFFER_NOT_FULL;  // underflow, since more remains to be parsed
+}
+
+Cluster* Cluster::Create(Segment* pSegment, long idx, long long off)
+// long long element_size)
 {
-    return (m_pSegment == NULL);
+  assert(pSegment);
+  assert(off >= 0);
+
+  const long long element_start = pSegment->m_start + off;
+
+  Cluster* const pCluster = new Cluster(pSegment, idx, element_start);
+  // element_size);
+  assert(pCluster);
+
+  return pCluster;
 }
 
+Cluster::Cluster()
+    : m_pSegment(NULL),
+      m_element_start(0),
+      m_index(0),
+      m_pos(0),
+      m_element_size(0),
+      m_timecode(0),
+      m_entries(NULL),
+      m_entries_size(0),
+      m_entries_count(0)  // means "no entries"
+{}
 
-long Cluster::GetIndex() const
-{
-    return m_index;
+Cluster::Cluster(Segment* pSegment, long idx, long long element_start
+                 /* long long element_size */)
+    : m_pSegment(pSegment),
+      m_element_start(element_start),
+      m_index(idx),
+      m_pos(element_start),
+      m_element_size(-1 /* element_size */),
+      m_timecode(-1),
+      m_entries(NULL),
+      m_entries_size(0),
+      m_entries_count(-1)  // means "has not been parsed yet"
+{}
+
+Cluster::~Cluster() {
+  if (m_entries_count <= 0)
+    return;
+
+  BlockEntry** i = m_entries;
+  BlockEntry** const j = m_entries + m_entries_count;
+
+  while (i != j) {
+    BlockEntry* p = *i++;
+    assert(p);
+
+    delete p;
+  }
+
+  delete[] m_entries;
 }
 
+bool Cluster::EOS() const { return (m_pSegment == NULL); }
 
-long long Cluster::GetPosition() const
-{
-    const long long pos = m_element_start - m_pSegment->m_start;
-    assert(pos >= 0);
+long Cluster::GetIndex() const { return m_index; }
 
-    return pos;
+long long Cluster::GetPosition() const {
+  const long long pos = m_element_start - m_pSegment->m_start;
+  assert(pos >= 0);
+
+  return pos;
 }
 
-
-long long Cluster::GetElementSize() const
-{
-    return m_element_size;
-}
-
+long long Cluster::GetElementSize() const { return m_element_size; }
 
 #if 0
 bool Cluster::HasBlockEntries(
     const Segment* pSegment,
-    long long off)  //relative to start of segment payload
-{
+    long long off) {
     assert(pSegment);
-    assert(off >= 0);  //relative to segment
+    assert(off >= 0);  //relative to start of segment payload
 
     IMkvReader* const pReader = pSegment->m_pReader;
 
@@ -8034,631 +6986,558 @@
 }
 #endif
 
-
 long Cluster::HasBlockEntries(
     const Segment* pSegment,
-    long long off,  //relative to start of segment payload
-    long long& pos,
-    long& len)
-{
-    assert(pSegment);
-    assert(off >= 0);  //relative to segment
+    long long off,  // relative to start of segment payload
+    long long& pos, long& len) {
+  assert(pSegment);
+  assert(off >= 0);  // relative to segment
 
-    IMkvReader* const pReader = pSegment->m_pReader;
+  IMkvReader* const pReader = pSegment->m_pReader;
 
-    long long total, avail;
+  long long total, avail;
 
-    long status = pReader->Length(&total, &avail);
+  long status = pReader->Length(&total, &avail);
 
-    if (status < 0)  //error
-        return status;
+  if (status < 0)  // error
+    return status;
 
-    assert((total < 0) || (avail <= total));
+  assert((total < 0) || (avail <= total));
 
-    pos = pSegment->m_start + off;  //absolute
+  pos = pSegment->m_start + off;  // absolute
 
-    if ((total >= 0) && (pos >= total))
-        return 0;  //we don't even have a complete cluster
+  if ((total >= 0) && (pos >= total))
+    return 0;  // we don't even have a complete cluster
 
-    const long long segment_stop =
-        (pSegment->m_size < 0) ? -1 : pSegment->m_start + pSegment->m_size;
+  const long long segment_stop =
+      (pSegment->m_size < 0) ? -1 : pSegment->m_start + pSegment->m_size;
 
-    long long cluster_stop = -1;  //interpreted later to mean "unknown size"
+  long long cluster_stop = -1;  // interpreted later to mean "unknown size"
 
-    {
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        long long result = GetUIntLength(pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //need more data
-            return E_BUFFER_NOT_FULL;
-
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((total >= 0) && ((pos + len) > total))
-            return 0;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long id = ReadUInt(pReader, pos, len);
-
-        if (id < 0)  //error
-            return static_cast<long>(id);
-
-        if (id != 0x0F43B675)  //weird: not cluster ID
-            return -1;         //generic error
-
-        pos += len;  //consume Cluster ID field
-
-        //read size field
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        result = GetUIntLength(pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //weird
-            return E_BUFFER_NOT_FULL;
-
-        if ((segment_stop >= 0) && ((pos + len) > segment_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((total >= 0) && ((pos + len) > total))
-            return 0;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long size = ReadUInt(pReader, pos, len);
-
-        if (size < 0)  //error
-            return static_cast<long>(size);
-
-        if (size == 0)
-            return 0;  //cluster does not have entries
-
-        pos += len;  //consume size field
-
-        //pos now points to start of payload
-
-        const long long unknown_size = (1LL << (7 * len)) - 1;
-
-        if (size != unknown_size)
-        {
-            cluster_stop = pos + size;
-            assert(cluster_stop >= 0);
-
-            if ((segment_stop >= 0) && (cluster_stop > segment_stop))
-                return E_FILE_FORMAT_INVALID;
-
-            if ((total >= 0) && (cluster_stop > total))
-                //return E_FILE_FORMAT_INVALID;  //too conservative
-                return 0;  //cluster does not have any entries
-        }
+  {
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
     }
 
-    for (;;)
-    {
-        if ((cluster_stop >= 0) && (pos >= cluster_stop))
-            return 0;  //no entries detected
+    long long result = GetUIntLength(pReader, pos, len);
 
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
+    if (result < 0)  // error
+      return static_cast<long>(result);
 
-        long long result = GetUIntLength(pReader, pos, len);
+    if (result > 0)  // need more data
+      return E_BUFFER_NOT_FULL;
 
-        if (result < 0)  //error
-            return static_cast<long>(result);
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
 
-        if (result > 0)  //need more data
-            return E_BUFFER_NOT_FULL;
+    if ((total >= 0) && ((pos + len) > total))
+      return 0;
 
-        if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
 
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
+    const long long id = ReadUInt(pReader, pos, len);
 
-        const long long id = ReadUInt(pReader, pos, len);
+    if (id < 0)  // error
+      return static_cast<long>(id);
 
-        if (id < 0)  //error
-            return static_cast<long>(id);
+    if (id != 0x0F43B675)  // weird: not cluster ID
+      return -1;  // generic error
 
-        //This is the distinguished set of ID's we use to determine
-        //that we have exhausted the sub-element's inside the cluster
-        //whose ID we parsed earlier.
+    pos += len;  // consume Cluster ID field
 
-        if (id == 0x0F43B675)  //Cluster ID
-            return 0;  //no entries found
+    // read size field
 
-        if (id == 0x0C53BB6B)  //Cues ID
-            return 0;  //no entries found
-
-        pos += len;  //consume id field
-
-        if ((cluster_stop >= 0) && (pos >= cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        //read size field
-
-        if ((pos + 1) > avail)
-        {
-            len = 1;
-            return E_BUFFER_NOT_FULL;
-        }
-
-        result = GetUIntLength(pReader, pos, len);
-
-        if (result < 0)  //error
-            return static_cast<long>(result);
-
-        if (result > 0)  //underflow
-            return E_BUFFER_NOT_FULL;
-
-        if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if ((pos + len) > avail)
-            return E_BUFFER_NOT_FULL;
-
-        const long long size = ReadUInt(pReader, pos, len);
-
-        if (size < 0)  //error
-            return static_cast<long>(size);
-
-        pos += len;  //consume size field
-
-        //pos now points to start of payload
-
-        if ((cluster_stop >= 0) && (pos > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if (size == 0)  //weird
-            continue;
-
-        const long long unknown_size = (1LL << (7 * len)) - 1;
-
-        if (size == unknown_size)
-            return E_FILE_FORMAT_INVALID;  //not supported inside cluster
-
-        if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
-            return E_FILE_FORMAT_INVALID;
-
-        if (id == 0x20)  //BlockGroup ID
-            return 1;    //have at least one entry
-
-        if (id == 0x23)  //SimpleBlock ID
-            return 1;    //have at least one entry
-
-        pos += size;  //consume payload
-        assert((cluster_stop < 0) || (pos <= cluster_stop));
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
     }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && ((pos + len) > total))
+      return 0;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    if (size == 0)
+      return 0;  // cluster does not have entries
+
+    pos += len;  // consume size field
+
+    // pos now points to start of payload
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size != unknown_size) {
+      cluster_stop = pos + size;
+      assert(cluster_stop >= 0);
+
+      if ((segment_stop >= 0) && (cluster_stop > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      if ((total >= 0) && (cluster_stop > total))
+        // return E_FILE_FORMAT_INVALID;  //too conservative
+        return 0;  // cluster does not have any entries
+    }
+  }
+
+  for (;;) {
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      return 0;  // no entries detected
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // need more data
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadUInt(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    // This is the distinguished set of ID's we use to determine
+    // that we have exhausted the sub-element's inside the cluster
+    // whose ID we parsed earlier.
+
+    if (id == 0x0F43B675)  // Cluster ID
+      return 0;  // no entries found
+
+    if (id == 0x0C53BB6B)  // Cues ID
+      return 0;  // no entries found
+
+    pos += len;  // consume id field
+
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // read size field
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // underflow
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume size field
+
+    // pos now points to start of payload
+
+    if ((cluster_stop >= 0) && (pos > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if (size == 0)  // weird
+      continue;
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size == unknown_size)
+      return E_FILE_FORMAT_INVALID;  // not supported inside cluster
+
+    if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if (id == 0x20)  // BlockGroup ID
+      return 1;  // have at least one entry
+
+    if (id == 0x23)  // SimpleBlock ID
+      return 1;  // have at least one entry
+
+    pos += size;  // consume payload
+    assert((cluster_stop < 0) || (pos <= cluster_stop));
+  }
 }
 
+long long Cluster::GetTimeCode() const {
+  long long pos;
+  long len;
 
-long long Cluster::GetTimeCode() const
-{
+  const long status = Load(pos, len);
+
+  if (status < 0)  // error
+    return status;
+
+  return m_timecode;
+}
+
+long long Cluster::GetTime() const {
+  const long long tc = GetTimeCode();
+
+  if (tc < 0)
+    return tc;
+
+  const SegmentInfo* const pInfo = m_pSegment->GetInfo();
+  assert(pInfo);
+
+  const long long scale = pInfo->GetTimeCodeScale();
+  assert(scale >= 1);
+
+  const long long t = m_timecode * scale;
+
+  return t;
+}
+
+long long Cluster::GetFirstTime() const {
+  const BlockEntry* pEntry;
+
+  const long status = GetFirst(pEntry);
+
+  if (status < 0)  // error
+    return status;
+
+  if (pEntry == NULL)  // empty cluster
+    return GetTime();
+
+  const Block* const pBlock = pEntry->GetBlock();
+  assert(pBlock);
+
+  return pBlock->GetTime(this);
+}
+
+long long Cluster::GetLastTime() const {
+  const BlockEntry* pEntry;
+
+  const long status = GetLast(pEntry);
+
+  if (status < 0)  // error
+    return status;
+
+  if (pEntry == NULL)  // empty cluster
+    return GetTime();
+
+  const Block* const pBlock = pEntry->GetBlock();
+  assert(pBlock);
+
+  return pBlock->GetTime(this);
+}
+
+long Cluster::CreateBlock(long long id,
+                          long long pos,  // absolute pos of payload
+                          long long size, long long discard_padding) {
+  assert((id == 0x20) || (id == 0x23));  // BlockGroup or SimpleBlock
+
+  if (m_entries_count < 0) {  // haven't parsed anything yet
+    assert(m_entries == NULL);
+    assert(m_entries_size == 0);
+
+    m_entries_size = 1024;
+    m_entries = new BlockEntry* [m_entries_size];
+
+    m_entries_count = 0;
+  } else {
+    assert(m_entries);
+    assert(m_entries_size > 0);
+    assert(m_entries_count <= m_entries_size);
+
+    if (m_entries_count >= m_entries_size) {
+      const long entries_size = 2 * m_entries_size;
+
+      BlockEntry** const entries = new BlockEntry* [entries_size];
+      assert(entries);
+
+      BlockEntry** src = m_entries;
+      BlockEntry** const src_end = src + m_entries_count;
+
+      BlockEntry** dst = entries;
+
+      while (src != src_end)
+        *dst++ = *src++;
+
+      delete[] m_entries;
+
+      m_entries = entries;
+      m_entries_size = entries_size;
+    }
+  }
+
+  if (id == 0x20)  // BlockGroup ID
+    return CreateBlockGroup(pos, size, discard_padding);
+  else  // SimpleBlock ID
+    return CreateSimpleBlock(pos, size);
+}
+
+long Cluster::CreateBlockGroup(long long start_offset, long long size,
+                               long long discard_padding) {
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count >= 0);
+  assert(m_entries_count < m_entries_size);
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = start_offset;
+  const long long stop = start_offset + size;
+
+  // For WebM files, there is a bias towards previous reference times
+  //(in order to support alt-ref frames, which refer back to the previous
+  // keyframe).  Normally a 0 value is not possible, but here we tenatively
+  // allow 0 as the value of a reference frame, with the interpretation
+  // that this is a "previous" reference time.
+
+  long long prev = 1;  // nonce
+  long long next = 0;  // nonce
+  long long duration = -1;  // really, this is unsigned
+
+  long long bpos = -1;
+  long long bsize = -1;
+
+  while (pos < stop) {
+    long len;
+    const long long id = ReadUInt(pReader, pos, len);
+    assert(id >= 0);  // TODO
+    assert((pos + len) <= stop);
+
+    pos += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos, len);
+    assert(size >= 0);  // TODO
+    assert((pos + len) <= stop);
+
+    pos += len;  // consume size
+
+    if (id == 0x21) {  // Block ID
+      if (bpos < 0) {  // Block ID
+        bpos = pos;
+        bsize = size;
+      }
+    } else if (id == 0x1B) {  // Duration ID
+      assert(size <= 8);
+
+      duration = UnserializeUInt(pReader, pos, size);
+      assert(duration >= 0);  // TODO
+    } else if (id == 0x7B) {  // ReferenceBlock
+      assert(size <= 8);
+      const long size_ = static_cast<long>(size);
+
+      long long time;
+
+      long status = UnserializeInt(pReader, pos, size_, time);
+      assert(status == 0);
+      if (status != 0)
+        return -1;
+
+      if (time <= 0)  // see note above
+        prev = time;
+      else  // weird
+        next = time;
+    }
+
+    pos += size;  // consume payload
+    assert(pos <= stop);
+  }
+
+  assert(pos == stop);
+  assert(bpos >= 0);
+  assert(bsize >= 0);
+
+  const long idx = m_entries_count;
+
+  BlockEntry** const ppEntry = m_entries + idx;
+  BlockEntry*& pEntry = *ppEntry;
+
+  pEntry = new (std::nothrow)
+      BlockGroup(this, idx, bpos, bsize, prev, next, duration, discard_padding);
+
+  if (pEntry == NULL)
+    return -1;  // generic error
+
+  BlockGroup* const p = static_cast<BlockGroup*>(pEntry);
+
+  const long status = p->Parse();
+
+  if (status == 0) {  // success
+    ++m_entries_count;
+    return 0;
+  }
+
+  delete pEntry;
+  pEntry = 0;
+
+  return status;
+}
+
+long Cluster::CreateSimpleBlock(long long st, long long sz) {
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count >= 0);
+  assert(m_entries_count < m_entries_size);
+
+  const long idx = m_entries_count;
+
+  BlockEntry** const ppEntry = m_entries + idx;
+  BlockEntry*& pEntry = *ppEntry;
+
+  pEntry = new (std::nothrow) SimpleBlock(this, idx, st, sz);
+
+  if (pEntry == NULL)
+    return -1;  // generic error
+
+  SimpleBlock* const p = static_cast<SimpleBlock*>(pEntry);
+
+  const long status = p->Parse();
+
+  if (status == 0) {
+    ++m_entries_count;
+    return 0;
+  }
+
+  delete pEntry;
+  pEntry = 0;
+
+  return status;
+}
+
+long Cluster::GetFirst(const BlockEntry*& pFirst) const {
+  if (m_entries_count <= 0) {
     long long pos;
     long len;
 
-    const long status = Load(pos, len);
+    const long status = Parse(pos, len);
 
-    if (status < 0) //error
-        return status;
-
-    return m_timecode;
-}
-
-
-long long Cluster::GetTime() const
-{
-    const long long tc = GetTimeCode();
-
-    if (tc < 0)
-        return tc;
-
-    const SegmentInfo* const pInfo = m_pSegment->GetInfo();
-    assert(pInfo);
-
-    const long long scale = pInfo->GetTimeCodeScale();
-    assert(scale >= 1);
-
-    const long long t = m_timecode * scale;
-
-    return t;
-}
-
-
-long long Cluster::GetFirstTime() const
-{
-    const BlockEntry* pEntry;
-
-    const long status = GetFirst(pEntry);
-
-    if (status < 0)  //error
-        return status;
-
-    if (pEntry == NULL)  //empty cluster
-        return GetTime();
-
-    const Block* const pBlock = pEntry->GetBlock();
-    assert(pBlock);
-
-    return pBlock->GetTime(this);
-}
-
-
-long long Cluster::GetLastTime() const
-{
-    const BlockEntry* pEntry;
-
-    const long status = GetLast(pEntry);
-
-    if (status < 0)  //error
-        return status;
-
-    if (pEntry == NULL)  //empty cluster
-        return GetTime();
-
-    const Block* const pBlock = pEntry->GetBlock();
-    assert(pBlock);
-
-    return pBlock->GetTime(this);
-}
-
-
-long Cluster::CreateBlock(
-    long long id,
-    long long pos,   //absolute pos of payload
-    long long size,
-    long long discard_padding)
-{
-    assert((id == 0x20) || (id == 0x23));  //BlockGroup or SimpleBlock
-
-    if (m_entries_count < 0)  //haven't parsed anything yet
-    {
-        assert(m_entries == NULL);
-        assert(m_entries_size == 0);
-
-        m_entries_size = 1024;
-        m_entries = new BlockEntry*[m_entries_size];
-
-        m_entries_count = 0;
-    }
-    else
-    {
-        assert(m_entries);
-        assert(m_entries_size > 0);
-        assert(m_entries_count <= m_entries_size);
-
-        if (m_entries_count >= m_entries_size)
-        {
-            const long entries_size = 2 * m_entries_size;
-
-            BlockEntry** const entries = new BlockEntry*[entries_size];
-            assert(entries);
-
-            BlockEntry** src = m_entries;
-            BlockEntry** const src_end = src + m_entries_count;
-
-            BlockEntry** dst = entries;
-
-            while (src != src_end)
-                *dst++ = *src++;
-
-            delete[] m_entries;
-
-            m_entries = entries;
-            m_entries_size = entries_size;
-        }
+    if (status < 0) {  // error
+      pFirst = NULL;
+      return status;
     }
 
-    if (id == 0x20)  //BlockGroup ID
-        return CreateBlockGroup(pos, size, discard_padding);
-    else  //SimpleBlock ID
-        return CreateSimpleBlock(pos, size);
+    if (m_entries_count <= 0) {  // empty cluster
+      pFirst = NULL;
+      return 0;
+    }
+  }
+
+  assert(m_entries);
+
+  pFirst = m_entries[0];
+  assert(pFirst);
+
+  return 0;  // success
 }
 
+long Cluster::GetLast(const BlockEntry*& pLast) const {
+  for (;;) {
+    long long pos;
+    long len;
 
-long Cluster::CreateBlockGroup(
-    long long start_offset,
-    long long size,
-    long long discard_padding)
-{
-    assert(m_entries);
-    assert(m_entries_size > 0);
-    assert(m_entries_count >= 0);
-    assert(m_entries_count < m_entries_size);
+    const long status = Parse(pos, len);
 
-    IMkvReader* const pReader = m_pSegment->m_pReader;
-
-    long long pos = start_offset;
-    const long long stop = start_offset + size;
-
-    //For WebM files, there is a bias towards previous reference times
-    //(in order to support alt-ref frames, which refer back to the previous
-    //keyframe).  Normally a 0 value is not possible, but here we tenatively
-    //allow 0 as the value of a reference frame, with the interpretation
-    //that this is a "previous" reference time.
-
-    long long prev = 1;  //nonce
-    long long next = 0;  //nonce
-    long long duration = -1;  //really, this is unsigned
-
-    long long bpos = -1;
-    long long bsize = -1;
-
-    while (pos < stop)
-    {
-        long len;
-        const long long id = ReadUInt(pReader, pos, len);
-        assert(id >= 0);  //TODO
-        assert((pos + len) <= stop);
-
-        pos += len;  //consume ID
-
-        const long long size = ReadUInt(pReader, pos, len);
-        assert(size >= 0);  //TODO
-        assert((pos + len) <= stop);
-
-        pos += len;  //consume size
-
-        if (id == 0x21) //Block ID
-        {
-            if (bpos < 0) //Block ID
-            {
-                bpos = pos;
-                bsize = size;
-            }
-        }
-        else if (id == 0x1B)  //Duration ID
-        {
-            assert(size <= 8);
-
-            duration = UnserializeUInt(pReader, pos, size);
-            assert(duration >= 0);  //TODO
-        }
-        else if (id == 0x7B)  //ReferenceBlock
-        {
-            assert(size <= 8);
-            const long size_ = static_cast<long>(size);
-
-            long long time;
-
-            long status = UnserializeInt(pReader, pos, size_, time);
-            assert(status == 0);
-            if (status != 0)
-                return -1;
-
-            if (time <= 0)  //see note above
-                prev = time;
-            else  //weird
-                next = time;
-        }
-
-        pos += size;  //consume payload
-        assert(pos <= stop);
+    if (status < 0) {  // error
+      pLast = NULL;
+      return status;
     }
 
-    assert(pos == stop);
-    assert(bpos >= 0);
-    assert(bsize >= 0);
+    if (status > 0)  // no new block
+      break;
+  }
 
-    const long idx = m_entries_count;
-
-    BlockEntry** const ppEntry = m_entries + idx;
-    BlockEntry*& pEntry = *ppEntry;
-
-    pEntry = new (std::nothrow) BlockGroup(
-                                  this,
-                                  idx,
-                                  bpos,
-                                  bsize,
-                                  prev,
-                                  next,
-                                  duration,
-                                  discard_padding);
-
-    if (pEntry == NULL)
-        return -1;  //generic error
-
-    BlockGroup* const p = static_cast<BlockGroup*>(pEntry);
-
-    const long status = p->Parse();
-
-    if (status == 0)  //success
-    {
-        ++m_entries_count;
-        return 0;
-    }
-
-    delete pEntry;
-    pEntry = 0;
-
-    return status;
-}
-
-
-
-long Cluster::CreateSimpleBlock(
-    long long st,
-    long long sz)
-{
-    assert(m_entries);
-    assert(m_entries_size > 0);
-    assert(m_entries_count >= 0);
-    assert(m_entries_count < m_entries_size);
-
-    const long idx = m_entries_count;
-
-    BlockEntry** const ppEntry = m_entries + idx;
-    BlockEntry*& pEntry = *ppEntry;
-
-    pEntry = new (std::nothrow) SimpleBlock(this, idx, st, sz);
-
-    if (pEntry == NULL)
-        return -1;  //generic error
-
-    SimpleBlock* const p = static_cast<SimpleBlock*>(pEntry);
-
-    const long status = p->Parse();
-
-    if (status == 0)
-    {
-        ++m_entries_count;
-        return 0;
-    }
-
-    delete pEntry;
-    pEntry = 0;
-
-    return status;
-}
-
-
-long Cluster::GetFirst(const BlockEntry*& pFirst) const
-{
-    if (m_entries_count <= 0)
-    {
-        long long pos;
-        long len;
-
-        const long status = Parse(pos, len);
-
-        if (status < 0)  //error
-        {
-            pFirst = NULL;
-            return status;
-        }
-
-        if (m_entries_count <= 0)  //empty cluster
-        {
-            pFirst = NULL;
-            return 0;
-        }
-    }
-
-    assert(m_entries);
-
-    pFirst = m_entries[0];
-    assert(pFirst);
-
-    return 0;  //success
-}
-
-long Cluster::GetLast(const BlockEntry*& pLast) const
-{
-    for (;;)
-    {
-        long long pos;
-        long len;
-
-        const long status = Parse(pos, len);
-
-        if (status < 0)  //error
-        {
-            pLast = NULL;
-            return status;
-        }
-
-        if (status > 0)  //no new block
-            break;
-    }
-
-    if (m_entries_count <= 0)
-    {
-        pLast = NULL;
-        return 0;
-    }
-
-    assert(m_entries);
-
-    const long idx = m_entries_count - 1;
-
-    pLast = m_entries[idx];
-    assert(pLast);
-
+  if (m_entries_count <= 0) {
+    pLast = NULL;
     return 0;
+  }
+
+  assert(m_entries);
+
+  const long idx = m_entries_count - 1;
+
+  pLast = m_entries[idx];
+  assert(pLast);
+
+  return 0;
 }
 
+long Cluster::GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const {
+  assert(pCurr);
+  assert(m_entries);
+  assert(m_entries_count > 0);
 
-long Cluster::GetNext(
-    const BlockEntry* pCurr,
-    const BlockEntry*& pNext) const
-{
-    assert(pCurr);
+  size_t idx = pCurr->GetIndex();
+  assert(idx < size_t(m_entries_count));
+  assert(m_entries[idx] == pCurr);
+
+  ++idx;
+
+  if (idx >= size_t(m_entries_count)) {
+    long long pos;
+    long len;
+
+    const long status = Parse(pos, len);
+
+    if (status < 0) {  // error
+      pNext = NULL;
+      return status;
+    }
+
+    if (status > 0) {
+      pNext = NULL;
+      return 0;
+    }
+
     assert(m_entries);
     assert(m_entries_count > 0);
-
-    size_t idx = pCurr->GetIndex();
     assert(idx < size_t(m_entries_count));
-    assert(m_entries[idx] == pCurr);
+  }
 
-    ++idx;
+  pNext = m_entries[idx];
+  assert(pNext);
 
-    if (idx >= size_t(m_entries_count))
-    {
-        long long pos;
-        long len;
-
-        const long status = Parse(pos, len);
-
-        if (status < 0)  //error
-        {
-            pNext = NULL;
-            return status;
-        }
-
-        if (status > 0)
-        {
-            pNext = NULL;
-            return 0;
-        }
-
-        assert(m_entries);
-        assert(m_entries_count > 0);
-        assert(idx < size_t(m_entries_count));
-    }
-
-    pNext = m_entries[idx];
-    assert(pNext);
-
-    return 0;
+  return 0;
 }
 
+long Cluster::GetEntryCount() const { return m_entries_count; }
 
-long Cluster::GetEntryCount() const
-{
-    return m_entries_count;
-}
+const BlockEntry* Cluster::GetEntry(const Track* pTrack,
+                                    long long time_ns) const {
+  assert(pTrack);
 
-
-const BlockEntry* Cluster::GetEntry(
-    const Track* pTrack,
-    long long time_ns) const
-{
-    assert(pTrack);
-
-    if (m_pSegment == NULL)  //this is the special EOS cluster
-        return pTrack->GetEOS();
+  if (m_pSegment == NULL)  // this is the special EOS cluster
+    return pTrack->GetEOS();
 
 #if 0
 
@@ -8711,76 +7590,66 @@
 
 #else
 
-    const BlockEntry* pResult = pTrack->GetEOS();
+  const BlockEntry* pResult = pTrack->GetEOS();
 
-    long index = 0;
+  long index = 0;
 
-    for (;;)
-    {
-        if (index >= m_entries_count)
-        {
-            long long pos;
-            long len;
+  for (;;) {
+    if (index >= m_entries_count) {
+      long long pos;
+      long len;
 
-            const long status = Parse(pos, len);
-            assert(status >= 0);
+      const long status = Parse(pos, len);
+      assert(status >= 0);
 
-            if (status > 0)  //completely parsed, and no more entries
-                return pResult;
+      if (status > 0)  // completely parsed, and no more entries
+        return pResult;
 
-            if (status < 0)  //should never happen
-                return 0;
+      if (status < 0)  // should never happen
+        return 0;
 
-            assert(m_entries);
-            assert(index < m_entries_count);
-        }
-
-        const BlockEntry* const pEntry = m_entries[index];
-        assert(pEntry);
-        assert(!pEntry->EOS());
-
-        const Block* const pBlock = pEntry->GetBlock();
-        assert(pBlock);
-
-        if (pBlock->GetTrackNumber() != pTrack->GetNumber())
-        {
-            ++index;
-            continue;
-        }
-
-        if (pTrack->VetEntry(pEntry))
-        {
-            if (time_ns < 0)  //just want first candidate block
-                return pEntry;
-
-            const long long ns = pBlock->GetTime(this);
-
-            if (ns > time_ns)
-                return pResult;
-
-            pResult = pEntry;  //have a candidate
-        }
-        else if (time_ns >= 0)
-        {
-            const long long ns = pBlock->GetTime(this);
-
-            if (ns > time_ns)
-                return pResult;
-        }
-
-        ++index;
+      assert(m_entries);
+      assert(index < m_entries_count);
     }
 
+    const BlockEntry* const pEntry = m_entries[index];
+    assert(pEntry);
+    assert(!pEntry->EOS());
+
+    const Block* const pBlock = pEntry->GetBlock();
+    assert(pBlock);
+
+    if (pBlock->GetTrackNumber() != pTrack->GetNumber()) {
+      ++index;
+      continue;
+    }
+
+    if (pTrack->VetEntry(pEntry)) {
+      if (time_ns < 0)  // just want first candidate block
+        return pEntry;
+
+      const long long ns = pBlock->GetTime(this);
+
+      if (ns > time_ns)
+        return pResult;
+
+      pResult = pEntry;  // have a candidate
+    } else if (time_ns >= 0) {
+      const long long ns = pBlock->GetTime(this);
+
+      if (ns > time_ns)
+        return pResult;
+    }
+
+    ++index;
+  }
+
 #endif
 }
 
-
-const BlockEntry*
-Cluster::GetEntry(
-    const CuePoint& cp,
-    const CuePoint::TrackPosition& tp) const
-{
-    assert(m_pSegment);
+const BlockEntry* Cluster::GetEntry(const CuePoint& cp,
+                                    const CuePoint::TrackPosition& tp) const {
+  assert(m_pSegment);
 
 #if 0
 
@@ -8871,114 +7740,105 @@
 
 #else
 
-    const long long tc = cp.GetTimeCode();
+  const long long tc = cp.GetTimeCode();
 
-    if (tp.m_block > 0)
-    {
-        const long block = static_cast<long>(tp.m_block);
-        const long index = block - 1;
+  if (tp.m_block > 0) {
+    const long block = static_cast<long>(tp.m_block);
+    const long index = block - 1;
 
-        while (index >= m_entries_count)
-        {
-            long long pos;
-            long len;
+    while (index >= m_entries_count) {
+      long long pos;
+      long len;
 
-            const long status = Parse(pos, len);
+      const long status = Parse(pos, len);
 
-            if (status < 0)  //TODO: can this happen?
-                return NULL;
+      if (status < 0)  // TODO: can this happen?
+        return NULL;
 
-            if (status > 0)  //nothing remains to be parsed
-                return NULL;
-        }
-
-        const BlockEntry* const pEntry = m_entries[index];
-        assert(pEntry);
-        assert(!pEntry->EOS());
-
-        const Block* const pBlock = pEntry->GetBlock();
-        assert(pBlock);
-
-        if ((pBlock->GetTrackNumber() == tp.m_track) &&
-            (pBlock->GetTimeCode(this) == tc))
-        {
-            return pEntry;
-        }
+      if (status > 0)  // nothing remains to be parsed
+        return NULL;
     }
 
-    long index = 0;
+    const BlockEntry* const pEntry = m_entries[index];
+    assert(pEntry);
+    assert(!pEntry->EOS());
 
-    for (;;)
-    {
-        if (index >= m_entries_count)
-        {
-            long long pos;
-            long len;
+    const Block* const pBlock = pEntry->GetBlock();
+    assert(pBlock);
 
-            const long status = Parse(pos, len);
-
-            if (status < 0)  //TODO: can this happen?
-                return NULL;
-
-            if (status > 0)  //nothing remains to be parsed
-                return NULL;
-
-            assert(m_entries);
-            assert(index < m_entries_count);
-        }
-
-        const BlockEntry* const pEntry = m_entries[index];
-        assert(pEntry);
-        assert(!pEntry->EOS());
-
-        const Block* const pBlock = pEntry->GetBlock();
-        assert(pBlock);
-
-        if (pBlock->GetTrackNumber() != tp.m_track)
-        {
-            ++index;
-            continue;
-        }
-
-        const long long tc_ = pBlock->GetTimeCode(this);
-
-        if (tc_ < tc)
-        {
-            ++index;
-            continue;
-        }
-
-        if (tc_ > tc)
-            return NULL;
-
-        const Tracks* const pTracks = m_pSegment->GetTracks();
-        assert(pTracks);
-
-        const long tn = static_cast<long>(tp.m_track);
-        const Track* const pTrack = pTracks->GetTrackByNumber(tn);
-
-        if (pTrack == NULL)
-            return NULL;
-
-        const long long type = pTrack->GetType();
-
-        if (type == 2)  //audio
-            return pEntry;
-
-        if (type != 1)  //not video
-            return NULL;
-
-        if (!pBlock->IsKey())
-            return NULL;
-
-        return pEntry;
+    if ((pBlock->GetTrackNumber() == tp.m_track) &&
+        (pBlock->GetTimeCode(this) == tc)) {
+      return pEntry;
     }
+  }
+
+  long index = 0;
+
+  for (;;) {
+    if (index >= m_entries_count) {
+      long long pos;
+      long len;
+
+      const long status = Parse(pos, len);
+
+      if (status < 0)  // TODO: can this happen?
+        return NULL;
+
+      if (status > 0)  // nothing remains to be parsed
+        return NULL;
+
+      assert(m_entries);
+      assert(index < m_entries_count);
+    }
+
+    const BlockEntry* const pEntry = m_entries[index];
+    assert(pEntry);
+    assert(!pEntry->EOS());
+
+    const Block* const pBlock = pEntry->GetBlock();
+    assert(pBlock);
+
+    if (pBlock->GetTrackNumber() != tp.m_track) {
+      ++index;
+      continue;
+    }
+
+    const long long tc_ = pBlock->GetTimeCode(this);
+
+    if (tc_ < tc) {
+      ++index;
+      continue;
+    }
+
+    if (tc_ > tc)
+      return NULL;
+
+    const Tracks* const pTracks = m_pSegment->GetTracks();
+    assert(pTracks);
+
+    const long tn = static_cast<long>(tp.m_track);
+    const Track* const pTrack = pTracks->GetTrackByNumber(tn);
+
+    if (pTrack == NULL)
+      return NULL;
+
+    const long long type = pTrack->GetType();
+
+    if (type == 2)  // audio
+      return pEntry;
+
+    if (type != 1)  // not video
+      return NULL;
+
+    if (!pBlock->IsKey())
+      return NULL;
+
+    return pEntry;
+  }
 
 #endif
-
 }
 
-
 #if 0
 const BlockEntry* Cluster::GetMaxKey(const VideoTrack* pTrack) const
 {
@@ -9015,97 +7875,46 @@
 }
 #endif
 
+BlockEntry::BlockEntry(Cluster* p, long idx) : m_pCluster(p), m_index(idx) {}
 
-BlockEntry::BlockEntry(Cluster* p, long idx) :
-    m_pCluster(p),
-    m_index(idx)
-{
+BlockEntry::~BlockEntry() {}
+
+bool BlockEntry::EOS() const { return (GetKind() == kBlockEOS); }
+
+const Cluster* BlockEntry::GetCluster() const { return m_pCluster; }
+
+long BlockEntry::GetIndex() const { return m_index; }
+
+SimpleBlock::SimpleBlock(Cluster* pCluster, long idx, long long start,
+                         long long size)
+    : BlockEntry(pCluster, idx), m_block(start, size, 0) {}
+
+long SimpleBlock::Parse() { return m_block.Parse(m_pCluster); }
+
+BlockEntry::Kind SimpleBlock::GetKind() const { return kBlockSimple; }
+
+const Block* SimpleBlock::GetBlock() const { return &m_block; }
+
+BlockGroup::BlockGroup(Cluster* pCluster, long idx, long long block_start,
+                       long long block_size, long long prev, long long next,
+                       long long duration, long long discard_padding)
+    : BlockEntry(pCluster, idx),
+      m_block(block_start, block_size, discard_padding),
+      m_prev(prev),
+      m_next(next),
+      m_duration(duration) {}
+
+long BlockGroup::Parse() {
+  const long status = m_block.Parse(m_pCluster);
+
+  if (status)
+    return status;
+
+  m_block.SetKey((m_prev > 0) && (m_next <= 0));
+
+  return 0;
 }
 
-
-BlockEntry::~BlockEntry()
-{
-}
-
-
-bool BlockEntry::EOS() const
-{
-    return (GetKind() == kBlockEOS);
-}
-
-
-const Cluster* BlockEntry::GetCluster() const
-{
-    return m_pCluster;
-}
-
-
-long BlockEntry::GetIndex() const
-{
-    return m_index;
-}
-
-
-SimpleBlock::SimpleBlock(
-    Cluster* pCluster,
-    long idx,
-    long long start,
-    long long size) :
-    BlockEntry(pCluster, idx),
-    m_block(start, size, 0)
-{
-}
-
-
-long SimpleBlock::Parse()
-{
-    return m_block.Parse(m_pCluster);
-}
-
-
-BlockEntry::Kind SimpleBlock::GetKind() const
-{
-    return kBlockSimple;
-}
-
-
-const Block* SimpleBlock::GetBlock() const
-{
-    return &m_block;
-}
-
-
-BlockGroup::BlockGroup(
-    Cluster* pCluster,
-    long idx,
-    long long block_start,
-    long long block_size,
-    long long prev,
-    long long next,
-    long long duration,
-    long long discard_padding) :
-    BlockEntry(pCluster, idx),
-    m_block(block_start, block_size, discard_padding),
-    m_prev(prev),
-    m_next(next),
-    m_duration(duration)
-{
-}
-
-
-long BlockGroup::Parse()
-{
-    const long status = m_block.Parse(m_pCluster);
-
-    if (status)
-        return status;
-
-    m_block.SetKey((m_prev > 0) && (m_next <= 0));
-
-    return 0;
-}
-
-
 #if 0
 void BlockGroup::ParseBlock(long long start, long long size)
 {
@@ -9122,496 +7931,428 @@
 }
 #endif
 
+BlockEntry::Kind BlockGroup::GetKind() const { return kBlockGroup; }
 
-BlockEntry::Kind BlockGroup::GetKind() const
-{
-    return kBlockGroup;
-}
+const Block* BlockGroup::GetBlock() const { return &m_block; }
 
+long long BlockGroup::GetPrevTimeCode() const { return m_prev; }
 
-const Block* BlockGroup::GetBlock() const
-{
-    return &m_block;
-}
+long long BlockGroup::GetNextTimeCode() const { return m_next; }
 
+long long BlockGroup::GetDurationTimeCode() const { return m_duration; }
 
-long long BlockGroup::GetPrevTimeCode() const
-{
-    return m_prev;
-}
+Block::Block(long long start, long long size_, long long discard_padding)
+    : m_start(start),
+      m_size(size_),
+      m_track(0),
+      m_timecode(-1),
+      m_flags(0),
+      m_frames(NULL),
+      m_frame_count(-1),
+      m_discard_padding(discard_padding) {}
 
+Block::~Block() { delete[] m_frames; }
 
-long long BlockGroup::GetNextTimeCode() const
-{
-    return m_next;
-}
+long Block::Parse(const Cluster* pCluster) {
+  if (pCluster == NULL)
+    return -1;
 
-long long BlockGroup::GetDurationTimeCode() const
-{
-    return m_duration;
-}
+  if (pCluster->m_pSegment == NULL)
+    return -1;
 
-Block::Block(long long start, long long size_, long long discard_padding) :
-    m_start(start),
-    m_size(size_),
-    m_track(0),
-    m_timecode(-1),
-    m_flags(0),
-    m_frames(NULL),
-    m_frame_count(-1),
-    m_discard_padding(discard_padding)
-{
-}
+  assert(m_start >= 0);
+  assert(m_size >= 0);
+  assert(m_track <= 0);
+  assert(m_frames == NULL);
+  assert(m_frame_count <= 0);
 
+  long long pos = m_start;
+  const long long stop = m_start + m_size;
 
-Block::~Block()
-{
-    delete[] m_frames;
-}
+  long len;
 
+  IMkvReader* const pReader = pCluster->m_pSegment->m_pReader;
 
-long Block::Parse(const Cluster* pCluster)
-{
-    if (pCluster == NULL)
-        return -1;
+  m_track = ReadUInt(pReader, pos, len);
 
-    if (pCluster->m_pSegment == NULL)
-        return -1;
+  if (m_track <= 0)
+    return E_FILE_FORMAT_INVALID;
 
-    assert(m_start >= 0);
-    assert(m_size >= 0);
-    assert(m_track <= 0);
-    assert(m_frames == NULL);
-    assert(m_frame_count <= 0);
+  if ((pos + len) > stop)
+    return E_FILE_FORMAT_INVALID;
 
-    long long pos = m_start;
-    const long long stop = m_start + m_size;
+  pos += len;  // consume track number
 
-    long len;
+  if ((stop - pos) < 2)
+    return E_FILE_FORMAT_INVALID;
 
-    IMkvReader* const pReader = pCluster->m_pSegment->m_pReader;
+  long status;
+  long long value;
 
-    m_track = ReadUInt(pReader, pos, len);
+  status = UnserializeInt(pReader, pos, 2, value);
 
-    if (m_track <= 0)
-        return E_FILE_FORMAT_INVALID;
+  if (status)
+    return E_FILE_FORMAT_INVALID;
 
-    if ((pos + len) > stop)
-        return E_FILE_FORMAT_INVALID;
+  if (value < SHRT_MIN)
+    return E_FILE_FORMAT_INVALID;
 
-    pos += len;  //consume track number
+  if (value > SHRT_MAX)
+    return E_FILE_FORMAT_INVALID;
 
-    if ((stop - pos) < 2)
-        return E_FILE_FORMAT_INVALID;
+  m_timecode = static_cast<short>(value);
 
-    long status;
-    long long value;
+  pos += 2;
 
-    status = UnserializeInt(pReader, pos, 2, value);
+  if ((stop - pos) <= 0)
+    return E_FILE_FORMAT_INVALID;
 
-    if (status)
-        return E_FILE_FORMAT_INVALID;
+  status = pReader->Read(pos, 1, &m_flags);
 
-    if (value < SHRT_MIN)
-        return E_FILE_FORMAT_INVALID;
+  if (status)
+    return E_FILE_FORMAT_INVALID;
 
-    if (value > SHRT_MAX)
-        return E_FILE_FORMAT_INVALID;
+  const int lacing = int(m_flags & 0x06) >> 1;
 
-    m_timecode = static_cast<short>(value);
+  ++pos;  // consume flags byte
 
-    pos += 2;
+  if (lacing == 0) {  // no lacing
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
 
-    if ((stop - pos) <= 0)
-        return E_FILE_FORMAT_INVALID;
-
-    status = pReader->Read(pos, 1, &m_flags);
-
-    if (status)
-        return E_FILE_FORMAT_INVALID;
-
-    const int lacing = int(m_flags & 0x06) >> 1;
-
-    ++pos;  //consume flags byte
-
-    if (lacing == 0)  //no lacing
-    {
-        if (pos > stop)
-            return E_FILE_FORMAT_INVALID;
-
-        m_frame_count = 1;
-        m_frames = new Frame[m_frame_count];
-
-        Frame& f = m_frames[0];
-        f.pos = pos;
-
-        const long long frame_size = stop - pos;
-
-        if (frame_size > LONG_MAX)
-            return E_FILE_FORMAT_INVALID;
-
-        f.len = static_cast<long>(frame_size);
-
-        return 0;  //success
-    }
-
-    if (pos >= stop)
-        return E_FILE_FORMAT_INVALID;
-
-    unsigned char biased_count;
-
-    status = pReader->Read(pos, 1, &biased_count);
-
-    if (status)
-        return E_FILE_FORMAT_INVALID;
-
-    ++pos;  //consume frame count
-    assert(pos <= stop);
-
-    m_frame_count = int(biased_count) + 1;
-
+    m_frame_count = 1;
     m_frames = new Frame[m_frame_count];
-    assert(m_frames);
 
-    if (lacing == 1)  //Xiph
-    {
-        Frame* pf = m_frames;
-        Frame* const pf_end = pf + m_frame_count;
+    Frame& f = m_frames[0];
+    f.pos = pos;
 
-        long size = 0;
-        int frame_count = m_frame_count;
+    const long long frame_size = stop - pos;
 
-        while (frame_count > 1)
-        {
-            long frame_size = 0;
+    if (frame_size > LONG_MAX)
+      return E_FILE_FORMAT_INVALID;
 
-            for (;;)
-            {
-                unsigned char val;
+    f.len = static_cast<long>(frame_size);
 
-                if (pos >= stop)
-                    return E_FILE_FORMAT_INVALID;
+    return 0;  // success
+  }
 
-                status = pReader->Read(pos, 1, &val);
+  if (pos >= stop)
+    return E_FILE_FORMAT_INVALID;
 
-                if (status)
-                    return E_FILE_FORMAT_INVALID;
+  unsigned char biased_count;
 
-                ++pos;  //consume xiph size byte
+  status = pReader->Read(pos, 1, &biased_count);
 
-                frame_size += val;
+  if (status)
+    return E_FILE_FORMAT_INVALID;
 
-                if (val < 255)
-                    break;
-            }
+  ++pos;  // consume frame count
+  assert(pos <= stop);
 
-            Frame& f = *pf++;
-            assert(pf < pf_end);
+  m_frame_count = int(biased_count) + 1;
 
-            f.pos = 0;  //patch later
+  m_frames = new Frame[m_frame_count];
+  assert(m_frames);
 
-            f.len = frame_size;
-            size += frame_size;  //contribution of this frame
+  if (lacing == 1) {  // Xiph
+    Frame* pf = m_frames;
+    Frame* const pf_end = pf + m_frame_count;
 
-            --frame_count;
-        }
+    long size = 0;
+    int frame_count = m_frame_count;
 
-        assert(pf < pf_end);
-        assert(pos <= stop);
+    while (frame_count > 1) {
+      long frame_size = 0;
 
-        {
-            Frame& f = *pf++;
-
-            if (pf != pf_end)
-                return E_FILE_FORMAT_INVALID;
-
-            f.pos = 0;  //patch later
-
-            const long long total_size = stop - pos;
-
-            if (total_size < size)
-                return E_FILE_FORMAT_INVALID;
-
-            const long long frame_size = total_size - size;
-
-            if (frame_size > LONG_MAX)
-                return E_FILE_FORMAT_INVALID;
-
-            f.len = static_cast<long>(frame_size);
-        }
-
-        pf = m_frames;
-        while (pf != pf_end)
-        {
-            Frame& f = *pf++;
-            assert((pos + f.len) <= stop);
-
-            f.pos = pos;
-            pos += f.len;
-        }
-
-        assert(pos == stop);
-    }
-    else if (lacing == 2)  //fixed-size lacing
-    {
-        const long long total_size = stop - pos;
-
-        if ((total_size % m_frame_count) != 0)
-            return E_FILE_FORMAT_INVALID;
-
-        const long long frame_size = total_size / m_frame_count;
-
-        if (frame_size > LONG_MAX)
-            return E_FILE_FORMAT_INVALID;
-
-        Frame* pf = m_frames;
-        Frame* const pf_end = pf + m_frame_count;
-
-        while (pf != pf_end)
-        {
-            assert((pos + frame_size) <= stop);
-
-            Frame& f = *pf++;
-
-            f.pos = pos;
-            f.len = static_cast<long>(frame_size);
-
-            pos += frame_size;
-        }
-
-        assert(pos == stop);
-    }
-    else
-    {
-        assert(lacing == 3);  //EBML lacing
+      for (;;) {
+        unsigned char val;
 
         if (pos >= stop)
-            return E_FILE_FORMAT_INVALID;
+          return E_FILE_FORMAT_INVALID;
 
-        long size = 0;
-        int frame_count = m_frame_count;
+        status = pReader->Read(pos, 1, &val);
 
-        long long frame_size = ReadUInt(pReader, pos, len);
+        if (status)
+          return E_FILE_FORMAT_INVALID;
 
-        if (frame_size < 0)
-            return E_FILE_FORMAT_INVALID;
+        ++pos;  // consume xiph size byte
 
-        if (frame_size > LONG_MAX)
-            return E_FILE_FORMAT_INVALID;
+        frame_size += val;
 
-        if ((pos + len) > stop)
-            return E_FILE_FORMAT_INVALID;
+        if (val < 255)
+          break;
+      }
 
-        pos += len; //consume length of size of first frame
+      Frame& f = *pf++;
+      assert(pf < pf_end);
 
-        if ((pos + frame_size) > stop)
-            return E_FILE_FORMAT_INVALID;
+      f.pos = 0;  // patch later
 
-        Frame* pf = m_frames;
-        Frame* const pf_end = pf + m_frame_count;
+      f.len = frame_size;
+      size += frame_size;  // contribution of this frame
 
-        {
-            Frame& curr = *pf;
-
-            curr.pos = 0;  //patch later
-
-            curr.len = static_cast<long>(frame_size);
-            size += curr.len;  //contribution of this frame
-        }
-
-        --frame_count;
-
-        while (frame_count > 1)
-        {
-            if (pos >= stop)
-                return E_FILE_FORMAT_INVALID;
-
-            assert(pf < pf_end);
-
-            const Frame& prev = *pf++;
-            assert(prev.len == frame_size);
-            if (prev.len != frame_size)
-                return E_FILE_FORMAT_INVALID;
-
-            assert(pf < pf_end);
-
-            Frame& curr = *pf;
-
-            curr.pos = 0;  //patch later
-
-            const long long delta_size_ = ReadUInt(pReader, pos, len);
-
-            if (delta_size_ < 0)
-                return E_FILE_FORMAT_INVALID;
-
-            if ((pos + len) > stop)
-                return E_FILE_FORMAT_INVALID;
-
-            pos += len;  //consume length of (delta) size
-            assert(pos <= stop);
-
-            const int exp = 7*len - 1;
-            const long long bias = (1LL << exp) - 1LL;
-            const long long delta_size = delta_size_ - bias;
-
-            frame_size += delta_size;
-
-            if (frame_size < 0)
-                return E_FILE_FORMAT_INVALID;
-
-            if (frame_size > LONG_MAX)
-                return E_FILE_FORMAT_INVALID;
-
-            curr.len = static_cast<long>(frame_size);
-            size += curr.len;  //contribution of this frame
-
-            --frame_count;
-        }
-
-        {
-            assert(pos <= stop);
-            assert(pf < pf_end);
-
-            const Frame& prev = *pf++;
-            assert(prev.len == frame_size);
-            if (prev.len != frame_size)
-                return E_FILE_FORMAT_INVALID;
-
-            assert(pf < pf_end);
-
-            Frame& curr = *pf++;
-            assert(pf == pf_end);
-
-            curr.pos = 0;  //patch later
-
-            const long long total_size = stop - pos;
-
-            if (total_size < size)
-                return E_FILE_FORMAT_INVALID;
-
-            frame_size = total_size - size;
-
-            if (frame_size > LONG_MAX)
-                return E_FILE_FORMAT_INVALID;
-
-            curr.len = static_cast<long>(frame_size);
-        }
-
-        pf = m_frames;
-        while (pf != pf_end)
-        {
-            Frame& f = *pf++;
-            assert((pos + f.len) <= stop);
-
-            f.pos = pos;
-            pos += f.len;
-        }
-
-        assert(pos == stop);
+      --frame_count;
     }
 
-    return 0;  //success
+    assert(pf < pf_end);
+    assert(pos <= stop);
+
+    {
+      Frame& f = *pf++;
+
+      if (pf != pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      f.pos = 0;  // patch later
+
+      const long long total_size = stop - pos;
+
+      if (total_size < size)
+        return E_FILE_FORMAT_INVALID;
+
+      const long long frame_size = total_size - size;
+
+      if (frame_size > LONG_MAX)
+        return E_FILE_FORMAT_INVALID;
+
+      f.len = static_cast<long>(frame_size);
+    }
+
+    pf = m_frames;
+    while (pf != pf_end) {
+      Frame& f = *pf++;
+      assert((pos + f.len) <= stop);
+
+      f.pos = pos;
+      pos += f.len;
+    }
+
+    assert(pos == stop);
+  } else if (lacing == 2) {  // fixed-size lacing
+    const long long total_size = stop - pos;
+
+    if ((total_size % m_frame_count) != 0)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long frame_size = total_size / m_frame_count;
+
+    if (frame_size > LONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    Frame* pf = m_frames;
+    Frame* const pf_end = pf + m_frame_count;
+
+    while (pf != pf_end) {
+      assert((pos + frame_size) <= stop);
+
+      Frame& f = *pf++;
+
+      f.pos = pos;
+      f.len = static_cast<long>(frame_size);
+
+      pos += frame_size;
+    }
+
+    assert(pos == stop);
+  } else {
+    assert(lacing == 3);  // EBML lacing
+
+    if (pos >= stop)
+      return E_FILE_FORMAT_INVALID;
+
+    long size = 0;
+    int frame_count = m_frame_count;
+
+    long long frame_size = ReadUInt(pReader, pos, len);
+
+    if (frame_size < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (frame_size > LONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume length of size of first frame
+
+    if ((pos + frame_size) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    Frame* pf = m_frames;
+    Frame* const pf_end = pf + m_frame_count;
+
+    {
+      Frame& curr = *pf;
+
+      curr.pos = 0;  // patch later
+
+      curr.len = static_cast<long>(frame_size);
+      size += curr.len;  // contribution of this frame
+    }
+
+    --frame_count;
+
+    while (frame_count > 1) {
+      if (pos >= stop)
+        return E_FILE_FORMAT_INVALID;
+
+      assert(pf < pf_end);
+
+      const Frame& prev = *pf++;
+      assert(prev.len == frame_size);
+      if (prev.len != frame_size)
+        return E_FILE_FORMAT_INVALID;
+
+      assert(pf < pf_end);
+
+      Frame& curr = *pf;
+
+      curr.pos = 0;  // patch later
+
+      const long long delta_size_ = ReadUInt(pReader, pos, len);
+
+      if (delta_size_ < 0)
+        return E_FILE_FORMAT_INVALID;
+
+      if ((pos + len) > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      pos += len;  // consume length of (delta) size
+      assert(pos <= stop);
+
+      const int exp = 7 * len - 1;
+      const long long bias = (1LL << exp) - 1LL;
+      const long long delta_size = delta_size_ - bias;
+
+      frame_size += delta_size;
+
+      if (frame_size < 0)
+        return E_FILE_FORMAT_INVALID;
+
+      if (frame_size > LONG_MAX)
+        return E_FILE_FORMAT_INVALID;
+
+      curr.len = static_cast<long>(frame_size);
+      size += curr.len;  // contribution of this frame
+
+      --frame_count;
+    }
+
+    {
+      assert(pos <= stop);
+      assert(pf < pf_end);
+
+      const Frame& prev = *pf++;
+      assert(prev.len == frame_size);
+      if (prev.len != frame_size)
+        return E_FILE_FORMAT_INVALID;
+
+      assert(pf < pf_end);
+
+      Frame& curr = *pf++;
+      assert(pf == pf_end);
+
+      curr.pos = 0;  // patch later
+
+      const long long total_size = stop - pos;
+
+      if (total_size < size)
+        return E_FILE_FORMAT_INVALID;
+
+      frame_size = total_size - size;
+
+      if (frame_size > LONG_MAX)
+        return E_FILE_FORMAT_INVALID;
+
+      curr.len = static_cast<long>(frame_size);
+    }
+
+    pf = m_frames;
+    while (pf != pf_end) {
+      Frame& f = *pf++;
+      assert((pos + f.len) <= stop);
+
+      f.pos = pos;
+      pos += f.len;
+    }
+
+    assert(pos == stop);
+  }
+
+  return 0;  // success
 }
 
+long long Block::GetTimeCode(const Cluster* pCluster) const {
+  if (pCluster == 0)
+    return m_timecode;
 
-long long Block::GetTimeCode(const Cluster* pCluster) const
-{
-    if (pCluster == 0)
-        return m_timecode;
+  const long long tc0 = pCluster->GetTimeCode();
+  assert(tc0 >= 0);
 
-    const long long tc0 = pCluster->GetTimeCode();
-    assert(tc0 >= 0);
+  const long long tc = tc0 + m_timecode;
 
-    const long long tc = tc0 + m_timecode;
-
-    return tc;  //unscaled timecode units
+  return tc;  // unscaled timecode units
 }
 
+long long Block::GetTime(const Cluster* pCluster) const {
+  assert(pCluster);
 
-long long Block::GetTime(const Cluster* pCluster) const
-{
-    assert(pCluster);
+  const long long tc = GetTimeCode(pCluster);
 
-    const long long tc = GetTimeCode(pCluster);
+  const Segment* const pSegment = pCluster->m_pSegment;
+  const SegmentInfo* const pInfo = pSegment->GetInfo();
+  assert(pInfo);
 
-    const Segment* const pSegment = pCluster->m_pSegment;
-    const SegmentInfo* const pInfo = pSegment->GetInfo();
-    assert(pInfo);
+  const long long scale = pInfo->GetTimeCodeScale();
+  assert(scale >= 1);
 
-    const long long scale = pInfo->GetTimeCodeScale();
-    assert(scale >= 1);
+  const long long ns = tc * scale;
 
-    const long long ns = tc * scale;
-
-    return ns;
+  return ns;
 }
 
+long long Block::GetTrackNumber() const { return m_track; }
 
-long long Block::GetTrackNumber() const
-{
-    return m_track;
+bool Block::IsKey() const {
+  return ((m_flags & static_cast<unsigned char>(1 << 7)) != 0);
 }
 
-
-bool Block::IsKey() const
-{
-    return ((m_flags & static_cast<unsigned char>(1 << 7)) != 0);
+void Block::SetKey(bool bKey) {
+  if (bKey)
+    m_flags |= static_cast<unsigned char>(1 << 7);
+  else
+    m_flags &= 0x7F;
 }
 
+bool Block::IsInvisible() const { return bool(int(m_flags & 0x08) != 0); }
 
-void Block::SetKey(bool bKey)
-{
-    if (bKey)
-        m_flags |= static_cast<unsigned char>(1 << 7);
-    else
-        m_flags &= 0x7F;
+Block::Lacing Block::GetLacing() const {
+  const int value = int(m_flags & 0x06) >> 1;
+  return static_cast<Lacing>(value);
 }
 
+int Block::GetFrameCount() const { return m_frame_count; }
 
-bool Block::IsInvisible() const
-{
-    return bool(int(m_flags & 0x08) != 0);
+const Block::Frame& Block::GetFrame(int idx) const {
+  assert(idx >= 0);
+  assert(idx < m_frame_count);
+
+  const Frame& f = m_frames[idx];
+  assert(f.pos > 0);
+  assert(f.len > 0);
+
+  return f;
 }
 
+long Block::Frame::Read(IMkvReader* pReader, unsigned char* buf) const {
+  assert(pReader);
+  assert(buf);
 
-Block::Lacing Block::GetLacing() const
-{
-    const int value = int(m_flags & 0x06) >> 1;
-    return static_cast<Lacing>(value);
+  const long status = pReader->Read(pos, len, buf);
+  return status;
 }
 
+long long Block::GetDiscardPadding() const { return m_discard_padding; }
 
-int Block::GetFrameCount() const
-{
-    return m_frame_count;
-}
-
-
-const Block::Frame& Block::GetFrame(int idx) const
-{
-    assert(idx >= 0);
-    assert(idx < m_frame_count);
-
-    const Frame& f = m_frames[idx];
-    assert(f.pos > 0);
-    assert(f.len > 0);
-
-    return f;
-}
-
-
-long Block::Frame::Read(IMkvReader* pReader, unsigned char* buf) const
-{
-    assert(pReader);
-    assert(buf);
-
-    const long status = pReader->Read(pos, len, buf);
-    return status;
-}
-
-long long Block::GetDiscardPadding() const
-{
-    return m_discard_padding;
-}
-
-}  //end namespace mkvparser
+}  // end namespace mkvparser
diff --git a/third_party/libwebm/mkvparser.hpp b/third_party/libwebm/mkvparser.hpp
index 7184d26..3e17d07 100644
--- a/third_party/libwebm/mkvparser.hpp
+++ b/third_party/libwebm/mkvparser.hpp
@@ -13,19 +13,18 @@
 #include <cstdio>
 #include <cstddef>
 
-namespace mkvparser
-{
+namespace mkvparser {
 
 const int E_FILE_FORMAT_INVALID = -2;
 const int E_BUFFER_NOT_FULL = -3;
 
-class IMkvReader
-{
-public:
-    virtual int Read(long long pos, long len, unsigned char* buf) = 0;
-    virtual int Length(long long* total, long long* available) = 0;
-protected:
-    virtual ~IMkvReader();
+class IMkvReader {
+ public:
+  virtual int Read(long long pos, long len, unsigned char* buf) = 0;
+  virtual int Length(long long* total, long long* available) = 0;
+
+ protected:
+  virtual ~IMkvReader();
 };
 
 long long GetUIntLength(IMkvReader*, long long, long&);
@@ -35,170 +34,148 @@
 long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
 long UnserializeInt(IMkvReader*, long long pos, long len, long long& result);
 
-long UnserializeString(
-        IMkvReader*,
-        long long pos,
-        long long size,
-        char*& str);
+long UnserializeString(IMkvReader*, long long pos, long long size, char*& str);
 
-long ParseElementHeader(
-    IMkvReader* pReader,
-    long long& pos,  //consume id and size fields
-    long long stop,  //if you know size of element's parent
-    long long& id,
-    long long& size);
+long ParseElementHeader(IMkvReader* pReader,
+                        long long& pos,  // consume id and size fields
+                        long long stop,  // if you know size of element's parent
+                        long long& id, long long& size);
 
 bool Match(IMkvReader*, long long&, unsigned long, long long&);
 bool Match(IMkvReader*, long long&, unsigned long, unsigned char*&, size_t&);
 
 void GetVersion(int& major, int& minor, int& build, int& revision);
 
-struct EBMLHeader
-{
-    EBMLHeader();
-    ~EBMLHeader();
-    long long m_version;
-    long long m_readVersion;
-    long long m_maxIdLength;
-    long long m_maxSizeLength;
-    char* m_docType;
-    long long m_docTypeVersion;
-    long long m_docTypeReadVersion;
+struct EBMLHeader {
+  EBMLHeader();
+  ~EBMLHeader();
+  long long m_version;
+  long long m_readVersion;
+  long long m_maxIdLength;
+  long long m_maxSizeLength;
+  char* m_docType;
+  long long m_docTypeVersion;
+  long long m_docTypeReadVersion;
 
-    long long Parse(IMkvReader*, long long&);
-    void Init();
+  long long Parse(IMkvReader*, long long&);
+  void Init();
 };
 
-
 class Segment;
 class Track;
 class Cluster;
 
-class Block
-{
-    Block(const Block&);
-    Block& operator=(const Block&);
+class Block {
+  Block(const Block&);
+  Block& operator=(const Block&);
 
-public:
-    const long long m_start;
-    const long long m_size;
+ public:
+  const long long m_start;
+  const long long m_size;
 
-    Block(long long start, long long size, long long discard_padding);
-    ~Block();
+  Block(long long start, long long size, long long discard_padding);
+  ~Block();
 
-    long Parse(const Cluster*);
+  long Parse(const Cluster*);
 
-    long long GetTrackNumber() const;
-    long long GetTimeCode(const Cluster*) const;  //absolute, but not scaled
-    long long GetTime(const Cluster*) const;      //absolute, and scaled (ns)
-    bool IsKey() const;
-    void SetKey(bool);
-    bool IsInvisible() const;
+  long long GetTrackNumber() const;
+  long long GetTimeCode(const Cluster*) const;  // absolute, but not scaled
+  long long GetTime(const Cluster*) const;  // absolute, and scaled (ns)
+  bool IsKey() const;
+  void SetKey(bool);
+  bool IsInvisible() const;
 
-    enum Lacing { kLacingNone, kLacingXiph, kLacingFixed, kLacingEbml };
-    Lacing GetLacing() const;
+  enum Lacing { kLacingNone, kLacingXiph, kLacingFixed, kLacingEbml };
+  Lacing GetLacing() const;
 
-    int GetFrameCount() const;  //to index frames: [0, count)
+  int GetFrameCount() const;  // to index frames: [0, count)
 
-    struct Frame
-    {
-        long long pos;  //absolute offset
-        long len;
+  struct Frame {
+    long long pos;  // absolute offset
+    long len;
 
-        long Read(IMkvReader*, unsigned char*) const;
-    };
+    long Read(IMkvReader*, unsigned char*) const;
+  };
 
-    const Frame& GetFrame(int frame_index) const;
+  const Frame& GetFrame(int frame_index) const;
 
-    long long GetDiscardPadding() const;
+  long long GetDiscardPadding() const;
 
-private:
-    long long m_track;   //Track::Number()
-    short m_timecode;  //relative to cluster
-    unsigned char m_flags;
+ private:
+  long long m_track;  // Track::Number()
+  short m_timecode;  // relative to cluster
+  unsigned char m_flags;
 
-    Frame* m_frames;
-    int m_frame_count;
+  Frame* m_frames;
+  int m_frame_count;
 
-protected:
-    const long long m_discard_padding;
+ protected:
+  const long long m_discard_padding;
 };
 
+class BlockEntry {
+  BlockEntry(const BlockEntry&);
+  BlockEntry& operator=(const BlockEntry&);
 
-class BlockEntry
-{
-    BlockEntry(const BlockEntry&);
-    BlockEntry& operator=(const BlockEntry&);
+ protected:
+  BlockEntry(Cluster*, long index);
 
-protected:
-    BlockEntry(Cluster*, long index);
+ public:
+  virtual ~BlockEntry();
 
-public:
-    virtual ~BlockEntry();
+  bool EOS() const;
+  const Cluster* GetCluster() const;
+  long GetIndex() const;
+  virtual const Block* GetBlock() const = 0;
 
-    bool EOS() const;
-    const Cluster* GetCluster() const;
-    long GetIndex() const;
-    virtual const Block* GetBlock() const = 0;
+  enum Kind { kBlockEOS, kBlockSimple, kBlockGroup };
+  virtual Kind GetKind() const = 0;
 
-    enum Kind { kBlockEOS, kBlockSimple, kBlockGroup };
-    virtual Kind GetKind() const = 0;
-
-protected:
-    Cluster* const m_pCluster;
-    const long m_index;
-
+ protected:
+  Cluster* const m_pCluster;
+  const long m_index;
 };
 
+class SimpleBlock : public BlockEntry {
+  SimpleBlock(const SimpleBlock&);
+  SimpleBlock& operator=(const SimpleBlock&);
 
-class SimpleBlock : public BlockEntry
-{
-    SimpleBlock(const SimpleBlock&);
-    SimpleBlock& operator=(const SimpleBlock&);
+ public:
+  SimpleBlock(Cluster*, long index, long long start, long long size);
+  long Parse();
 
-public:
-    SimpleBlock(Cluster*, long index, long long start, long long size);
-    long Parse();
+  Kind GetKind() const;
+  const Block* GetBlock() const;
 
-    Kind GetKind() const;
-    const Block* GetBlock() const;
-
-protected:
-    Block m_block;
-
+ protected:
+  Block m_block;
 };
 
+class BlockGroup : public BlockEntry {
+  BlockGroup(const BlockGroup&);
+  BlockGroup& operator=(const BlockGroup&);
 
-class BlockGroup : public BlockEntry
-{
-    BlockGroup(const BlockGroup&);
-    BlockGroup& operator=(const BlockGroup&);
+ public:
+  BlockGroup(Cluster*, long index,
+             long long block_start,  // absolute pos of block's payload
+             long long block_size,  // size of block's payload
+             long long prev, long long next, long long duration,
+             long long discard_padding);
 
-public:
-    BlockGroup(
-        Cluster*,
-        long index,
-        long long block_start, //absolute pos of block's payload
-        long long block_size,  //size of block's payload
-        long long prev,
-        long long next,
-        long long duration,
-        long long discard_padding);
+  long Parse();
 
-    long Parse();
+  Kind GetKind() const;
+  const Block* GetBlock() const;
 
-    Kind GetKind() const;
-    const Block* GetBlock() const;
+  long long GetPrevTimeCode() const;  // relative to block's time
+  long long GetNextTimeCode() const;  // as above
+  long long GetDurationTimeCode() const;
 
-    long long GetPrevTimeCode() const;  //relative to block's time
-    long long GetNextTimeCode() const;  //as above
-    long long GetDurationTimeCode() const;
-
-private:
-    Block m_block;
-    const long long m_prev;
-    const long long m_next;
-    const long long m_duration;
+ private:
+  Block m_block;
+  const long long m_prev;
+  const long long m_next;
+  const long long m_duration;
 };
 
 ///////////////////////////////////////////////////////////////
@@ -206,635 +183,552 @@
 // Elements used to describe if the track data has been encrypted or
 // compressed with zlib or header stripping.
 class ContentEncoding {
-public:
-    enum {
-      kCTR = 1
-    };
+ public:
+  enum { kCTR = 1 };
 
-    ContentEncoding();
-    ~ContentEncoding();
+  ContentEncoding();
+  ~ContentEncoding();
 
-    // ContentCompression element names
-    struct ContentCompression {
-        ContentCompression();
-        ~ContentCompression();
+  // ContentCompression element names
+  struct ContentCompression {
+    ContentCompression();
+    ~ContentCompression();
 
-        unsigned long long algo;
-        unsigned char* settings;
-        long long settings_len;
-    };
+    unsigned long long algo;
+    unsigned char* settings;
+    long long settings_len;
+  };
 
-    // ContentEncAESSettings element names
-    struct ContentEncAESSettings {
-      ContentEncAESSettings() : cipher_mode(kCTR) {}
-      ~ContentEncAESSettings() {}
+  // ContentEncAESSettings element names
+  struct ContentEncAESSettings {
+    ContentEncAESSettings() : cipher_mode(kCTR) {}
+    ~ContentEncAESSettings() {}
 
-      unsigned long long cipher_mode;
-    };
+    unsigned long long cipher_mode;
+  };
 
-    // ContentEncryption element names
-    struct ContentEncryption {
-        ContentEncryption();
-        ~ContentEncryption();
+  // ContentEncryption element names
+  struct ContentEncryption {
+    ContentEncryption();
+    ~ContentEncryption();
 
-        unsigned long long algo;
-        unsigned char* key_id;
-        long long key_id_len;
-        unsigned char* signature;
-        long long signature_len;
-        unsigned char* sig_key_id;
-        long long sig_key_id_len;
-        unsigned long long sig_algo;
-        unsigned long long sig_hash_algo;
+    unsigned long long algo;
+    unsigned char* key_id;
+    long long key_id_len;
+    unsigned char* signature;
+    long long signature_len;
+    unsigned char* sig_key_id;
+    long long sig_key_id_len;
+    unsigned long long sig_algo;
+    unsigned long long sig_hash_algo;
 
-        ContentEncAESSettings aes_settings;
-    };
+    ContentEncAESSettings aes_settings;
+  };
 
-    // Returns ContentCompression represented by |idx|. Returns NULL if |idx|
-    // is out of bounds.
-    const ContentCompression* GetCompressionByIndex(unsigned long idx) const;
+  // Returns ContentCompression represented by |idx|. Returns NULL if |idx|
+  // is out of bounds.
+  const ContentCompression* GetCompressionByIndex(unsigned long idx) const;
 
-    // Returns number of ContentCompression elements in this ContentEncoding
-    // element.
-    unsigned long GetCompressionCount() const;
+  // Returns number of ContentCompression elements in this ContentEncoding
+  // element.
+  unsigned long GetCompressionCount() const;
 
-    // Parses the ContentCompression element from |pReader|. |start| is the
-    // starting offset of the ContentCompression payload. |size| is the size in
-    // bytes of the ContentCompression payload. |compression| is where the parsed
-    // values will be stored.
-    long ParseCompressionEntry(long long start,
-                               long long size,
-                               IMkvReader* pReader,
-                               ContentCompression* compression);
+  // Parses the ContentCompression element from |pReader|. |start| is the
+  // starting offset of the ContentCompression payload. |size| is the size in
+  // bytes of the ContentCompression payload. |compression| is where the parsed
+  // values will be stored.
+  long ParseCompressionEntry(long long start, long long size,
+                             IMkvReader* pReader,
+                             ContentCompression* compression);
 
-    // Returns ContentEncryption represented by |idx|. Returns NULL if |idx|
-    // is out of bounds.
-    const ContentEncryption* GetEncryptionByIndex(unsigned long idx) const;
+  // Returns ContentEncryption represented by |idx|. Returns NULL if |idx|
+  // is out of bounds.
+  const ContentEncryption* GetEncryptionByIndex(unsigned long idx) const;
 
-    // Returns number of ContentEncryption elements in this ContentEncoding
-    // element.
-    unsigned long GetEncryptionCount() const;
+  // Returns number of ContentEncryption elements in this ContentEncoding
+  // element.
+  unsigned long GetEncryptionCount() const;
 
-    // Parses the ContentEncAESSettings element from |pReader|. |start| is the
-    // starting offset of the ContentEncAESSettings payload. |size| is the
-    // size in bytes of the ContentEncAESSettings payload. |encryption| is
-    // where the parsed values will be stored.
-    long ParseContentEncAESSettingsEntry(long long start,
-                                         long long size,
-                                         IMkvReader* pReader,
-                                         ContentEncAESSettings* aes);
+  // Parses the ContentEncAESSettings element from |pReader|. |start| is the
+  // starting offset of the ContentEncAESSettings payload. |size| is the
+  // size in bytes of the ContentEncAESSettings payload. |encryption| is
+  // where the parsed values will be stored.
+  long ParseContentEncAESSettingsEntry(long long start, long long size,
+                                       IMkvReader* pReader,
+                                       ContentEncAESSettings* aes);
 
-    // Parses the ContentEncoding element from |pReader|. |start| is the
-    // starting offset of the ContentEncoding payload. |size| is the size in
-    // bytes of the ContentEncoding payload. Returns true on success.
-    long ParseContentEncodingEntry(long long start,
-                                   long long size,
-                                   IMkvReader* pReader);
+  // Parses the ContentEncoding element from |pReader|. |start| is the
+  // starting offset of the ContentEncoding payload. |size| is the size in
+  // bytes of the ContentEncoding payload. Returns true on success.
+  long ParseContentEncodingEntry(long long start, long long size,
+                                 IMkvReader* pReader);
 
-    // Parses the ContentEncryption element from |pReader|. |start| is the
-    // starting offset of the ContentEncryption payload. |size| is the size in
-    // bytes of the ContentEncryption payload. |encryption| is where the parsed
-    // values will be stored.
-    long ParseEncryptionEntry(long long start,
-                              long long size,
-                              IMkvReader* pReader,
-                              ContentEncryption* encryption);
+  // Parses the ContentEncryption element from |pReader|. |start| is the
+  // starting offset of the ContentEncryption payload. |size| is the size in
+  // bytes of the ContentEncryption payload. |encryption| is where the parsed
+  // values will be stored.
+  long ParseEncryptionEntry(long long start, long long size,
+                            IMkvReader* pReader, ContentEncryption* encryption);
 
-    unsigned long long encoding_order() const { return encoding_order_; }
-    unsigned long long encoding_scope() const { return encoding_scope_; }
-    unsigned long long encoding_type() const { return encoding_type_; }
+  unsigned long long encoding_order() const { return encoding_order_; }
+  unsigned long long encoding_scope() const { return encoding_scope_; }
+  unsigned long long encoding_type() const { return encoding_type_; }
 
-private:
-    // Member variables for list of ContentCompression elements.
-    ContentCompression** compression_entries_;
-    ContentCompression** compression_entries_end_;
+ private:
+  // Member variables for list of ContentCompression elements.
+  ContentCompression** compression_entries_;
+  ContentCompression** compression_entries_end_;
 
-    // Member variables for list of ContentEncryption elements.
-    ContentEncryption** encryption_entries_;
-    ContentEncryption** encryption_entries_end_;
+  // Member variables for list of ContentEncryption elements.
+  ContentEncryption** encryption_entries_;
+  ContentEncryption** encryption_entries_end_;
 
-    // ContentEncoding element names
-    unsigned long long encoding_order_;
-    unsigned long long encoding_scope_;
-    unsigned long long encoding_type_;
+  // ContentEncoding element names
+  unsigned long long encoding_order_;
+  unsigned long long encoding_scope_;
+  unsigned long long encoding_type_;
 
-    // LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding);
-    ContentEncoding(const ContentEncoding&);
-    ContentEncoding& operator=(const ContentEncoding&);
+  // LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding);
+  ContentEncoding(const ContentEncoding&);
+  ContentEncoding& operator=(const ContentEncoding&);
 };
 
-class Track
-{
-    Track(const Track&);
-    Track& operator=(const Track&);
+class Track {
+  Track(const Track&);
+  Track& operator=(const Track&);
 
-public:
-    class Info;
-    static long Create(
-        Segment*,
-        const Info&,
-        long long element_start,
-        long long element_size,
-        Track*&);
+ public:
+  class Info;
+  static long Create(Segment*, const Info&, long long element_start,
+                     long long element_size, Track*&);
 
-    enum Type {
-        kVideo = 1,
-        kAudio = 2,
-        kSubtitle = 0x11,
-        kMetadata = 0x21
-     };
+  enum Type { kVideo = 1, kAudio = 2, kSubtitle = 0x11, kMetadata = 0x21 };
 
-    Segment* const m_pSegment;
-    const long long m_element_start;
-    const long long m_element_size;
-    virtual ~Track();
+  Segment* const m_pSegment;
+  const long long m_element_start;
+  const long long m_element_size;
+  virtual ~Track();
 
-    long GetType() const;
-    long GetNumber() const;
-    unsigned long long GetUid() const;
-    const char* GetNameAsUTF8() const;
+  long GetType() const;
+  long GetNumber() const;
+  unsigned long long GetUid() const;
+  const char* GetNameAsUTF8() const;
+  const char* GetLanguage() const;
+  const char* GetCodecNameAsUTF8() const;
+  const char* GetCodecId() const;
+  const unsigned char* GetCodecPrivate(size_t&) const;
+  bool GetLacing() const;
+  unsigned long long GetDefaultDuration() const;
+  unsigned long long GetCodecDelay() const;
+  unsigned long long GetSeekPreRoll() const;
+
+  const BlockEntry* GetEOS() const;
+
+  struct Settings {
+    long long start;
+    long long size;
+  };
+
+  class Info {
+   public:
+    Info();
+    ~Info();
+    int Copy(Info&) const;
+    void Clear();
+    long type;
+    long number;
+    unsigned long long uid;
+    unsigned long long defaultDuration;
+    unsigned long long codecDelay;
+    unsigned long long seekPreRoll;
+    char* nameAsUTF8;
+    char* language;
+    char* codecId;
+    char* codecNameAsUTF8;
+    unsigned char* codecPrivate;
+    size_t codecPrivateSize;
+    bool lacing;
+    Settings settings;
+
+   private:
+    Info(const Info&);
+    Info& operator=(const Info&);
+    int CopyStr(char* Info::*str, Info&) const;
+  };
+
+  long GetFirst(const BlockEntry*&) const;
+  long GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const;
+  virtual bool VetEntry(const BlockEntry*) const;
+  virtual long Seek(long long time_ns, const BlockEntry*&) const;
+
+  const ContentEncoding* GetContentEncodingByIndex(unsigned long idx) const;
+  unsigned long GetContentEncodingCount() const;
+
+  long ParseContentEncodingsEntry(long long start, long long size);
+
+ protected:
+  Track(Segment*, long long element_start, long long element_size);
+
+  Info m_info;
+
+  class EOSBlock : public BlockEntry {
+   public:
+    EOSBlock();
+
+    Kind GetKind() const;
+    const Block* GetBlock() const;
+  };
+
+  EOSBlock m_eos;
+
+ private:
+  ContentEncoding** content_encoding_entries_;
+  ContentEncoding** content_encoding_entries_end_;
+};
+
+class VideoTrack : public Track {
+  VideoTrack(const VideoTrack&);
+  VideoTrack& operator=(const VideoTrack&);
+
+  VideoTrack(Segment*, long long element_start, long long element_size);
+
+ public:
+  static long Parse(Segment*, const Info&, long long element_start,
+                    long long element_size, VideoTrack*&);
+
+  long long GetWidth() const;
+  long long GetHeight() const;
+  double GetFrameRate() const;
+
+  bool VetEntry(const BlockEntry*) const;
+  long Seek(long long time_ns, const BlockEntry*&) const;
+
+ private:
+  long long m_width;
+  long long m_height;
+  double m_rate;
+};
+
+class AudioTrack : public Track {
+  AudioTrack(const AudioTrack&);
+  AudioTrack& operator=(const AudioTrack&);
+
+  AudioTrack(Segment*, long long element_start, long long element_size);
+
+ public:
+  static long Parse(Segment*, const Info&, long long element_start,
+                    long long element_size, AudioTrack*&);
+
+  double GetSamplingRate() const;
+  long long GetChannels() const;
+  long long GetBitDepth() const;
+
+ private:
+  double m_rate;
+  long long m_channels;
+  long long m_bitDepth;
+};
+
+class Tracks {
+  Tracks(const Tracks&);
+  Tracks& operator=(const Tracks&);
+
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
+
+  Tracks(Segment*, long long start, long long size, long long element_start,
+         long long element_size);
+
+  ~Tracks();
+
+  long Parse();
+
+  unsigned long GetTracksCount() const;
+
+  const Track* GetTrackByNumber(long tn) const;
+  const Track* GetTrackByIndex(unsigned long idx) const;
+
+ private:
+  Track** m_trackEntries;
+  Track** m_trackEntriesEnd;
+
+  long ParseTrackEntry(long long payload_start, long long payload_size,
+                       long long element_start, long long element_size,
+                       Track*&) const;
+};
+
+class Chapters {
+  Chapters(const Chapters&);
+  Chapters& operator=(const Chapters&);
+
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
+
+  Chapters(Segment*, long long payload_start, long long payload_size,
+           long long element_start, long long element_size);
+
+  ~Chapters();
+
+  long Parse();
+
+  class Atom;
+  class Edition;
+
+  class Display {
+    friend class Atom;
+    Display();
+    Display(const Display&);
+    ~Display();
+    Display& operator=(const Display&);
+
+   public:
+    const char* GetString() const;
     const char* GetLanguage() const;
-    const char* GetCodecNameAsUTF8() const;
-    const char* GetCodecId() const;
-    const unsigned char* GetCodecPrivate(size_t&) const;
-    bool GetLacing() const;
-    unsigned long long GetDefaultDuration() const;
-    unsigned long long GetCodecDelay() const;
-    unsigned long long GetSeekPreRoll() const;
+    const char* GetCountry() const;
 
-    const BlockEntry* GetEOS() const;
+   private:
+    void Init();
+    void ShallowCopy(Display&) const;
+    void Clear();
+    long Parse(IMkvReader*, long long pos, long long size);
 
-    struct Settings
-    {
-        long long start;
-        long long size;
-    };
+    char* m_string;
+    char* m_language;
+    char* m_country;
+  };
 
-    class Info
-    {
-    public:
-        Info();
-        ~Info();
-        int Copy(Info&) const;
-        void Clear();
-        long type;
-        long number;
-        unsigned long long uid;
-        unsigned long long defaultDuration;
-        unsigned long long codecDelay;
-        unsigned long long seekPreRoll;
-        char* nameAsUTF8;
-        char* language;
-        char* codecId;
-        char* codecNameAsUTF8;
-        unsigned char* codecPrivate;
-        size_t codecPrivateSize;
-        bool lacing;
-        Settings settings;
+  class Atom {
+    friend class Edition;
+    Atom();
+    Atom(const Atom&);
+    ~Atom();
+    Atom& operator=(const Atom&);
 
-    private:
-        Info(const Info&);
-        Info& operator=(const Info&);
-        int CopyStr(char* Info::*str, Info&) const;
-    };
+   public:
+    unsigned long long GetUID() const;
+    const char* GetStringUID() const;
 
-    long GetFirst(const BlockEntry*&) const;
-    long GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const;
-    virtual bool VetEntry(const BlockEntry*) const;
-    virtual long Seek(long long time_ns, const BlockEntry*&) const;
+    long long GetStartTimecode() const;
+    long long GetStopTimecode() const;
 
-    const ContentEncoding* GetContentEncodingByIndex(unsigned long idx) const;
-    unsigned long GetContentEncodingCount() const;
+    long long GetStartTime(const Chapters*) const;
+    long long GetStopTime(const Chapters*) const;
 
-    long ParseContentEncodingsEntry(long long start, long long size);
+    int GetDisplayCount() const;
+    const Display* GetDisplay(int index) const;
 
-protected:
-    Track(
-        Segment*,
-        long long element_start,
-        long long element_size);
+   private:
+    void Init();
+    void ShallowCopy(Atom&) const;
+    void Clear();
+    long Parse(IMkvReader*, long long pos, long long size);
+    static long long GetTime(const Chapters*, long long timecode);
 
-    Info m_info;
+    long ParseDisplay(IMkvReader*, long long pos, long long size);
+    bool ExpandDisplaysArray();
 
-    class EOSBlock : public BlockEntry
-    {
-    public:
-        EOSBlock();
+    char* m_string_uid;
+    unsigned long long m_uid;
+    long long m_start_timecode;
+    long long m_stop_timecode;
 
-        Kind GetKind() const;
-        const Block* GetBlock() const;
-    };
+    Display* m_displays;
+    int m_displays_size;
+    int m_displays_count;
+  };
 
-    EOSBlock m_eos;
+  class Edition {
+    friend class Chapters;
+    Edition();
+    Edition(const Edition&);
+    ~Edition();
+    Edition& operator=(const Edition&);
 
-private:
-    ContentEncoding** content_encoding_entries_;
-    ContentEncoding** content_encoding_entries_end_;
+   public:
+    int GetAtomCount() const;
+    const Atom* GetAtom(int index) const;
+
+   private:
+    void Init();
+    void ShallowCopy(Edition&) const;
+    void Clear();
+    long Parse(IMkvReader*, long long pos, long long size);
+
+    long ParseAtom(IMkvReader*, long long pos, long long size);
+    bool ExpandAtomsArray();
+
+    Atom* m_atoms;
+    int m_atoms_size;
+    int m_atoms_count;
+  };
+
+  int GetEditionCount() const;
+  const Edition* GetEdition(int index) const;
+
+ private:
+  long ParseEdition(long long pos, long long size);
+  bool ExpandEditionsArray();
+
+  Edition* m_editions;
+  int m_editions_size;
+  int m_editions_count;
 };
 
+class SegmentInfo {
+  SegmentInfo(const SegmentInfo&);
+  SegmentInfo& operator=(const SegmentInfo&);
 
-class VideoTrack : public Track
-{
-    VideoTrack(const VideoTrack&);
-    VideoTrack& operator=(const VideoTrack&);
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
 
-    VideoTrack(
-        Segment*,
-        long long element_start,
-        long long element_size);
+  SegmentInfo(Segment*, long long start, long long size,
+              long long element_start, long long element_size);
 
-public:
-    static long Parse(
-        Segment*,
-        const Info&,
-        long long element_start,
-        long long element_size,
-        VideoTrack*&);
+  ~SegmentInfo();
 
-    long long GetWidth() const;
-    long long GetHeight() const;
-    double GetFrameRate() const;
+  long Parse();
 
-    bool VetEntry(const BlockEntry*) const;
-    long Seek(long long time_ns, const BlockEntry*&) const;
+  long long GetTimeCodeScale() const;
+  long long GetDuration() const;  // scaled
+  const char* GetMuxingAppAsUTF8() const;
+  const char* GetWritingAppAsUTF8() const;
+  const char* GetTitleAsUTF8() const;
 
-private:
-    long long m_width;
-    long long m_height;
-    double m_rate;
-
+ private:
+  long long m_timecodeScale;
+  double m_duration;
+  char* m_pMuxingAppAsUTF8;
+  char* m_pWritingAppAsUTF8;
+  char* m_pTitleAsUTF8;
 };
 
+class SeekHead {
+  SeekHead(const SeekHead&);
+  SeekHead& operator=(const SeekHead&);
 
-class AudioTrack : public Track
-{
-    AudioTrack(const AudioTrack&);
-    AudioTrack& operator=(const AudioTrack&);
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
 
-    AudioTrack(
-        Segment*,
-        long long element_start,
-        long long element_size);
-public:
-    static long Parse(
-        Segment*,
-        const Info&,
-        long long element_start,
-        long long element_size,
-        AudioTrack*&);
+  SeekHead(Segment*, long long start, long long size, long long element_start,
+           long long element_size);
 
-    double GetSamplingRate() const;
-    long long GetChannels() const;
-    long long GetBitDepth() const;
+  ~SeekHead();
 
-private:
-    double m_rate;
-    long long m_channels;
-    long long m_bitDepth;
-};
+  long Parse();
 
+  struct Entry {
+    // the SeekHead entry payload
+    long long id;
+    long long pos;
 
-class Tracks
-{
-    Tracks(const Tracks&);
-    Tracks& operator=(const Tracks&);
+    // absolute pos of SeekEntry ID
+    long long element_start;
 
-public:
-    Segment* const m_pSegment;
-    const long long m_start;
-    const long long m_size;
-    const long long m_element_start;
-    const long long m_element_size;
+    // SeekEntry ID size + size size + payload
+    long long element_size;
+  };
 
-    Tracks(
-        Segment*,
-        long long start,
-        long long size,
-        long long element_start,
-        long long element_size);
+  int GetCount() const;
+  const Entry* GetEntry(int idx) const;
 
-    ~Tracks();
+  struct VoidElement {
+    // absolute pos of Void ID
+    long long element_start;
 
-    long Parse();
+    // ID size + size size + payload size
+    long long element_size;
+  };
 
-    unsigned long GetTracksCount() const;
+  int GetVoidElementCount() const;
+  const VoidElement* GetVoidElement(int idx) const;
 
-    const Track* GetTrackByNumber(long tn) const;
-    const Track* GetTrackByIndex(unsigned long idx) const;
+ private:
+  Entry* m_entries;
+  int m_entry_count;
 
-private:
-    Track** m_trackEntries;
-    Track** m_trackEntriesEnd;
+  VoidElement* m_void_elements;
+  int m_void_element_count;
 
-    long ParseTrackEntry(
-        long long payload_start,
-        long long payload_size,
-        long long element_start,
-        long long element_size,
-        Track*&) const;
-
-};
-
-
-class Chapters
-{
-    Chapters(const Chapters&);
-    Chapters& operator=(const Chapters&);
-
-public:
-    Segment* const m_pSegment;
-    const long long m_start;
-    const long long m_size;
-    const long long m_element_start;
-    const long long m_element_size;
-
-    Chapters(
-        Segment*,
-        long long payload_start,
-        long long payload_size,
-        long long element_start,
-        long long element_size);
-
-    ~Chapters();
-
-    long Parse();
-
-    class Atom;
-    class Edition;
-
-    class Display
-    {
-        friend class Atom;
-        Display();
-        Display(const Display&);
-        ~Display();
-        Display& operator=(const Display&);
-    public:
-        const char* GetString() const;
-        const char* GetLanguage() const;
-        const char* GetCountry() const;
-    private:
-        void Init();
-        void ShallowCopy(Display&) const;
-        void Clear();
-        long Parse(IMkvReader*, long long pos, long long size);
-
-        char* m_string;
-        char* m_language;
-        char* m_country;
-    };
-
-    class Atom
-    {
-        friend class Edition;
-        Atom();
-        Atom(const Atom&);
-        ~Atom();
-        Atom& operator=(const Atom&);
-    public:
-        unsigned long long GetUID() const;
-        const char* GetStringUID() const;
-
-        long long GetStartTimecode() const;
-        long long GetStopTimecode() const;
-
-        long long GetStartTime(const Chapters*) const;
-        long long GetStopTime(const Chapters*) const;
-
-        int GetDisplayCount() const;
-        const Display* GetDisplay(int index) const;
-    private:
-        void Init();
-        void ShallowCopy(Atom&) const;
-        void Clear();
-        long Parse(IMkvReader*, long long pos, long long size);
-        static long long GetTime(const Chapters*, long long timecode);
-
-        long ParseDisplay(IMkvReader*, long long pos, long long size);
-        bool ExpandDisplaysArray();
-
-        char* m_string_uid;
-        unsigned long long m_uid;
-        long long m_start_timecode;
-        long long m_stop_timecode;
-
-        Display* m_displays;
-        int m_displays_size;
-        int m_displays_count;
-    };
-
-    class Edition
-    {
-        friend class Chapters;
-        Edition();
-        Edition(const Edition&);
-        ~Edition();
-        Edition& operator=(const Edition&);
-    public:
-        int GetAtomCount() const;
-        const Atom* GetAtom(int index) const;
-    private:
-        void Init();
-        void ShallowCopy(Edition&) const;
-        void Clear();
-        long Parse(IMkvReader*, long long pos, long long size);
-
-        long ParseAtom(IMkvReader*, long long pos, long long size);
-        bool ExpandAtomsArray();
-
-        Atom* m_atoms;
-        int m_atoms_size;
-        int m_atoms_count;
-    };
-
-    int GetEditionCount() const;
-    const Edition* GetEdition(int index) const;
-
-private:
-    long ParseEdition(long long pos, long long size);
-    bool ExpandEditionsArray();
-
-    Edition* m_editions;
-    int m_editions_size;
-    int m_editions_count;
-
-};
-
-
-class SegmentInfo
-{
-    SegmentInfo(const SegmentInfo&);
-    SegmentInfo& operator=(const SegmentInfo&);
-
-public:
-    Segment* const m_pSegment;
-    const long long m_start;
-    const long long m_size;
-    const long long m_element_start;
-    const long long m_element_size;
-
-    SegmentInfo(
-        Segment*,
-        long long start,
-        long long size,
-        long long element_start,
-        long long element_size);
-
-    ~SegmentInfo();
-
-    long Parse();
-
-    long long GetTimeCodeScale() const;
-    long long GetDuration() const;  //scaled
-    const char* GetMuxingAppAsUTF8() const;
-    const char* GetWritingAppAsUTF8() const;
-    const char* GetTitleAsUTF8() const;
-
-private:
-    long long m_timecodeScale;
-    double m_duration;
-    char* m_pMuxingAppAsUTF8;
-    char* m_pWritingAppAsUTF8;
-    char* m_pTitleAsUTF8;
-};
-
-
-class SeekHead
-{
-    SeekHead(const SeekHead&);
-    SeekHead& operator=(const SeekHead&);
-
-public:
-    Segment* const m_pSegment;
-    const long long m_start;
-    const long long m_size;
-    const long long m_element_start;
-    const long long m_element_size;
-
-    SeekHead(
-        Segment*,
-        long long start,
-        long long size,
-        long long element_start,
-        long long element_size);
-
-    ~SeekHead();
-
-    long Parse();
-
-    struct Entry
-    {
-        //the SeekHead entry payload
-        long long id;
-        long long pos;
-
-        //absolute pos of SeekEntry ID
-        long long element_start;
-
-        //SeekEntry ID size + size size + payload
-        long long element_size;
-    };
-
-    int GetCount() const;
-    const Entry* GetEntry(int idx) const;
-
-    struct VoidElement
-    {
-        //absolute pos of Void ID
-        long long element_start;
-
-        //ID size + size size + payload size
-        long long element_size;
-    };
-
-    int GetVoidElementCount() const;
-    const VoidElement* GetVoidElement(int idx) const;
-
-private:
-    Entry* m_entries;
-    int m_entry_count;
-
-    VoidElement* m_void_elements;
-    int m_void_element_count;
-
-    static bool ParseEntry(
-        IMkvReader*,
-        long long pos,  //payload
-        long long size,
-        Entry*);
-
+  static bool ParseEntry(IMkvReader*,
+                         long long pos,  // payload
+                         long long size, Entry*);
 };
 
 class Cues;
-class CuePoint
-{
-    friend class Cues;
+class CuePoint {
+  friend class Cues;
 
-    CuePoint(long, long long);
-    ~CuePoint();
+  CuePoint(long, long long);
+  ~CuePoint();
 
-    CuePoint(const CuePoint&);
-    CuePoint& operator=(const CuePoint&);
+  CuePoint(const CuePoint&);
+  CuePoint& operator=(const CuePoint&);
 
-public:
-    long long m_element_start;
-    long long m_element_size;
+ public:
+  long long m_element_start;
+  long long m_element_size;
 
-    void Load(IMkvReader*);
+  void Load(IMkvReader*);
 
-    long long GetTimeCode() const;      //absolute but unscaled
-    long long GetTime(const Segment*) const;  //absolute and scaled (ns units)
+  long long GetTimeCode() const;  // absolute but unscaled
+  long long GetTime(const Segment*) const;  // absolute and scaled (ns units)
 
-    struct TrackPosition
-    {
-        long long m_track;
-        long long m_pos;  //of cluster
-        long long m_block;
-        //codec_state  //defaults to 0
-        //reference = clusters containing req'd referenced blocks
-        //  reftime = timecode of the referenced block
+  struct TrackPosition {
+    long long m_track;
+    long long m_pos;  // of cluster
+    long long m_block;
+    // codec_state  //defaults to 0
+    // reference = clusters containing req'd referenced blocks
+    //  reftime = timecode of the referenced block
 
-        void Parse(IMkvReader*, long long, long long);
-    };
+    void Parse(IMkvReader*, long long, long long);
+  };
 
-    const TrackPosition* Find(const Track*) const;
+  const TrackPosition* Find(const Track*) const;
 
-private:
-    const long m_index;
-    long long m_timecode;
-    TrackPosition* m_track_positions;
-    size_t m_track_positions_count;
-
+ private:
+  const long m_index;
+  long long m_timecode;
+  TrackPosition* m_track_positions;
+  size_t m_track_positions_count;
 };
 
+class Cues {
+  friend class Segment;
 
-class Cues
-{
-    friend class Segment;
+  Cues(Segment*, long long start, long long size, long long element_start,
+       long long element_size);
+  ~Cues();
 
-    Cues(
-        Segment*,
-        long long start,
-        long long size,
-        long long element_start,
-        long long element_size);
-    ~Cues();
+  Cues(const Cues&);
+  Cues& operator=(const Cues&);
 
-    Cues(const Cues&);
-    Cues& operator=(const Cues&);
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
 
-public:
-    Segment* const m_pSegment;
-    const long long m_start;
-    const long long m_size;
-    const long long m_element_start;
-    const long long m_element_size;
-
-    bool Find(  //lower bound of time_ns
-        long long time_ns,
-        const Track*,
-        const CuePoint*&,
-        const CuePoint::TrackPosition*&) const;
+  bool Find(  // lower bound of time_ns
+      long long time_ns, const Track*, const CuePoint*&,
+      const CuePoint::TrackPosition*&) const;
 
 #if 0
     bool FindNext(  //upper_bound of time_ns
@@ -844,165 +738,144 @@
         const CuePoint::TrackPosition*&) const;
 #endif
 
-    const CuePoint* GetFirst() const;
-    const CuePoint* GetLast() const;
-    const CuePoint* GetNext(const CuePoint*) const;
+  const CuePoint* GetFirst() const;
+  const CuePoint* GetLast() const;
+  const CuePoint* GetNext(const CuePoint*) const;
 
-    const BlockEntry* GetBlock(
-                        const CuePoint*,
-                        const CuePoint::TrackPosition*) const;
+  const BlockEntry* GetBlock(const CuePoint*,
+                             const CuePoint::TrackPosition*) const;
 
-    bool LoadCuePoint() const;
-    long GetCount() const;  //loaded only
-    //long GetTotal() const;  //loaded + preloaded
-    bool DoneParsing() const;
+  bool LoadCuePoint() const;
+  long GetCount() const;  // loaded only
+  // long GetTotal() const;  //loaded + preloaded
+  bool DoneParsing() const;
 
-private:
-    void Init() const;
-    void PreloadCuePoint(long&, long long) const;
+ private:
+  void Init() const;
+  void PreloadCuePoint(long&, long long) const;
 
-    mutable CuePoint** m_cue_points;
-    mutable long m_count;
-    mutable long m_preload_count;
-    mutable long long m_pos;
-
+  mutable CuePoint** m_cue_points;
+  mutable long m_count;
+  mutable long m_preload_count;
+  mutable long long m_pos;
 };
 
+class Cluster {
+  friend class Segment;
 
-class Cluster
-{
-    friend class Segment;
+  Cluster(const Cluster&);
+  Cluster& operator=(const Cluster&);
 
-    Cluster(const Cluster&);
-    Cluster& operator=(const Cluster&);
+ public:
+  Segment* const m_pSegment;
 
-public:
-    Segment* const m_pSegment;
+ public:
+  static Cluster* Create(Segment*,
+                         long index,  // index in segment
+                         long long off);  // offset relative to segment
+  // long long element_size);
 
-public:
-    static Cluster* Create(
-        Segment*,
-        long index,       //index in segment
-        long long off);   //offset relative to segment
-        //long long element_size);
+  Cluster();  // EndOfStream
+  ~Cluster();
 
-    Cluster();  //EndOfStream
-    ~Cluster();
+  bool EOS() const;
 
-    bool EOS() const;
+  long long GetTimeCode() const;  // absolute, but not scaled
+  long long GetTime() const;  // absolute, and scaled (nanosecond units)
+  long long GetFirstTime() const;  // time (ns) of first (earliest) block
+  long long GetLastTime() const;  // time (ns) of last (latest) block
 
-    long long GetTimeCode() const;   //absolute, but not scaled
-    long long GetTime() const;       //absolute, and scaled (nanosecond units)
-    long long GetFirstTime() const;  //time (ns) of first (earliest) block
-    long long GetLastTime() const;   //time (ns) of last (latest) block
+  long GetFirst(const BlockEntry*&) const;
+  long GetLast(const BlockEntry*&) const;
+  long GetNext(const BlockEntry* curr, const BlockEntry*& next) const;
 
-    long GetFirst(const BlockEntry*&) const;
-    long GetLast(const BlockEntry*&) const;
-    long GetNext(const BlockEntry* curr, const BlockEntry*& next) const;
+  const BlockEntry* GetEntry(const Track*, long long ns = -1) const;
+  const BlockEntry* GetEntry(const CuePoint&,
+                             const CuePoint::TrackPosition&) const;
+  // const BlockEntry* GetMaxKey(const VideoTrack*) const;
 
-    const BlockEntry* GetEntry(const Track*, long long ns = -1) const;
-    const BlockEntry* GetEntry(
-        const CuePoint&,
-        const CuePoint::TrackPosition&) const;
-    //const BlockEntry* GetMaxKey(const VideoTrack*) const;
+  //    static bool HasBlockEntries(const Segment*, long long);
 
-//    static bool HasBlockEntries(const Segment*, long long);
+  static long HasBlockEntries(const Segment*, long long idoff, long long& pos,
+                              long& size);
 
-    static long HasBlockEntries(
-            const Segment*,
-            long long idoff,
-            long long& pos,
-            long& size);
+  long GetEntryCount() const;
 
-    long GetEntryCount() const;
+  long Load(long long& pos, long& size) const;
 
-    long Load(long long& pos, long& size) const;
+  long Parse(long long& pos, long& size) const;
+  long GetEntry(long index, const mkvparser::BlockEntry*&) const;
 
-    long Parse(long long& pos, long& size) const;
-    long GetEntry(long index, const mkvparser::BlockEntry*&) const;
+ protected:
+  Cluster(Segment*, long index, long long element_start);
+  // long long element_size);
 
-protected:
-    Cluster(
-        Segment*,
-        long index,
-        long long element_start);
-        //long long element_size);
+ public:
+  const long long m_element_start;
+  long long GetPosition() const;  // offset relative to segment
 
-public:
-    const long long m_element_start;
-    long long GetPosition() const;  //offset relative to segment
+  long GetIndex() const;
+  long long GetElementSize() const;
+  // long long GetPayloadSize() const;
 
-    long GetIndex() const;
-    long long GetElementSize() const;
-    //long long GetPayloadSize() const;
+  // long long Unparsed() const;
 
-    //long long Unparsed() const;
+ private:
+  long m_index;
+  mutable long long m_pos;
+  // mutable long long m_size;
+  mutable long long m_element_size;
+  mutable long long m_timecode;
+  mutable BlockEntry** m_entries;
+  mutable long m_entries_size;
+  mutable long m_entries_count;
 
-private:
-    long m_index;
-    mutable long long m_pos;
-    //mutable long long m_size;
-    mutable long long m_element_size;
-    mutable long long m_timecode;
-    mutable BlockEntry** m_entries;
-    mutable long m_entries_size;
-    mutable long m_entries_count;
+  long ParseSimpleBlock(long long, long long&, long&);
+  long ParseBlockGroup(long long, long long&, long&);
 
-    long ParseSimpleBlock(long long, long long&, long&);
-    long ParseBlockGroup(long long, long long&, long&);
-
-    long CreateBlock(long long id, long long pos, long long size,
-                     long long discard_padding);
-    long CreateBlockGroup(long long start_offset, long long size,
-                          long long discard_padding);
-    long CreateSimpleBlock(long long, long long);
-
+  long CreateBlock(long long id, long long pos, long long size,
+                   long long discard_padding);
+  long CreateBlockGroup(long long start_offset, long long size,
+                        long long discard_padding);
+  long CreateSimpleBlock(long long, long long);
 };
 
+class Segment {
+  friend class Cues;
+  friend class Track;
+  friend class VideoTrack;
 
-class Segment
-{
-    friend class Cues;
-    friend class Track;
-    friend class VideoTrack;
+  Segment(const Segment&);
+  Segment& operator=(const Segment&);
 
-    Segment(const Segment&);
-    Segment& operator=(const Segment&);
+ private:
+  Segment(IMkvReader*, long long elem_start,
+          // long long elem_size,
+          long long pos, long long size);
 
-private:
-    Segment(
-        IMkvReader*,
-        long long elem_start,
-        //long long elem_size,
-        long long pos,
-        long long size);
+ public:
+  IMkvReader* const m_pReader;
+  const long long m_element_start;
+  // const long long m_element_size;
+  const long long m_start;  // posn of segment payload
+  const long long m_size;  // size of segment payload
+  Cluster m_eos;  // TODO: make private?
 
-public:
-    IMkvReader* const m_pReader;
-    const long long m_element_start;
-    //const long long m_element_size;
-    const long long m_start;  //posn of segment payload
-    const long long m_size;   //size of segment payload
-    Cluster m_eos;  //TODO: make private?
+  static long long CreateInstance(IMkvReader*, long long, Segment*&);
+  ~Segment();
 
-    static long long CreateInstance(IMkvReader*, long long, Segment*&);
-    ~Segment();
+  long Load();  // loads headers and all clusters
 
-    long Load();  //loads headers and all clusters
+  // for incremental loading
+  // long long Unparsed() const;
+  bool DoneParsing() const;
+  long long ParseHeaders();  // stops when first cluster is found
+  // long FindNextCluster(long long& pos, long& size) const;
+  long LoadCluster(long long& pos, long& size);  // load one cluster
+  long LoadCluster();
 
-    //for incremental loading
-    //long long Unparsed() const;
-    bool DoneParsing() const;
-    long long ParseHeaders();  //stops when first cluster is found
-    //long FindNextCluster(long long& pos, long& size) const;
-    long LoadCluster(long long& pos, long& size);  //load one cluster
-    long LoadCluster();
-
-    long ParseNext(
-            const Cluster* pCurr,
-            const Cluster*& pNext,
-            long long& pos,
-            long& size);
+  long ParseNext(const Cluster* pCurr, const Cluster*& pNext, long long& pos,
+                 long& size);
 
 #if 0
     //This pair parses one cluster, but only changes the state of the
@@ -1011,69 +884,62 @@
     bool AddCluster(long long cluster_pos, long long new_pos);
 #endif
 
-    const SeekHead* GetSeekHead() const;
-    const Tracks* GetTracks() const;
-    const SegmentInfo* GetInfo() const;
-    const Cues* GetCues() const;
-    const Chapters* GetChapters() const;
+  const SeekHead* GetSeekHead() const;
+  const Tracks* GetTracks() const;
+  const SegmentInfo* GetInfo() const;
+  const Cues* GetCues() const;
+  const Chapters* GetChapters() const;
 
-    long long GetDuration() const;
+  long long GetDuration() const;
 
-    unsigned long GetCount() const;
-    const Cluster* GetFirst() const;
-    const Cluster* GetLast() const;
-    const Cluster* GetNext(const Cluster*);
+  unsigned long GetCount() const;
+  const Cluster* GetFirst() const;
+  const Cluster* GetLast() const;
+  const Cluster* GetNext(const Cluster*);
 
-    const Cluster* FindCluster(long long time_nanoseconds) const;
-    //const BlockEntry* Seek(long long time_nanoseconds, const Track*) const;
+  const Cluster* FindCluster(long long time_nanoseconds) const;
+  // const BlockEntry* Seek(long long time_nanoseconds, const Track*) const;
 
-    const Cluster* FindOrPreloadCluster(long long pos);
+  const Cluster* FindOrPreloadCluster(long long pos);
 
-    long ParseCues(
-        long long cues_off,  //offset relative to start of segment
-        long long& parse_pos,
-        long& parse_len);
+  long ParseCues(long long cues_off,  // offset relative to start of segment
+                 long long& parse_pos, long& parse_len);
 
-private:
+ private:
+  long long m_pos;  // absolute file posn; what has been consumed so far
+  Cluster* m_pUnknownSize;
 
-    long long m_pos;  //absolute file posn; what has been consumed so far
-    Cluster* m_pUnknownSize;
+  SeekHead* m_pSeekHead;
+  SegmentInfo* m_pInfo;
+  Tracks* m_pTracks;
+  Cues* m_pCues;
+  Chapters* m_pChapters;
+  Cluster** m_clusters;
+  long m_clusterCount;  // number of entries for which m_index >= 0
+  long m_clusterPreloadCount;  // number of entries for which m_index < 0
+  long m_clusterSize;  // array size
 
-    SeekHead* m_pSeekHead;
-    SegmentInfo* m_pInfo;
-    Tracks* m_pTracks;
-    Cues* m_pCues;
-    Chapters* m_pChapters;
-    Cluster** m_clusters;
-    long m_clusterCount;         //number of entries for which m_index >= 0
-    long m_clusterPreloadCount;  //number of entries for which m_index < 0
-    long m_clusterSize;          //array size
+  long DoLoadCluster(long long&, long&);
+  long DoLoadClusterUnknownSize(long long&, long&);
+  long DoParseNext(const Cluster*&, long long&, long&);
 
-    long DoLoadCluster(long long&, long&);
-    long DoLoadClusterUnknownSize(long long&, long&);
-    long DoParseNext(const Cluster*&, long long&, long&);
+  void AppendCluster(Cluster*);
+  void PreloadCluster(Cluster*, ptrdiff_t);
 
-    void AppendCluster(Cluster*);
-    void PreloadCluster(Cluster*, ptrdiff_t);
+  // void ParseSeekHead(long long pos, long long size);
+  // void ParseSeekEntry(long long pos, long long size);
+  // void ParseCues(long long);
 
-    //void ParseSeekHead(long long pos, long long size);
-    //void ParseSeekEntry(long long pos, long long size);
-    //void ParseCues(long long);
-
-    const BlockEntry* GetBlock(
-        const CuePoint&,
-        const CuePoint::TrackPosition&);
-
+  const BlockEntry* GetBlock(const CuePoint&, const CuePoint::TrackPosition&);
 };
 
-}  //end namespace mkvparser
+}  // end namespace mkvparser
 
-inline long mkvparser::Segment::LoadCluster()
-{
-    long long pos;
-    long size;
+inline long mkvparser::Segment::LoadCluster() {
+  long long pos;
+  long size;
 
-    return LoadCluster(pos, size);
+  return LoadCluster(pos, size);
 }
 
-#endif  //MKVPARSER_HPP
+#endif  // MKVPARSER_HPP
diff --git a/third_party/libwebm/mkvreader.cpp b/third_party/libwebm/mkvreader.cpp
index b4b2459..eaf9e0a 100644
--- a/third_party/libwebm/mkvreader.cpp
+++ b/third_party/libwebm/mkvreader.cpp
@@ -10,17 +10,11 @@
 
 #include <cassert>
 
-namespace mkvparser
-{
+namespace mkvparser {
 
-MkvReader::MkvReader() :
-    m_file(NULL),
-    reader_owns_file_(true) {
-}
+MkvReader::MkvReader() : m_file(NULL), reader_owns_file_(true) {}
 
-MkvReader::MkvReader(FILE* fp) :
-    m_file(fp),
-    reader_owns_file_(false) {
+MkvReader::MkvReader(FILE* fp) : m_file(fp), reader_owns_file_(false) {
   GetFileSize();
 }
 
@@ -30,114 +24,109 @@
   m_file = NULL;
 }
 
-int MkvReader::Open(const char* fileName)
-{
-    if (fileName == NULL)
-        return -1;
+int MkvReader::Open(const char* fileName) {
+  if (fileName == NULL)
+    return -1;
 
-    if (m_file)
-        return -1;
+  if (m_file)
+    return -1;
 
 #ifdef _MSC_VER
-    const errno_t e = fopen_s(&m_file, fileName, "rb");
+  const errno_t e = fopen_s(&m_file, fileName, "rb");
 
-    if (e)
-        return -1;  //error
+  if (e)
+    return -1;  // error
 #else
-    m_file = fopen(fileName, "rb");
+  m_file = fopen(fileName, "rb");
 
-    if (m_file == NULL)
-        return -1;
+  if (m_file == NULL)
+    return -1;
 #endif
-    return !GetFileSize();
+  return !GetFileSize();
 }
 
 bool MkvReader::GetFileSize() {
-    if (m_file == NULL)
-        return false;
+  if (m_file == NULL)
+    return false;
 #ifdef _MSC_VER
-    int status = _fseeki64(m_file, 0L, SEEK_END);
+  int status = _fseeki64(m_file, 0L, SEEK_END);
 
-    if (status)
-        return false;  //error
+  if (status)
+    return false;  // error
 
-    m_length = _ftelli64(m_file);
+  m_length = _ftelli64(m_file);
 #else
-    fseek(m_file, 0L, SEEK_END);
-    m_length = ftell(m_file);
+  fseek(m_file, 0L, SEEK_END);
+  m_length = ftell(m_file);
 #endif
-    assert(m_length >= 0);
+  assert(m_length >= 0);
 
-    if (m_length < 0)
-        return false;
+  if (m_length < 0)
+    return false;
 
 #ifdef _MSC_VER
-    status = _fseeki64(m_file, 0L, SEEK_SET);
+  status = _fseeki64(m_file, 0L, SEEK_SET);
 
-    if (status)
-        return false;  //error
+  if (status)
+    return false;  // error
 #else
-    fseek(m_file, 0L, SEEK_SET);
+  fseek(m_file, 0L, SEEK_SET);
 #endif
 
-    return true;
+  return true;
 }
 
-void MkvReader::Close()
-{
-    if (m_file != NULL)
-    {
-        fclose(m_file);
-        m_file = NULL;
-    }
+void MkvReader::Close() {
+  if (m_file != NULL) {
+    fclose(m_file);
+    m_file = NULL;
+  }
 }
 
-int MkvReader::Length(long long* total, long long* available)
-{
-    if (m_file == NULL)
-        return -1;
+int MkvReader::Length(long long* total, long long* available) {
+  if (m_file == NULL)
+    return -1;
 
-    if (total)
-        *total = m_length;
+  if (total)
+    *total = m_length;
 
-    if (available)
-        *available = m_length;
+  if (available)
+    *available = m_length;
 
+  return 0;
+}
+
+int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
+  if (m_file == NULL)
+    return -1;
+
+  if (offset < 0)
+    return -1;
+
+  if (len < 0)
+    return -1;
+
+  if (len == 0)
     return 0;
-}
 
-int MkvReader::Read(long long offset, long len, unsigned char* buffer)
-{
-    if (m_file == NULL)
-        return -1;
-
-    if (offset < 0)
-        return -1;
-
-    if (len < 0)
-        return -1;
-
-    if (len == 0)
-        return 0;
-
-    if (offset >= m_length)
-        return -1;
+  if (offset >= m_length)
+    return -1;
 
 #ifdef _MSC_VER
-    const int status = _fseeki64(m_file, offset, SEEK_SET);
+  const int status = _fseeki64(m_file, offset, SEEK_SET);
 
-    if (status)
-        return -1;  //error
+  if (status)
+    return -1;  // error
 #else
-    fseek(m_file, offset, SEEK_SET);
+  fseek(m_file, offset, SEEK_SET);
 #endif
 
-    const size_t size = fread(buffer, 1, len, m_file);
+  const size_t size = fread(buffer, 1, len, m_file);
 
-    if (size < size_t(len))
-        return -1;  //error
+  if (size < size_t(len))
+    return -1;  // error
 
-    return 0;  //success
+  return 0;  // success
 }
 
-}  //end namespace mkvparser
+}  // end namespace mkvparser
diff --git a/third_party/libwebm/mkvreader.hpp b/third_party/libwebm/mkvreader.hpp
index 8ebdd99..82ebad5 100644
--- a/third_party/libwebm/mkvreader.hpp
+++ b/third_party/libwebm/mkvreader.hpp
@@ -12,35 +12,34 @@
 #include "mkvparser.hpp"
 #include <cstdio>
 
-namespace mkvparser
-{
+namespace mkvparser {
 
-class MkvReader : public IMkvReader
-{
-    MkvReader(const MkvReader&);
-    MkvReader& operator=(const MkvReader&);
-public:
-    MkvReader();
-    MkvReader(FILE* fp);
-    virtual ~MkvReader();
+class MkvReader : public IMkvReader {
+ public:
+  MkvReader();
+  explicit MkvReader(FILE* fp);
+  virtual ~MkvReader();
 
-    int Open(const char*);
-    void Close();
+  int Open(const char*);
+  void Close();
 
-    virtual int Read(long long position, long length, unsigned char* buffer);
-    virtual int Length(long long* total, long long* available);
-private:
+  virtual int Read(long long position, long length, unsigned char* buffer);
+  virtual int Length(long long* total, long long* available);
 
-    // Determines the size of the file. This is called either by the constructor
-    // or by the Open function depending on file ownership. Returns true on
-    // success.
-    bool GetFileSize();
+ private:
+  MkvReader(const MkvReader&);
+  MkvReader& operator=(const MkvReader&);
 
-    long long m_length;
-    FILE* m_file;
-    bool reader_owns_file_;
+  // Determines the size of the file. This is called either by the constructor
+  // or by the Open function depending on file ownership. Returns true on
+  // success.
+  bool GetFileSize();
+
+  long long m_length;
+  FILE* m_file;
+  bool reader_owns_file_;
 };
 
-}  //end namespace mkvparser
+}  // end namespace mkvparser
 
-#endif //MKVREADER_HPP
+#endif  // MKVREADER_HPP
diff --git a/third_party/libwebm/mkvwriter.cpp b/third_party/libwebm/mkvwriter.cpp
index 8de89a4..75d4350 100644
--- a/third_party/libwebm/mkvwriter.cpp
+++ b/third_party/libwebm/mkvwriter.cpp
@@ -16,15 +16,11 @@
 
 namespace mkvmuxer {
 
-MkvWriter::MkvWriter() : file_(NULL), writer_owns_file_(true) {
-}
+MkvWriter::MkvWriter() : file_(NULL), writer_owns_file_(true) {}
 
-MkvWriter::MkvWriter(FILE* fp): file_(fp), writer_owns_file_(false) {
-}
+MkvWriter::MkvWriter(FILE* fp) : file_(fp), writer_owns_file_(false) {}
 
-MkvWriter::~MkvWriter() {
-  Close();
-}
+MkvWriter::~MkvWriter() { Close(); }
 
 int32 MkvWriter::Write(const void* buffer, uint32 length) {
   if (!file_)
@@ -70,9 +66,9 @@
     return 0;
 
 #ifdef _MSC_VER
-    return _ftelli64(file_);
+  return _ftelli64(file_);
 #else
-    return ftell(file_);
+  return ftell(file_);
 #endif
 }
 
@@ -81,17 +77,14 @@
     return -1;
 
 #ifdef _MSC_VER
-    return _fseeki64(file_, position, SEEK_SET);
+  return _fseeki64(file_, position, SEEK_SET);
 #else
-    return fseek(file_, position, SEEK_SET);
+  return fseek(file_, position, SEEK_SET);
 #endif
 }
 
-bool MkvWriter::Seekable() const {
-  return true;
-}
+bool MkvWriter::Seekable() const { return true; }
 
-void MkvWriter::ElementStartNotify(uint64, int64) {
-}
+void MkvWriter::ElementStartNotify(uint64, int64) {}
 
 }  // namespace mkvmuxer
diff --git a/third_party/libwebm/mkvwriter.hpp b/third_party/libwebm/mkvwriter.hpp
index 524e0f7..684560c 100644
--- a/third_party/libwebm/mkvwriter.hpp
+++ b/third_party/libwebm/mkvwriter.hpp
@@ -20,7 +20,7 @@
 class MkvWriter : public IMkvWriter {
  public:
   MkvWriter();
-  MkvWriter(FILE* fp);
+  explicit MkvWriter(FILE* fp);
   virtual ~MkvWriter();
 
   // IMkvWriter interface
@@ -46,6 +46,6 @@
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(MkvWriter);
 };
 
-}  //end namespace mkvmuxer
+}  // end namespace mkvmuxer
 
-#endif // MKVWRITER_HPP
+#endif  // MKVWRITER_HPP
diff --git a/third_party/libwebm/webmids.hpp b/third_party/libwebm/webmids.hpp
index 65fab96..eeb52d8 100644
--- a/third_party/libwebm/webmids.hpp
+++ b/third_party/libwebm/webmids.hpp
@@ -12,130 +12,130 @@
 namespace mkvmuxer {
 
 enum MkvId {
-  kMkvEBML                    = 0x1A45DFA3,
-  kMkvEBMLVersion             = 0x4286,
-  kMkvEBMLReadVersion         = 0x42F7,
-  kMkvEBMLMaxIDLength         = 0x42F2,
-  kMkvEBMLMaxSizeLength       = 0x42F3,
-  kMkvDocType                 = 0x4282,
-  kMkvDocTypeVersion          = 0x4287,
-  kMkvDocTypeReadVersion      = 0x4285,
-  kMkvVoid                    = 0xEC,
-  kMkvSignatureSlot           = 0x1B538667,
-  kMkvSignatureAlgo           = 0x7E8A,
-  kMkvSignatureHash           = 0x7E9A,
-  kMkvSignaturePublicKey      = 0x7EA5,
-  kMkvSignature               = 0x7EB5,
-  kMkvSignatureElements       = 0x7E5B,
-  kMkvSignatureElementList    = 0x7E7B,
-  kMkvSignedElement           = 0x6532,
-  //segment
-  kMkvSegment                 = 0x18538067,
-  //Meta Seek Information
-  kMkvSeekHead                = 0x114D9B74,
-  kMkvSeek                    = 0x4DBB,
-  kMkvSeekID                  = 0x53AB,
-  kMkvSeekPosition            = 0x53AC,
-  //Segment Information
-  kMkvInfo                    = 0x1549A966,
-  kMkvTimecodeScale           = 0x2AD7B1,
-  kMkvDuration                = 0x4489,
-  kMkvDateUTC                 = 0x4461,
-  kMkvMuxingApp               = 0x4D80,
-  kMkvWritingApp              = 0x5741,
-  //Cluster
-  kMkvCluster                 = 0x1F43B675,
-  kMkvTimecode                = 0xE7,
-  kMkvPrevSize                = 0xAB,
-  kMkvBlockGroup              = 0xA0,
-  kMkvBlock                   = 0xA1,
-  kMkvBlockDuration           = 0x9B,
-  kMkvReferenceBlock          = 0xFB,
-  kMkvLaceNumber              = 0xCC,
-  kMkvSimpleBlock             = 0xA3,
-  kMkvBlockAdditions          = 0x75A1,
-  kMkvBlockMore               = 0xA6,
-  kMkvBlockAddID              = 0xEE,
-  kMkvBlockAdditional         = 0xA5,
-  kMkvDiscardPadding          = 0x75A2,
-  //Track
-  kMkvTracks                  = 0x1654AE6B,
-  kMkvTrackEntry              = 0xAE,
-  kMkvTrackNumber             = 0xD7,
-  kMkvTrackUID                = 0x73C5,
-  kMkvTrackType               = 0x83,
-  kMkvFlagEnabled             = 0xB9,
-  kMkvFlagDefault             = 0x88,
-  kMkvFlagForced              = 0x55AA,
-  kMkvFlagLacing              = 0x9C,
-  kMkvDefaultDuration         = 0x23E383,
-  kMkvMaxBlockAdditionID      = 0x55EE,
-  kMkvName                    = 0x536E,
-  kMkvLanguage                = 0x22B59C,
-  kMkvCodecID                 = 0x86,
-  kMkvCodecPrivate            = 0x63A2,
-  kMkvCodecName               = 0x258688,
-  kMkvCodecDelay              = 0x56AA,
-  kMkvSeekPreRoll             = 0x56BB,
-  //video
-  kMkvVideo                   = 0xE0,
-  kMkvFlagInterlaced          = 0x9A,
-  kMkvStereoMode              = 0x53B8,
-  kMkvAlphaMode               = 0x53C0,
-  kMkvPixelWidth              = 0xB0,
-  kMkvPixelHeight             = 0xBA,
-  kMkvPixelCropBottom         = 0x54AA,
-  kMkvPixelCropTop            = 0x54BB,
-  kMkvPixelCropLeft           = 0x54CC,
-  kMkvPixelCropRight          = 0x54DD,
-  kMkvDisplayWidth            = 0x54B0,
-  kMkvDisplayHeight           = 0x54BA,
-  kMkvDisplayUnit             = 0x54B2,
-  kMkvAspectRatioType         = 0x54B3,
-  kMkvFrameRate               = 0x2383E3,
-  //end video
-  //audio
-  kMkvAudio                   = 0xE1,
-  kMkvSamplingFrequency       = 0xB5,
+  kMkvEBML = 0x1A45DFA3,
+  kMkvEBMLVersion = 0x4286,
+  kMkvEBMLReadVersion = 0x42F7,
+  kMkvEBMLMaxIDLength = 0x42F2,
+  kMkvEBMLMaxSizeLength = 0x42F3,
+  kMkvDocType = 0x4282,
+  kMkvDocTypeVersion = 0x4287,
+  kMkvDocTypeReadVersion = 0x4285,
+  kMkvVoid = 0xEC,
+  kMkvSignatureSlot = 0x1B538667,
+  kMkvSignatureAlgo = 0x7E8A,
+  kMkvSignatureHash = 0x7E9A,
+  kMkvSignaturePublicKey = 0x7EA5,
+  kMkvSignature = 0x7EB5,
+  kMkvSignatureElements = 0x7E5B,
+  kMkvSignatureElementList = 0x7E7B,
+  kMkvSignedElement = 0x6532,
+  // segment
+  kMkvSegment = 0x18538067,
+  // Meta Seek Information
+  kMkvSeekHead = 0x114D9B74,
+  kMkvSeek = 0x4DBB,
+  kMkvSeekID = 0x53AB,
+  kMkvSeekPosition = 0x53AC,
+  // Segment Information
+  kMkvInfo = 0x1549A966,
+  kMkvTimecodeScale = 0x2AD7B1,
+  kMkvDuration = 0x4489,
+  kMkvDateUTC = 0x4461,
+  kMkvMuxingApp = 0x4D80,
+  kMkvWritingApp = 0x5741,
+  // Cluster
+  kMkvCluster = 0x1F43B675,
+  kMkvTimecode = 0xE7,
+  kMkvPrevSize = 0xAB,
+  kMkvBlockGroup = 0xA0,
+  kMkvBlock = 0xA1,
+  kMkvBlockDuration = 0x9B,
+  kMkvReferenceBlock = 0xFB,
+  kMkvLaceNumber = 0xCC,
+  kMkvSimpleBlock = 0xA3,
+  kMkvBlockAdditions = 0x75A1,
+  kMkvBlockMore = 0xA6,
+  kMkvBlockAddID = 0xEE,
+  kMkvBlockAdditional = 0xA5,
+  kMkvDiscardPadding = 0x75A2,
+  // Track
+  kMkvTracks = 0x1654AE6B,
+  kMkvTrackEntry = 0xAE,
+  kMkvTrackNumber = 0xD7,
+  kMkvTrackUID = 0x73C5,
+  kMkvTrackType = 0x83,
+  kMkvFlagEnabled = 0xB9,
+  kMkvFlagDefault = 0x88,
+  kMkvFlagForced = 0x55AA,
+  kMkvFlagLacing = 0x9C,
+  kMkvDefaultDuration = 0x23E383,
+  kMkvMaxBlockAdditionID = 0x55EE,
+  kMkvName = 0x536E,
+  kMkvLanguage = 0x22B59C,
+  kMkvCodecID = 0x86,
+  kMkvCodecPrivate = 0x63A2,
+  kMkvCodecName = 0x258688,
+  kMkvCodecDelay = 0x56AA,
+  kMkvSeekPreRoll = 0x56BB,
+  // video
+  kMkvVideo = 0xE0,
+  kMkvFlagInterlaced = 0x9A,
+  kMkvStereoMode = 0x53B8,
+  kMkvAlphaMode = 0x53C0,
+  kMkvPixelWidth = 0xB0,
+  kMkvPixelHeight = 0xBA,
+  kMkvPixelCropBottom = 0x54AA,
+  kMkvPixelCropTop = 0x54BB,
+  kMkvPixelCropLeft = 0x54CC,
+  kMkvPixelCropRight = 0x54DD,
+  kMkvDisplayWidth = 0x54B0,
+  kMkvDisplayHeight = 0x54BA,
+  kMkvDisplayUnit = 0x54B2,
+  kMkvAspectRatioType = 0x54B3,
+  kMkvFrameRate = 0x2383E3,
+  // end video
+  // audio
+  kMkvAudio = 0xE1,
+  kMkvSamplingFrequency = 0xB5,
   kMkvOutputSamplingFrequency = 0x78B5,
-  kMkvChannels                = 0x9F,
-  kMkvBitDepth                = 0x6264,
-  //end audio
-  //ContentEncodings
-  kMkvContentEncodings        = 0x6D80,
-  kMkvContentEncoding         = 0x6240,
-  kMkvContentEncodingOrder    = 0x5031,
-  kMkvContentEncodingScope    = 0x5032,
-  kMkvContentEncodingType     = 0x5033,
-  kMkvContentEncryption       = 0x5035,
-  kMkvContentEncAlgo          = 0x47E1,
-  kMkvContentEncKeyID         = 0x47E2,
-  kMkvContentEncAESSettings   = 0x47E7,
-  kMkvAESSettingsCipherMode   = 0x47E8,
+  kMkvChannels = 0x9F,
+  kMkvBitDepth = 0x6264,
+  // end audio
+  // ContentEncodings
+  kMkvContentEncodings = 0x6D80,
+  kMkvContentEncoding = 0x6240,
+  kMkvContentEncodingOrder = 0x5031,
+  kMkvContentEncodingScope = 0x5032,
+  kMkvContentEncodingType = 0x5033,
+  kMkvContentEncryption = 0x5035,
+  kMkvContentEncAlgo = 0x47E1,
+  kMkvContentEncKeyID = 0x47E2,
+  kMkvContentEncAESSettings = 0x47E7,
+  kMkvAESSettingsCipherMode = 0x47E8,
   kMkvAESSettingsCipherInitData = 0x47E9,
-  //end ContentEncodings
-  //Cueing Data
-  kMkvCues                    = 0x1C53BB6B,
-  kMkvCuePoint                = 0xBB,
-  kMkvCueTime                 = 0xB3,
-  kMkvCueTrackPositions       = 0xB7,
-  kMkvCueTrack                = 0xF7,
-  kMkvCueClusterPosition      = 0xF1,
-  kMkvCueBlockNumber          = 0x5378,
-  //Chapters
-  kMkvChapters                = 0x1043A770,
-  kMkvEditionEntry            = 0x45B9,
-  kMkvChapterAtom             = 0xB6,
-  kMkvChapterUID              = 0x73C4,
-  kMkvChapterStringUID        = 0x5654,
-  kMkvChapterTimeStart        = 0x91,
-  kMkvChapterTimeEnd          = 0x92,
-  kMkvChapterDisplay          = 0x80,
-  kMkvChapString              = 0x85,
-  kMkvChapLanguage            = 0x437C,
-  kMkvChapCountry             = 0x437E
+  // end ContentEncodings
+  // Cueing Data
+  kMkvCues = 0x1C53BB6B,
+  kMkvCuePoint = 0xBB,
+  kMkvCueTime = 0xB3,
+  kMkvCueTrackPositions = 0xB7,
+  kMkvCueTrack = 0xF7,
+  kMkvCueClusterPosition = 0xF1,
+  kMkvCueBlockNumber = 0x5378,
+  // Chapters
+  kMkvChapters = 0x1043A770,
+  kMkvEditionEntry = 0x45B9,
+  kMkvChapterAtom = 0xB6,
+  kMkvChapterUID = 0x73C4,
+  kMkvChapterStringUID = 0x5654,
+  kMkvChapterTimeStart = 0x91,
+  kMkvChapterTimeEnd = 0x92,
+  kMkvChapterDisplay = 0x80,
+  kMkvChapString = 0x85,
+  kMkvChapLanguage = 0x437C,
+  kMkvChapCountry = 0x437E
 };
 
 }  // end namespace mkvmuxer
 
-#endif // WEBMIDS_HPP
+#endif  // WEBMIDS_HPP
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index 9e4918a..5b22b94 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -592,16 +592,20 @@
         CAT_XDEFINE cglobaled_, %1, 1
     %endif
     %xdefine current_function %1
-    %ifidn __OUTPUT_FORMAT__,elf
-        global %1:function hidden
-    %elifidn __OUTPUT_FORMAT__,elf32
-        global %1:function hidden
-    %elifidn __OUTPUT_FORMAT__,elf64
-        global %1:function hidden
-    %elifidn __OUTPUT_FORMAT__,macho32
-        global %1:private_extern
-    %elifidn __OUTPUT_FORMAT__,macho64
-        global %1:private_extern
+    %ifdef CHROMIUM
+        %ifidn __OUTPUT_FORMAT__,elf
+            global %1:function hidden
+        %elifidn __OUTPUT_FORMAT__,elf32
+            global %1:function hidden
+        %elifidn __OUTPUT_FORMAT__,elf64
+            global %1:function hidden
+        %elifidn __OUTPUT_FORMAT__,macho32
+            global %1:private_extern
+        %elifidn __OUTPUT_FORMAT__,macho64
+            global %1:private_extern
+        %else
+            global %1
+        %endif
     %else
         global %1
     %endif
diff --git a/vp8/common/arm/dequantize_arm.c b/vp8/common/arm/dequantize_arm.c
index 70e72aa..3e37e08 100644
--- a/vp8/common/arm/dequantize_arm.c
+++ b/vp8/common/arm/dequantize_arm.c
@@ -12,7 +12,7 @@
 #include "vpx_config.h"
 #include "vp8/common/blockd.h"
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
 #endif
 
@@ -20,7 +20,7 @@
 extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
 #endif
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 
 void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
 {
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index 3bdc967..5d693c6 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -25,7 +25,7 @@
 extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
 #endif
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 typedef void loopfilter_y_neon(unsigned char *src, int pitch,
         unsigned char blimit, unsigned char limit, unsigned char thresh);
 typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
@@ -118,7 +118,7 @@
 }
 #endif
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 /* NEON loopfilter functions */
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
index e3ea91f..a8730aa 100644
--- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -26,6 +26,7 @@
 
 |vp8_build_intra_predictors_mby_neon_func| PROC
     push            {r4-r8, lr}
+    vpush           {d8-d15}
 
     cmp             r3, #0
     beq             case_dc_pred
@@ -37,8 +38,8 @@
     beq             case_tm_pred
 
 case_dc_pred
-    ldr             r4, [sp, #24]       ; Up
-    ldr             r5, [sp, #28]       ; Left
+    ldr             r4, [sp, #88]       ; Up
+    ldr             r5, [sp, #92]       ; Left
 
     ; Default the DC average to 128
     mov             r12, #128
@@ -143,6 +144,7 @@
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_v_pred
     ; Copy down above row
@@ -165,6 +167,7 @@
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_h_pred
@@ -224,6 +227,7 @@
     vst1.u8         {q2}, [r1]!
     vst1.u8         {q3}, [r1]!
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_tm_pred
@@ -293,6 +297,7 @@
     subs            r12, r12, #1
     bne             case_tm_pred_loop
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
     ENDP
@@ -307,6 +312,7 @@
 
 |vp8_build_intra_predictors_mby_s_neon_func| PROC
     push            {r4-r8, lr}
+    vpush           {d8-d15}
 
     mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
 
@@ -320,8 +326,8 @@
     beq             case_tm_pred_s
 
 case_dc_pred_s
-    ldr             r4, [sp, #24]       ; Up
-    ldr             r5, [sp, #28]       ; Left
+    ldr             r4, [sp, #88]       ; Up
+    ldr             r5, [sp, #92]       ; Left
 
     ; Default the DC average to 128
     mov             r12, #128
@@ -426,6 +432,7 @@
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_v_pred_s
     ; Copy down above row
@@ -448,6 +455,8 @@
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
+
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_h_pred_s
@@ -507,6 +516,7 @@
     vst1.u8         {q2}, [r1], r2
     vst1.u8         {q3}, [r1], r2
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_tm_pred_s
@@ -576,6 +586,7 @@
     subs            r12, r12, #1
     bne             case_tm_pred_loop_s
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
deleted file mode 100644
index 6c29c55..0000000
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ /dev/null
@@ -1,79 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_0_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_0_2x_neon(short *q, short dq,
-;                            unsigned char *dst, int stride);
-; r0   *q
-; r1   dq
-; r2   *dst
-; r3   stride
-|idct_dequant_0_2x_neon| PROC
-    push            {r4, r5}
-
-    add             r12, r2, #4
-    vld1.32         {d2[0]}, [r2], r3
-    vld1.32         {d8[0]}, [r12], r3
-    vld1.32         {d2[1]}, [r2], r3
-    vld1.32         {d8[1]}, [r12], r3
-    vld1.32         {d4[0]}, [r2], r3
-    vld1.32         {d10[0]}, [r12], r3
-    vld1.32         {d4[1]}, [r2], r3
-    vld1.32         {d10[1]}, [r12], r3
-
-    ldrh            r12, [r0]               ; lo q
-    ldrh            r4, [r0, #32]           ; hi q
-    mov             r5, #0
-    strh            r5, [r0]
-    strh            r5, [r0, #32]
-
-    sxth            r12, r12                ; lo
-    mul             r0, r12, r1
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q0, r0
-    sxth            r4, r4                  ; hi
-    mul             r0, r4, r1
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q3, r0
-
-    vaddw.u8        q1, q0, d2              ; lo
-    vaddw.u8        q2, q0, d4
-    vaddw.u8        q4, q3, d8              ; hi
-    vaddw.u8        q5, q3, d10
-
-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride
-    add             r0, r2, #4
-
-    vqmovun.s16     d2, q1                  ; lo
-    vqmovun.s16     d4, q2
-    vqmovun.s16     d8, q4                  ; hi
-    vqmovun.s16     d10, q5
-
-    vst1.32         {d2[0]}, [r2], r3       ; lo
-    vst1.32         {d8[0]}, [r0], r3       ; hi
-    vst1.32         {d2[1]}, [r2], r3
-    vst1.32         {d8[1]}, [r0], r3
-    vst1.32         {d4[0]}, [r2], r3
-    vst1.32         {d10[0]}, [r0], r3
-    vst1.32         {d4[1]}, [r2]
-    vst1.32         {d10[1]}, [r0]
-
-    pop             {r4, r5}
-    bx              lr
-
-    ENDP            ; |idct_dequant_0_2x_neon|
-    END
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
new file mode 100644
index 0000000..967c322
--- /dev/null
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void idct_dequant_0_2x_neon(
+        int16_t *q,
+        int16_t dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0;
+    int i, a0, a1;
+    int16x8x2_t q2Add;
+    int32x2_t d2s32, d4s32;
+    uint8x8_t d2u8, d4u8;
+    uint16x8_t q1u16, q2u16;
+
+    a0 = ((q[0] * dq) + 4) >> 3;
+    a1 = ((q[16] * dq) + 4) >> 3;
+    q[0] = q[16] = 0;
+    q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+    q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+    for (i = 0; i < 2; i++, dst += 4) {
+        dst0 = dst;
+        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+        dst0 += stride;
+        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+        dst0 += stride;
+        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+        dst0 += stride;
+        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                         vreinterpret_u8_s32(d2s32));
+        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                         vreinterpret_u8_s32(d4s32));
+
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+        d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+        d2s32 = vreinterpret_s32_u8(d2u8);
+        d4s32 = vreinterpret_s32_u8(d4u8);
+
+        dst0 = dst;
+        vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+    }
+    return;
+}
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
deleted file mode 100644
index d5dce63..0000000
--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-;
-;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_full_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_full_2x_neon(short *q, short *dq,
-;                               unsigned char *dst, int stride);
-; r0    *q,
-; r1    *dq,
-; r2    *dst
-; r3    stride
-|idct_dequant_full_2x_neon| PROC
-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
-    vld1.16         {q2, q3}, [r0]          ; l q
-    add             r0, r0, #32
-    vld1.16         {q4, q5}, [r0]          ; r q
-    add             r12, r2, #4
-
-    ; interleave the predictors
-    vld1.32         {d28[0]}, [r2],  r3     ; l pre
-    vld1.32         {d28[1]}, [r12], r3     ; r pre
-    vld1.32         {d29[0]}, [r2],  r3
-    vld1.32         {d29[1]}, [r12], r3
-    vld1.32         {d30[0]}, [r2],  r3
-    vld1.32         {d30[1]}, [r12], r3
-    vld1.32         {d31[0]}, [r2],  r3
-    vld1.32         {d31[1]}, [r12]
-
-    adr             r1, cospi8sqrt2minus1   ; pointer to the first constant
-
-    ; dequant: q[i] = q[i] * dq[i]
-    vmul.i16        q2, q2, q0
-    vmul.i16        q3, q3, q1
-    vmul.i16        q4, q4, q0
-    vmul.i16        q5, q5, q1
-
-    vld1.16         {d0}, [r1]
-
-    ; q2: l0r0  q3: l8r8
-    ; q4: l4r4  q5: l12r12
-    vswp            d5, d8
-    vswp            d7, d10
-
-    ; _CONSTANTS_ * 4,12 >> 16
-    ; q6:  4 * sinpi : c1/temp1
-    ; q7: 12 * sinpi : d1/temp2
-    ; q8:  4 * cospi
-    ; q9: 12 * cospi
-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q7, q5, d0[2]
-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
-    vqdmulh.s16     q9, q5, d0[0]
-
-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
-
-    ; vqdmulh only accepts signed values. this was a problem because
-    ; our constant had the high bit set, and was treated as a negative value.
-    ; vqdmulh also doubles the value before it shifts by 16. we need to
-    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
-    ; so we can shift the constant without losing precision. this avoids
-    ; shift again afterward, but also avoids the sign issue. win win!
-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
-    ; pre-shift it
-    vshr.s16        q8, q8, #1
-    vshr.s16        q9, q9, #1
-
-    ; q4:  4 +  4 * cospi : d1/temp1
-    ; q5: 12 + 12 * cospi : c1/temp2
-    vqadd.s16       q4, q4, q8
-    vqadd.s16       q5, q5, q9
-
-    ; c1 = temp1 - temp2
-    ; d1 = temp1 + temp2
-    vqsub.s16       q2, q6, q5
-    vqadd.s16       q3, q4, q7
-
-    ; [0]: a1+d1
-    ; [1]: b1+c1
-    ; [2]: b1-c1
-    ; [3]: a1-d1
-    vqadd.s16       q4, q10, q3
-    vqadd.s16       q5, q11, q2
-    vqsub.s16       q6, q11, q2
-    vqsub.s16       q7, q10, q3
-
-    ; rotate
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-    ; idct loop 2
-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
-    ; q6: l 2, 6,10,14 r 2, 6,10,14
-    ; q7: l 3, 7,11,15 r 3, 7,11,15
-
-    ; q8:  1 * sinpi : c1/temp1
-    ; q9:  3 * sinpi : d1/temp2
-    ; q10: 1 * cospi
-    ; q11: 3 * cospi
-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q9, q7, d0[2]
-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
-    vqdmulh.s16     q11, q7, d0[0]
-
-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
-
-    ; see note on shifting above
-    vshr.s16        q10, q10, #1
-    vshr.s16        q11, q11, #1
-
-    ; q10: 1 + 1 * cospi : d1/temp1
-    ; q11: 3 + 3 * cospi : c1/temp2
-    vqadd.s16       q10, q5, q10
-    vqadd.s16       q11, q7, q11
-
-    ; q8: c1 = temp1 - temp2
-    ; q9: d1 = temp1 + temp2
-    vqsub.s16       q8, q8, q11
-    vqadd.s16       q9, q10, q9
-
-    ; a1+d1
-    ; b1+c1
-    ; b1-c1
-    ; a1-d1
-    vqadd.s16       q4, q2, q9
-    vqadd.s16       q5, q3, q8
-    vqsub.s16       q6, q3, q8
-    vqsub.s16       q7, q2, q9
-
-    ; +4 >> 3 (rounding)
-    vrshr.s16       q4, q4, #3              ; lo
-    vrshr.s16       q5, q5, #3
-    vrshr.s16       q6, q6, #3              ; hi
-    vrshr.s16       q7, q7, #3
-
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-
-    ; adding pre
-    ; input is still packed. pre was read interleaved
-    vaddw.u8        q4, q4, d28
-    vaddw.u8        q5, q5, d29
-    vaddw.u8        q6, q6, d30
-    vaddw.u8        q7, q7, d31
-
-    vmov.i16        q14, #0
-    vmov            q15, q14
-    vst1.16         {q14, q15}, [r0]        ; write over high input
-    sub             r0, r0, #32
-    vst1.16         {q14, q15}, [r0]        ; write over low input
-
-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride
-    add             r1, r2, #4              ; hi
-
-    ;saturate and narrow
-    vqmovun.s16     d0, q4                  ; lo
-    vqmovun.s16     d1, q5
-    vqmovun.s16     d2, q6                  ; hi
-    vqmovun.s16     d3, q7
-
-    vst1.32         {d0[0]}, [r2], r3       ; lo
-    vst1.32         {d0[1]}, [r1], r3       ; hi
-    vst1.32         {d1[0]}, [r2], r3
-    vst1.32         {d1[1]}, [r1], r3
-    vst1.32         {d2[0]}, [r2], r3
-    vst1.32         {d2[1]}, [r1], r3
-    vst1.32         {d3[0]}, [r2]
-    vst1.32         {d3[1]}, [r1]
-
-    bx             lr
-
-    ENDP           ; |idct_dequant_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2       DCD 0x4546
-
-    END
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
new file mode 100644
index 0000000..a60ed46
--- /dev/null
+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
@@ -0,0 +1,185 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2       = 17734;
+// because the lowest bit in 0x8a8c is 0, we can pre-shift this
+
+void idct_dequant_full_2x_neon(
+        int16_t *q,
+        int16_t *dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0, *dst1;
+    int32x2_t d28, d29, d30, d31;
+    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x4x2_t q2tmp0, q2tmp1;
+    int16x8x2_t q2tmp2, q2tmp3;
+    int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
+    d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+    // load dq
+    q0 = vld1q_s16(dq);
+    dq += 8;
+    q1 = vld1q_s16(dq);
+
+    // load q
+    q2 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q3 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q4 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q5 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+
+    // load src from dst
+    dst0 = dst;
+    dst1 = dst + 4;
+    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+    q2 = vmulq_s16(q2, q0);
+    q3 = vmulq_s16(q3, q1);
+    q4 = vmulq_s16(q4, q0);
+    q5 = vmulq_s16(q5, q1);
+
+    // vswp
+    dLow0 = vget_low_s16(q2);
+    dHigh0 = vget_high_s16(q2);
+    dLow1 = vget_low_s16(q4);
+    dHigh1 = vget_high_s16(q4);
+    q2 = vcombine_s16(dLow0, dLow1);
+    q4 = vcombine_s16(dHigh0, dHigh1);
+
+    dLow0 = vget_low_s16(q3);
+    dHigh0 = vget_high_s16(q3);
+    dLow1 = vget_low_s16(q5);
+    dHigh1 = vget_high_s16(q5);
+    q3 = vcombine_s16(dLow0, dLow1);
+    q5 = vcombine_s16(dHigh0, dHigh1);
+
+    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+    q10 = vqaddq_s16(q2, q3);
+    q11 = vqsubq_s16(q2, q3);
+
+    q8 = vshrq_n_s16(q8, 1);
+    q9 = vshrq_n_s16(q9, 1);
+
+    q4 = vqaddq_s16(q4, q8);
+    q5 = vqaddq_s16(q5, q9);
+
+    q2 = vqsubq_s16(q6, q5);
+    q3 = vqaddq_s16(q7, q4);
+
+    q4 = vqaddq_s16(q10, q3);
+    q5 = vqaddq_s16(q11, q2);
+    q6 = vqsubq_s16(q11, q2);
+    q7 = vqsubq_s16(q10, q3);
+
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+    // loop 2
+    q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+    q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+    q10 = vshrq_n_s16(q10, 1);
+    q11 = vshrq_n_s16(q11, 1);
+
+    q10 = vqaddq_s16(q2tmp2.val[1], q10);
+    q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+    q8 = vqsubq_s16(q8, q11);
+    q9 = vqaddq_s16(q9, q10);
+
+    q4 = vqaddq_s16(q2, q9);
+    q5 = vqaddq_s16(q3, q8);
+    q6 = vqsubq_s16(q3, q8);
+    q7 = vqsubq_s16(q2, q9);
+
+    q4 = vrshrq_n_s16(q4, 3);
+    q5 = vrshrq_n_s16(q5, 3);
+    q6 = vrshrq_n_s16(q6, 3);
+    q7 = vrshrq_n_s16(q7, 3);
+
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+    q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
+                                          vreinterpret_u8_s32(d28)));
+    q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
+                                          vreinterpret_u8_s32(d29)));
+    q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
+                                          vreinterpret_u8_s32(d30)));
+    q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
+                                          vreinterpret_u8_s32(d31)));
+
+    d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+    d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+    d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+    d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+    dst0 = dst;
+    dst1 = dst + 4;
+    vst1_lane_s32((int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    vst1_lane_s32((int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    vst1_lane_s32((int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    vst1_lane_s32((int32_t *)dst0, d31, 0);
+    vst1_lane_s32((int32_t *)dst1, d31, 1);
+    return;
+}
diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm
deleted file mode 100644
index e8ea2a6..0000000
--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ /dev/null
@@ -1,87 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-    EXPORT  |vp8_short_inv_walsh4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
-|vp8_short_inv_walsh4x4_neon| PROC
-
-    ; read in all four lines of values: d0->d3
-    vld1.i16 {q0-q1}, [r0@128]
-
-    ; first for loop
-    vadd.s16 d4, d0, d3 ;a = [0] + [12]
-    vadd.s16 d6, d1, d2 ;b = [4] + [8]
-    vsub.s16 d5, d0, d3 ;d = [0] - [12]
-    vsub.s16 d7, d1, d2 ;c = [4] - [8]
-
-    vadd.s16 q0, q2, q3 ; a+b d+c
-    vsub.s16 q1, q2, q3 ; a-b d-c
-
-    vtrn.32 d0, d2 ;d0:  0  1  8  9
-                   ;d2:  2  3 10 11
-    vtrn.32 d1, d3 ;d1:  4  5 12 13
-                   ;d3:  6  7 14 15
-
-    vtrn.16 d0, d1 ;d0:  0  4  8 12
-                   ;d1:  1  5  9 13
-    vtrn.16 d2, d3 ;d2:  2  6 10 14
-                   ;d3:  3  7 11 15
-
-    ; second for loop
-
-    vadd.s16 d4, d0, d3 ;a = [0] + [3]
-    vadd.s16 d6, d1, d2 ;b = [1] + [2]
-    vsub.s16 d5, d0, d3 ;d = [0] - [3]
-    vsub.s16 d7, d1, d2 ;c = [1] - [2]
-
-    vmov.i16 q8, #3
-
-    vadd.s16 q0, q2, q3 ; a+b d+c
-    vsub.s16 q1, q2, q3 ; a-b d-c
-
-    vadd.i16 q0, q0, q8 ;e/f += 3
-    vadd.i16 q1, q1, q8 ;g/h += 3
-
-    vshr.s16 q0, q0, #3 ;e/f >> 3
-    vshr.s16 q1, q1, #3 ;g/h >> 3
-
-    mov      r2, #64
-    add      r3, r1, #32
-
-    vst1.i16 d0[0], [r1],r2
-    vst1.i16 d1[0], [r3],r2
-    vst1.i16 d2[0], [r1],r2
-    vst1.i16 d3[0], [r3],r2
-
-    vst1.i16 d0[1], [r1],r2
-    vst1.i16 d1[1], [r3],r2
-    vst1.i16 d2[1], [r1],r2
-    vst1.i16 d3[1], [r3],r2
-
-    vst1.i16 d0[2], [r1],r2
-    vst1.i16 d1[2], [r3],r2
-    vst1.i16 d2[2], [r1],r2
-    vst1.i16 d3[2], [r3],r2
-
-    vst1.i16 d0[3], [r1],r2
-    vst1.i16 d1[3], [r3],r2
-    vst1.i16 d2[3], [r1]
-    vst1.i16 d3[3], [r3]
-
-    bx lr
-    ENDP    ; |vp8_short_inv_walsh4x4_neon|
-
-    END
diff --git a/vp8/common/arm/neon/iwalsh_neon.c b/vp8/common/arm/neon/iwalsh_neon.c
new file mode 100644
index 0000000..6ea9dd7
--- /dev/null
+++ b/vp8/common/arm/neon/iwalsh_neon.c
@@ -0,0 +1,102 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_inv_walsh4x4_neon(
+        int16_t *input,
+        int16_t *mb_dqcoeff) {
+    int16x8_t q0s16, q1s16, q2s16, q3s16;
+    int16x4_t d4s16, d5s16, d6s16, d7s16;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+    int16x8_t qAdd3;
+
+    q0s16 = vld1q_s16(input);
+    q1s16 = vld1q_s16(input + 8);
+
+    // 1st for loop
+    d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+    d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+    d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+    d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    q0s16 = vaddq_s16(q2s16, q3s16);
+    q1s16 = vsubq_s16(q2s16, q3s16);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
+                      vreinterpret_s32_s16(vget_low_s16(q1s16)));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
+                      vreinterpret_s32_s16(vget_high_s16(q1s16)));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
+                      vreinterpret_s16_s32(v2tmp3.val[0]));
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
+                      vreinterpret_s16_s32(v2tmp3.val[1]));
+
+    // 2nd for loop
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    qAdd3 = vdupq_n_s16(3);
+
+    q0s16 = vaddq_s16(q2s16, q3s16);
+    q1s16 = vsubq_s16(q2s16, q3s16);
+
+    q0s16 = vaddq_s16(q0s16, qAdd3);
+    q1s16 = vaddq_s16(q1s16, qAdd3);
+
+    q0s16 = vshrq_n_s16(q0s16, 3);
+    q1s16 = vshrq_n_s16(q1s16, 3);
+
+    // store
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
+    mb_dqcoeff += 16;
+    return;
+}
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
index e44be0a..c4f09c7 100644
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -24,10 +24,12 @@
 ; sp    unsigned char thresh,
 |vp8_loop_filter_horizontal_edge_y_neon| PROC
     push        {lr}
+    vpush       {d8-d15}
+
     vdup.u8     q0, r2                     ; duplicate blimit
     vdup.u8     q1, r3                     ; duplicate limit
     sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
-    ldr         r3, [sp, #4]               ; load thresh
+    ldr         r3, [sp, #68]              ; load thresh
     add         r12, r2, r1
     add         r1, r1, r1
 
@@ -52,6 +54,7 @@
     vst1.u8     {q7}, [r2@128], r1              ; store oq0
     vst1.u8     {q8}, [r12@128], r1             ; store oq1
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
 
@@ -64,10 +67,12 @@
 ; sp+4  unsigned char *v
 |vp8_loop_filter_horizontal_edge_uv_neon| PROC
     push        {lr}
+    vpush       {d8-d15}
+
     vdup.u8     q0, r2                      ; duplicate blimit
     vdup.u8     q1, r3                      ; duplicate limit
-    ldr         r12, [sp, #4]               ; load thresh
-    ldr         r2, [sp, #8]                ; load v ptr
+    ldr         r12, [sp, #68]              ; load thresh
+    ldr         r2, [sp, #72]               ; load v ptr
     vdup.u8     q2, r12                     ; duplicate thresh
 
     sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
@@ -104,6 +109,7 @@
     vst1.u8     {d16}, [r0@64]                 ; store u oq1
     vst1.u8     {d17}, [r2@64]                 ; store v oq1
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_uv_neon|
 
@@ -120,11 +126,13 @@
 
 |vp8_loop_filter_vertical_edge_y_neon| PROC
     push        {lr}
+    vpush       {d8-d15}
+
     vdup.u8     q0, r2                     ; duplicate blimit
     vdup.u8     q1, r3                     ; duplicate limit
     sub         r2, r0, #4                 ; src ptr down by 4 columns
     add         r1, r1, r1
-    ldr         r3, [sp, #4]               ; load thresh
+    ldr         r3, [sp, #68]              ; load thresh
     add         r12, r2, r1, asr #1
 
     vld1.u8     {d6}, [r2], r1
@@ -194,6 +202,7 @@
     vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
     vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
 
@@ -210,9 +219,11 @@
 ; sp+4  unsigned char *v
 |vp8_loop_filter_vertical_edge_uv_neon| PROC
     push        {lr}
+    vpush       {d8-d15}
+
     vdup.u8     q0, r2                      ; duplicate blimit
     sub         r12, r0, #4                 ; move u pointer down by 4 columns
-    ldr         r2, [sp, #8]                ; load v ptr
+    ldr         r2, [sp, #72]               ; load v ptr
     vdup.u8     q1, r3                      ; duplicate limit
     sub         r3, r2, #4                  ; move v pointer down by 4 columns
 
@@ -233,7 +244,7 @@
     vld1.u8     {d20}, [r12]
     vld1.u8     {d21}, [r3]
 
-    ldr        r12, [sp, #4]               ; load thresh
+    ldr        r12, [sp, #68]              ; load thresh
 
     ;transpose to 8x16 matrix
     vtrn.32     q3, q7
@@ -281,6 +292,7 @@
     vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
     vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
 
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
deleted file mode 100644
index adf848b..0000000
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ /dev/null
@@ -1,117 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ;EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
-    EXPORT  |vp8_loop_filter_bhs_neon|
-    EXPORT  |vp8_loop_filter_mbhs_neon|
-    ARM
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *s, PRESERVE
-; r1    int p, PRESERVE
-; q1    limit, PRESERVE
-
-|vp8_loop_filter_simple_horizontal_edge_neon| PROC
-
-    sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
-
-    vld1.u8     {q7}, [r0@128], r1          ; q0
-    vld1.u8     {q5}, [r3@128], r1          ; p0
-    vld1.u8     {q8}, [r0@128]              ; q1
-    vld1.u8     {q6}, [r3@128]              ; p1
-
-    vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
-    vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
-
-    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
-    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
-    vmov.u8     q0, #0x80                   ; 0x80
-    vmov.s16    q13, #3
-    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
-    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
-    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
-    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
-    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
-
-    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
-
-    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
-    vsubl.s8    q3, d15, d13
-
-    vqsub.s8    q4, q5, q8                  ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
-
-    vmul.s16    q2, q2, q13                 ;  3 * ( qs0 - ps0)
-    vmul.s16    q3, q3, q13
-
-    vmov.u8     q10, #0x03                  ; 0x03
-    vmov.u8     q9, #0x04                   ; 0x04
-
-    vaddw.s8    q2, q2, d8                  ; vp8_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q3, q3, d9
-
-    vqmovn.s16  d8, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d9, q3
-
-    vand        q14, q4, q15                ; vp8_filter &= mask
-
-    vqadd.s8    q2, q14, q10                ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
-    vqadd.s8    q3, q14, q9                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q4, q3, #3                  ; Filter1 >>= 3
-
-    sub         r0, r0, r1
-
-    ;calculate output
-    vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
-    vqsub.s8    q10, q7, q4                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-
-    vst1.u8     {q6}, [r3@128]              ; store op0
-    vst1.u8     {q7}, [r0@128]              ; store oq0
-
-    bx          lr
-    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp8_loop_filter_bhs_neon| PROC
-    push        {r4, lr}
-    ldrb        r3, [r2]                    ; load blim from mem
-    vdup.s8     q1, r3                      ; duplicate blim
-
-    add         r0, r0, r1, lsl #2          ; src = y_ptr + 4 * y_stride
-    bl          vp8_loop_filter_simple_horizontal_edge_neon
-    ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
-    add         r0, r0, r1, lsl #2          ; src = y_ptr + 8* y_stride
-    bl          vp8_loop_filter_simple_horizontal_edge_neon
-    add         r0, r0, r1, lsl #2          ; src = y_ptr + 12 * y_stride
-    pop         {r4, lr}
-    b           vp8_loop_filter_simple_horizontal_edge_neon
-    ENDP        ;|vp8_loop_filter_bhs_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp8_loop_filter_mbhs_neon| PROC
-    ldrb        r3, [r2]                   ; load blim from mem
-    vdup.s8     q1, r3                     ; duplicate mblim
-    b           vp8_loop_filter_simple_horizontal_edge_neon
-    ENDP        ;|vp8_loop_filter_bhs_neon|
-
-    END
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
new file mode 100644
index 0000000..b25686f
--- /dev/null
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
+        unsigned char *s,
+        int p,
+        const unsigned char *blimit) {
+    uint8_t *sp;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
+    int16x8_t q2s16, q3s16, q13s16;
+    int8x8_t d8s8, d9s8;
+    int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    sp = s - (p << 1);
+    q5u8 = vld1q_u8(sp);
+    sp += p;
+    q6u8 = vld1q_u8(sp);
+    sp += p;
+    q7u8 = vld1q_u8(sp);
+    sp += p;
+    q8u8 = vld1q_u8(sp);
+
+    q15u8 = vabdq_u8(q6u8, q7u8);
+    q14u8 = vabdq_u8(q5u8, q8u8);
+
+    q15u8 = vqaddq_u8(q15u8, q15u8);
+    q14u8 = vshrq_n_u8(q14u8, 1);
+    q0u8 = vdupq_n_u8(0x80);
+    q13s16 = vdupq_n_s16(3);
+    q15u8 = vqaddq_u8(q15u8, q14u8);
+
+    q5u8 = veorq_u8(q5u8, q0u8);
+    q6u8 = veorq_u8(q6u8, q0u8);
+    q7u8 = veorq_u8(q7u8, q0u8);
+    q8u8 = veorq_u8(q8u8, q0u8);
+
+    q15u8 = vcgeq_u8(qblimit, q15u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6u8)));
+    q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
+                     vget_high_s8(vreinterpretq_s8_u8(q6u8)));
+
+    q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
+                     vreinterpretq_s8_u8(q8u8));
+
+    q2s16 = vmulq_s16(q2s16, q13s16);
+    q3s16 = vmulq_s16(q3s16, q13s16);
+
+    q10u8 = vdupq_n_u8(3);
+    q9u8 = vdupq_n_u8(4);
+
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
+    q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
+
+    d8s8 = vqmovn_s16(q2s16);
+    d9s8 = vqmovn_s16(q3s16);
+    q4s8 = vcombine_s8(d8s8, d9s8);
+
+    q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
+
+    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
+    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q3s8 = vshrq_n_s8(q3s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
+
+    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+    vst1q_u8(s, q7u8);
+    s -= p;
+    vst1q_u8(s, q6u8);
+    return;
+}
+
+void vp8_loop_filter_bhs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
+
+void vp8_loop_filter_mbhs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
index e690df2..78d13c8 100644
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -9,7 +9,6 @@
 ;
 
 
-    ;EXPORT  |vp8_loop_filter_simple_vertical_edge_neon|
     EXPORT |vp8_loop_filter_bvs_neon|
     EXPORT |vp8_loop_filter_mbvs_neon|
     ARM
@@ -22,6 +21,8 @@
 ; q1    limit, PRESERVE
 
 |vp8_loop_filter_simple_vertical_edge_neon| PROC
+    vpush       {d8-d15}
+
     sub         r0, r0, #2                  ; move src pointer down by 2 columns
     add         r12, r1, r1
     add         r3, r0, r1
@@ -120,6 +121,7 @@
     vst2.8      {d14[6], d15[6]}, [r0], r12
     vst2.8      {d14[7], d15[7]}, [r3]
 
+    vpop        {d8-d15}
     bx          lr
     ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
 
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm
deleted file mode 100644
index f41c156..0000000
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ /dev/null
@@ -1,469 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mbloop_filter_horizontal_edge_y_neon|
-    EXPORT  |vp8_mbloop_filter_horizontal_edge_uv_neon|
-    EXPORT  |vp8_mbloop_filter_vertical_edge_y_neon|
-    EXPORT  |vp8_mbloop_filter_vertical_edge_uv_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-;                                               const unsigned char *blimit,
-;                                               const unsigned char *limit,
-;                                               const unsigned char *thresh)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
-    push        {lr}
-    add         r1, r1, r1                  ; double stride
-    ldr         r12, [sp, #4]               ; load thresh
-    sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
-    vdup.u8     q2, r12                     ; thresh
-    add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line
-
-    vld1.u8     {q3}, [r0@128], r1              ; p3
-    vld1.u8     {q4}, [r12@128], r1             ; p2
-    vld1.u8     {q5}, [r0@128], r1              ; p1
-    vld1.u8     {q6}, [r12@128], r1             ; p0
-    vld1.u8     {q7}, [r0@128], r1              ; q0
-    vld1.u8     {q8}, [r12@128], r1             ; q1
-    vld1.u8     {q9}, [r0@128], r1              ; q2
-    vld1.u8     {q10}, [r12@128], r1            ; q3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #2
-    add         r0, r12, r1, lsr #1
-
-    vst1.u8     {q4}, [r12@128],r1         ; store op2
-    vst1.u8     {q5}, [r0@128],r1          ; store op1
-    vst1.u8     {q6}, [r12@128], r1        ; store op0
-    vst1.u8     {q7}, [r0@128],r1          ; store oq0
-    vst1.u8     {q8}, [r12@128]            ; store oq1
-    vst1.u8     {q9}, [r0@128]             ; store oq2
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
-
-; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
-;                                                const unsigned char *blimit,
-;                                                const unsigned char *limit,
-;                                                const unsigned char *thresh,
-;                                                unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-
-|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
-    push        {lr}
-    ldr         r12, [sp, #4]                 ; load thresh
-    sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
-    vdup.u8     q2, r12                       ; thresh
-    ldr         r12, [sp, #8]                 ; load v ptr
-    sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines
-
-    vld1.u8     {d6}, [r0@64], r1              ; p3
-    vld1.u8     {d7}, [r12@64], r1              ; p3
-    vld1.u8     {d8}, [r0@64], r1              ; p2
-    vld1.u8     {d9}, [r12@64], r1              ; p2
-    vld1.u8     {d10}, [r0@64], r1             ; p1
-    vld1.u8     {d11}, [r12@64], r1             ; p1
-    vld1.u8     {d12}, [r0@64], r1             ; p0
-    vld1.u8     {d13}, [r12@64], r1             ; p0
-    vld1.u8     {d14}, [r0@64], r1             ; q0
-    vld1.u8     {d15}, [r12@64], r1             ; q0
-    vld1.u8     {d16}, [r0@64], r1             ; q1
-    vld1.u8     {d17}, [r12@64], r1             ; q1
-    vld1.u8     {d18}, [r0@64], r1             ; q2
-    vld1.u8     {d19}, [r12@64], r1             ; q2
-    vld1.u8     {d20}, [r0@64], r1             ; q3
-    vld1.u8     {d21}, [r12@64], r1             ; q3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r0, r0, r1, lsl #3
-    sub         r12, r12, r1, lsl #3
-
-    add         r0, r0, r1
-    add         r12, r12, r1
-
-    vst1.u8     {d8}, [r0@64], r1              ; store u op2
-    vst1.u8     {d9}, [r12@64], r1              ; store v op2
-    vst1.u8     {d10}, [r0@64], r1             ; store u op1
-    vst1.u8     {d11}, [r12@64], r1             ; store v op1
-    vst1.u8     {d12}, [r0@64], r1             ; store u op0
-    vst1.u8     {d13}, [r12@64], r1             ; store v op0
-    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
-    vst1.u8     {d15}, [r12@64], r1             ; store v oq0
-    vst1.u8     {d16}, [r0@64], r1             ; store u oq1
-    vst1.u8     {d17}, [r12@64], r1             ; store v oq1
-    vst1.u8     {d18}, [r0@64], r1             ; store u oq2
-    vst1.u8     {d19}, [r12@64], r1             ; store v oq2
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
-
-; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                             const unsigned char *blimit,
-;                                             const unsigned char *limit,
-;                                             const unsigned char *thresh)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp8_mbloop_filter_vertical_edge_y_neon| PROC
-    push        {lr}
-    ldr         r12, [sp, #4]               ; load thresh
-    sub         r0, r0, #4                  ; move src pointer down by 4 columns
-    vdup.s8     q2, r12                     ; thresh
-    add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines
-
-    vld1.u8     {d6}, [r0], r1              ; load first 8-line src data
-    vld1.u8     {d7}, [r12], r1             ; load second 8-line src data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r12], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r12], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r12], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r12], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         r0, r0, r1, lsl #3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #3
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r12], r1
-    vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r12], r1
-    vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r12], r1
-    vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r12], r1
-    vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r12], r1
-    vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r12], r1
-    vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r12], r1
-    vst1.8      {d20}, [r0]
-    vst1.8      {d21}, [r12]
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
-
-; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-;                                              const unsigned char *blimit,
-;                                              const unsigned char *limit,
-;                                              const unsigned char *thresh,
-;                                              unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  unsigned char *v
-|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
-    push        {lr}
-    ldr         r12, [sp, #4]               ; load thresh
-    sub         r0, r0, #4                  ; move u pointer down by 4 columns
-    vdup.u8     q2, r12                     ; thresh
-    ldr         r12, [sp, #8]               ; load v ptr
-    sub         r12, r12, #4                ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r0], r1              ;load u data
-    vld1.u8     {d7}, [r12], r1             ;load v data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r12], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r12], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r12], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r12], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         r0, r0, r1, lsl #3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #3
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r12], r1
-    vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r12], r1
-    vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r12], r1
-    vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r12], r1
-    vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r12], r1
-    vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r12], r1
-    vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r12], r1
-    vst1.8      {d20}, [r0]
-    vst1.8      {d21}, [r12]
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
-
-; void vp8_mbloop_filter_neon()
-; This is a helper function for the macroblock loopfilters. The individual
-; functions do the necessary load, transpose (if necessary), preserve (if
-; necessary) and store.
-
-; r0,r1 PRESERVE
-; r2    mblimit
-; r3    limit
-
-; q2    thresh
-; q3    p3 PRESERVE
-; q4    p2
-; q5    p1
-; q6    p0
-; q7    q0
-; q8    q1
-; q9    q2
-; q10   q3 PRESERVE
-
-|vp8_mbloop_filter_neon| PROC
-
-    ; vp8_filter_mask
-    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
-    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
-    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
-    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q1, q9, q8                  ; abs(q2 - q1)
-    vabd.u8     q0, q10, q9                 ; abs(q3 - q2)
-
-    vmax.u8     q11, q11, q12
-    vmax.u8     q12, q13, q14
-    vmax.u8     q1, q1, q0
-    vmax.u8     q15, q11, q12
-
-    vabd.u8     q12, q6, q7                 ; abs(p0 - q0)
-
-    ; vp8_hevmask
-    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh) * -1
-    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh) * -1
-    vmax.u8     q15, q15, q1
-
-    vdup.u8     q1, r3                      ; limit
-    vdup.u8     q2, r2                      ; mblimit
-
-    vmov.u8     q0, #0x80                   ; 0x80
-
-    vcge.u8     q15, q1, q15
-
-    vabd.u8     q1, q5, q8                  ; a = abs(p1 - q1)
-    vqadd.u8    q12, q12, q12               ; b = abs(p0 - q0) * 2
-    vmov.u16    q11, #3                     ; #3
-
-    ; vp8_filter
-    ; convert to signed
-    veor        q7, q7, q0                  ; qs0
-    vshr.u8     q1, q1, #1                  ; a = a / 2
-    veor        q6, q6, q0                  ; ps0
-    veor        q5, q5, q0                  ; ps1
-
-    vqadd.u8    q12, q12, q1                ; a = b + a
-
-    veor        q8, q8, q0                  ; qs1
-    veor        q4, q4, q0                  ; ps2
-    veor        q9, q9, q0                  ; qs2
-
-    vorr        q14, q13, q14               ; vp8_hevmask
-
-    vcge.u8     q12, q2, q12                ; (a > flimit * 2 + limit) * -1
-
-    vsubl.s8    q2, d14, d12                ; qs0 - ps0
-    vsubl.s8    q13, d15, d13
-
-    vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
-
-    vmul.i16    q2, q2, q11                 ; 3 * ( qs0 - ps0)
-
-    vand        q15, q15, q12               ; vp8_filter_mask
-
-    vmul.i16    q13, q13, q11
-
-    vmov.u8     q12, #3                     ; #3
-
-    vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d3
-
-    vmov.u8     q11, #4                     ; #4
-
-    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d2, q2
-    vqmovn.s16  d3, q13
-
-    vand        q1, q1, q15                 ; vp8_filter &= mask
-
-    vmov.u16    q15, #63                    ; #63
-
-    vand        q13, q1, q14                ; Filter2 &= hev
-
-    vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
-    vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)
-
-    vmov        q0, q15
-
-    vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
-    vshr.s8     q13, q13, #3                ; Filter2 >>= 3
-
-    vmov        q11, q15
-    vmov        q12, q15
-
-    vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)
-
-    vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)
-
-    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
-
-    ; roughly 1/7th difference across boundary
-    ; roughly 2/7th difference across boundary
-    ; roughly 3/7th difference across boundary
-
-    vmov.u8     d5, #9                      ; #9
-    vmov.u8     d4, #18                     ; #18
-
-    vmov        q13, q15
-    vmov        q14, q15
-
-    vmlal.s8    q0, d2, d5                  ; 63 + Filter2 * 9
-    vmlal.s8    q11, d3, d5
-    vmov.u8     d5, #27                     ; #27
-    vmlal.s8    q12, d2, d4                 ; 63 + Filter2 * 18
-    vmlal.s8    q13, d3, d4
-    vmlal.s8    q14, d2, d5                 ; 63 + Filter2 * 27
-    vmlal.s8    q15, d3, d5
-
-    vqshrn.s16  d0, q0, #7                  ; u = clamp((63 + Filter2 * 9)>>7)
-    vqshrn.s16  d1, q11, #7
-    vqshrn.s16  d24, q12, #7                ; u = clamp((63 + Filter2 * 18)>>7)
-    vqshrn.s16  d25, q13, #7
-    vqshrn.s16  d28, q14, #7                ; u = clamp((63 + Filter2 * 27)>>7)
-    vqshrn.s16  d29, q15, #7
-
-    vmov.u8     q1, #0x80                   ; 0x80
-
-    vqsub.s8    q11, q9, q0                 ; s = clamp(qs2 - u)
-    vqadd.s8    q0, q4, q0                  ; s = clamp(ps2 + u)
-    vqsub.s8    q13, q8, q12                ; s = clamp(qs1 - u)
-    vqadd.s8    q12, q5, q12                ; s = clamp(ps1 + u)
-    vqsub.s8    q15, q7, q14                ; s = clamp(qs0 - u)
-    vqadd.s8    q14, q6, q14                ; s = clamp(ps0 + u)
-
-    veor        q9, q11, q1                 ; *oq2 = s^0x80
-    veor        q4, q0, q1                  ; *op2 = s^0x80
-    veor        q8, q13, q1                 ; *oq1 = s^0x80
-    veor        q5, q12, q1                 ; *op2 = s^0x80
-    veor        q7, q15, q1                 ; *oq0 = s^0x80
-    veor        q6, q14, q1                 ; *op0 = s^0x80
-
-    bx          lr
-    ENDP        ; |vp8_mbloop_filter_neon|
-
-;-----------------
-
-    END
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.c b/vp8/common/arm/neon/mbloopfilter_neon.c
new file mode 100644
index 0000000..5351f4b
--- /dev/null
+++ b/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -0,0 +1,625 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_mbloop_filter_neon(
+        uint8x16_t qblimit,  // mblimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p2
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q4r,     // p1
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r,     // q1
+        uint8x16_t *q9r) {   // q1
+    uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
+    uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
+    int8x16_t q0s8, q12s8, q14s8, q15s8;
+    int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
+
+    q11u8 = vabdq_u8(q3, q4);
+    q12u8 = vabdq_u8(q4, q5);
+    q13u8 = vabdq_u8(q5, q6);
+    q14u8 = vabdq_u8(q8, q7);
+    q1u8  = vabdq_u8(q9, q8);
+    q0u8  = vabdq_u8(q10, q9);
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q1u8  = vmaxq_u8(q1u8, q0u8);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q12u8 = vabdq_u8(q6, q7);
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);
+    q14u8 = vcgtq_u8(q14u8, qthresh);
+    q15u8 = vmaxq_u8(q15u8, q1u8);
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);
+
+    q1u8 = vabdq_u8(q5, q8);
+    q12u8 = vqaddq_u8(q12u8, q12u8);
+
+    // vp8_filter() function
+    // convert to signed
+    q0u8 = vdupq_n_u8(0x80);
+    q9 = veorq_u8(q9, q0u8);
+    q8 = veorq_u8(q8, q0u8);
+    q7 = veorq_u8(q7, q0u8);
+    q6 = veorq_u8(q6, q0u8);
+    q5 = veorq_u8(q5, q0u8);
+    q4 = veorq_u8(q4, q0u8);
+
+    q1u8 = vshrq_n_u8(q1u8, 1);
+    q12u8 = vqaddq_u8(q12u8, q1u8);
+
+    q14u8 = vorrq_u8(q13u8, q14u8);
+    q12u8 = vcgeq_u8(qblimit, q12u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                     vreinterpretq_s8_u8(q8));
+
+    q11s16 = vdupq_n_s16(3);
+    q2s16  = vmulq_s16(q2s16, q11s16);
+    q13s16 = vmulq_s16(q13s16, q11s16);
+
+    q15u8 = vandq_u8(q15u8, q12u8);
+
+    q2s16  = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
+
+    q12u8 = vdupq_n_u8(3);
+    q11u8 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2 = vqmovn_s16(q2s16);
+    d3 = vqmovn_s16(q13s16);
+    q1s8 = vcombine_s8(d2, d3);
+    q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
+    q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
+    q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q13s8 = vshrq_n_s8(q13s8, 3);
+
+    q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
+    q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
+
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
+    d5 = vdup_n_s8(9);
+    d4 = vdup_n_s8(18);
+
+    q0s16  = vmlal_s8(vreinterpretq_s16_u16(q0u16),  vget_low_s8(q1s8),  d5);
+    q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
+    d5 = vdup_n_s8(27);
+    q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8),  d4);
+    q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
+    q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8),  d5);
+    q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
+
+    d0  = vqshrn_n_s16(q0s16 , 7);
+    d1  = vqshrn_n_s16(q11s16, 7);
+    d24 = vqshrn_n_s16(q12s16, 7);
+    d25 = vqshrn_n_s16(q13s16, 7);
+    d28 = vqshrn_n_s16(q14s16, 7);
+    d29 = vqshrn_n_s16(q15s16, 7);
+
+    q0s8  = vcombine_s8(d0, d1);
+    q12s8 = vcombine_s8(d24, d25);
+    q14s8 = vcombine_s8(d28, d29);
+
+    q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
+    q0s8  = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
+    q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
+    q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
+    q15s8 = vqsubq_s8((q7s8), q14s8);
+    q14s8 = vqaddq_s8((q6s8), q14s8);
+
+    q1u8 = vdupq_n_u8(0x80);
+    *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
+    *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
+    return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    src -= (pitch << 2);
+
+    q3 = vld1q_u8(src);
+    src += pitch;
+    q4 = vld1q_u8(src);
+    src += pitch;
+    q5 = vld1q_u8(src);
+    src += pitch;
+    q6 = vld1q_u8(src);
+    src += pitch;
+    q7 = vld1q_u8(src);
+    src += pitch;
+    q8 = vld1q_u8(src);
+    src += pitch;
+    q9 = vld1q_u8(src);
+    src += pitch;
+    q10 = vld1q_u8(src);
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    src -= (pitch * 6);
+    vst1q_u8(src, q4);
+    src += pitch;
+    vst1q_u8(src, q5);
+    src += pitch;
+    vst1q_u8(src, q6);
+    src += pitch;
+    vst1q_u8(src, q7);
+    src += pitch;
+    vst1q_u8(src, q8);
+    src += pitch;
+    vst1q_u8(src, q9);
+    return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    u -= (pitch << 2);
+    v -= (pitch << 2);
+
+    d6 = vld1_u8(u);
+    u += pitch;
+    d7 = vld1_u8(v);
+    v += pitch;
+    d8 = vld1_u8(u);
+    u += pitch;
+    d9 = vld1_u8(v);
+    v += pitch;
+    d10 = vld1_u8(u);
+    u += pitch;
+    d11 = vld1_u8(v);
+    v += pitch;
+    d12 = vld1_u8(u);
+    u += pitch;
+    d13 = vld1_u8(v);
+    v += pitch;
+    d14 = vld1_u8(u);
+    u += pitch;
+    d15 = vld1_u8(v);
+    v += pitch;
+    d16 = vld1_u8(u);
+    u += pitch;
+    d17 = vld1_u8(v);
+    v += pitch;
+    d18 = vld1_u8(u);
+    u += pitch;
+    d19 = vld1_u8(v);
+    v += pitch;
+    d20 = vld1_u8(u);
+    d21 = vld1_u8(v);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    u -= (pitch * 6);
+    v -= (pitch * 6);
+    vst1_u8(u, vget_low_u8(q4));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q4));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q5));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q5));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q6));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q6));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q7));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q7));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q8));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q8));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q9));
+    vst1_u8(v, vget_high_u8(q9));
+    return;
+}
+
+void vp8_mbloop_filter_vertical_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    unsigned char *s1, *s2;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    s1 = src - 4;
+    s2 = s1 + 8 * pitch;
+    d6  = vld1_u8(s1);
+    s1 += pitch;
+    d7  = vld1_u8(s2);
+    s2 += pitch;
+    d8  = vld1_u8(s1);
+    s1 += pitch;
+    d9  = vld1_u8(s2);
+    s2 += pitch;
+    d10 = vld1_u8(s1);
+    s1 += pitch;
+    d11 = vld1_u8(s2);
+    s2 += pitch;
+    d12 = vld1_u8(s1);
+    s1 += pitch;
+    d13 = vld1_u8(s2);
+    s2 += pitch;
+    d14 = vld1_u8(s1);
+    s1 += pitch;
+    d15 = vld1_u8(s2);
+    s2 += pitch;
+    d16 = vld1_u8(s1);
+    s1 += pitch;
+    d17 = vld1_u8(s2);
+    s2 += pitch;
+    d18 = vld1_u8(s1);
+    s1 += pitch;
+    d19 = vld1_u8(s2);
+    s2 += pitch;
+    d20 = vld1_u8(s1);
+    d21 = vld1_u8(s2);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    s1 -= 7 * pitch;
+    s2 -= 7 * pitch;
+
+    vst1_u8(s1, vget_low_u8(q3));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q3));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q4));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q4));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q5));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q5));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q6));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q6));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q7));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q7));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q8));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q8));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q9));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q9));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q10));
+    vst1_u8(s2, vget_high_u8(q10));
+    return;
+}
+
+void vp8_mbloop_filter_vertical_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    unsigned char *us, *ud;
+    unsigned char *vs, *vd;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    us = u - 4;
+    vs = v - 4;
+    d6 = vld1_u8(us);
+    us += pitch;
+    d7 = vld1_u8(vs);
+    vs += pitch;
+    d8 = vld1_u8(us);
+    us += pitch;
+    d9 = vld1_u8(vs);
+    vs += pitch;
+    d10 = vld1_u8(us);
+    us += pitch;
+    d11 = vld1_u8(vs);
+    vs += pitch;
+    d12 = vld1_u8(us);
+    us += pitch;
+    d13 = vld1_u8(vs);
+    vs += pitch;
+    d14 = vld1_u8(us);
+    us += pitch;
+    d15 = vld1_u8(vs);
+    vs += pitch;
+    d16 = vld1_u8(us);
+    us += pitch;
+    d17 = vld1_u8(vs);
+    vs += pitch;
+    d18 = vld1_u8(us);
+    us += pitch;
+    d19 = vld1_u8(vs);
+    vs += pitch;
+    d20 = vld1_u8(us);
+    d21 = vld1_u8(vs);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    ud = u - 4;
+    vst1_u8(ud, vget_low_u8(q3));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q4));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q5));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q6));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q7));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q8));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q9));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q10));
+
+    vd = v - 4;
+    vst1_u8(vd, vget_high_u8(q3));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q4));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q5));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q6));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q7));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q8));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q9));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q10));
+    return;
+}
diff --git a/vp8/common/arm/neon/sad16_neon.asm b/vp8/common/arm/neon/sad16_neon.asm
deleted file mode 100644
index d7c590e..0000000
--- a/vp8/common/arm/neon/sad16_neon.asm
+++ /dev/null
@@ -1,207 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sad16x16_neon|
-    EXPORT  |vp8_sad16x8_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int  src_stride
-; r2    unsigned char *ref_ptr
-; r3    int  ref_stride
-|vp8_sad16x16_neon| PROC
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-    vabdl.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0]
-    vld1.8          {q7}, [r2]
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vadd.u16        q0, q12, q13
-
-    vpaddl.u16      q1, q0
-    vpaddl.u32      q0, q1
-
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-;==============================
-;unsigned int vp8_sad16x8_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-|vp8_sad16x8_neon| PROC
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-    vabdl.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vadd.u16        q0, q12, q13
-
-    vpaddl.u16      q1, q0
-    vpaddl.u32      q0, q1
-
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/sad8_neon.asm b/vp8/common/arm/neon/sad8_neon.asm
deleted file mode 100644
index 23ba6df..0000000
--- a/vp8/common/arm/neon/sad8_neon.asm
+++ /dev/null
@@ -1,209 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sad8x8_neon|
-    EXPORT  |vp8_sad8x16_neon|
-    EXPORT  |vp8_sad4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; unsigned int vp8_sad8x8_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-
-|vp8_sad8x8_neon| PROC
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q12, d6, d14
-
-    vpaddl.u16      q1, q12
-    vpaddl.u32      q0, q1
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-;============================
-;unsigned int vp8_sad8x16_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-
-|vp8_sad8x16_neon| PROC
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q12, d6, d14
-
-    vpaddl.u16      q1, q12
-    vpaddl.u32      q0, q1
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-;===========================
-;unsigned int vp8_sad4x4_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-
-|vp8_sad4x4_neon| PROC
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q12, d6, d14
-
-    vpaddl.u16      d1, d24
-    vpaddl.u32      d0, d1
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/sad_neon.c b/vp8/common/arm/neon/sad_neon.c
new file mode 100644
index 0000000..6595ac0
--- /dev/null
+++ b/vp8/common/arm/neon/sad_neon.c
@@ -0,0 +1,184 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+unsigned int vp8_sad8x8_neon(
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x8_t d0, d8;
+    uint16x8_t q12;
+    uint32x4_t q1;
+    uint64x2_t q3;
+    uint32x2_t d5;
+    int i;
+
+    d0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(d0, d8);
+
+    for (i = 0; i < 7; i++) {
+        d0 = vld1_u8(src_ptr);
+        src_ptr += src_stride;
+        d8 = vld1_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, d0, d8);
+    }
+
+    q1 = vpaddlq_u16(q12);
+    q3 = vpaddlq_u32(q1);
+    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+                  vreinterpret_u32_u64(vget_high_u64(q3)));
+
+    return vget_lane_u32(d5, 0);
+}
+
+unsigned int vp8_sad8x16_neon(
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x8_t d0, d8;
+    uint16x8_t q12;
+    uint32x4_t q1;
+    uint64x2_t q3;
+    uint32x2_t d5;
+    int i;
+
+    d0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(d0, d8);
+
+    for (i = 0; i < 15; i++) {
+        d0 = vld1_u8(src_ptr);
+        src_ptr += src_stride;
+        d8 = vld1_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, d0, d8);
+    }
+
+    q1 = vpaddlq_u16(q12);
+    q3 = vpaddlq_u32(q1);
+    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+                  vreinterpret_u32_u64(vget_high_u64(q3)));
+
+    return vget_lane_u32(d5, 0);
+}
+
+unsigned int vp8_sad4x4_neon(
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x8_t d0, d8;
+    uint16x8_t q12;
+    uint32x2_t d1;
+    uint64x1_t d3;
+    int i;
+
+    d0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(d0, d8);
+
+    for (i = 0; i < 3; i++) {
+        d0 = vld1_u8(src_ptr);
+        src_ptr += src_stride;
+        d8 = vld1_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, d0, d8);
+    }
+
+    d1 = vpaddl_u16(vget_low_u16(q12));
+    d3 = vpaddl_u32(d1);
+
+    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
+unsigned int vp8_sad16x16_neon(
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x16_t q0, q4;
+    uint16x8_t q12, q13;
+    uint32x4_t q1;
+    uint64x2_t q3;
+    uint32x2_t d5;
+    int i;
+
+    q0 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    q4 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+    for (i = 0; i < 15; i++) {
+        q0 = vld1q_u8(src_ptr);
+        src_ptr += src_stride;
+        q4 = vld1q_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+    }
+
+    q12 = vaddq_u16(q12, q13);
+    q1 = vpaddlq_u16(q12);
+    q3 = vpaddlq_u32(q1);
+    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+                  vreinterpret_u32_u64(vget_high_u64(q3)));
+
+    return vget_lane_u32(d5, 0);
+}
+
+unsigned int vp8_sad16x8_neon(
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x16_t q0, q4;
+    uint16x8_t q12, q13;
+    uint32x4_t q1;
+    uint64x2_t q3;
+    uint32x2_t d5;
+    int i;
+
+    q0 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    q4 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+    for (i = 0; i < 7; i++) {
+        q0 = vld1q_u8(src_ptr);
+        src_ptr += src_stride;
+        q4 = vld1q_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+    }
+
+    q12 = vaddq_u16(q12, q13);
+    q1 = vpaddlq_u16(q12);
+    q3 = vpaddlq_u32(q1);
+    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+                  vreinterpret_u32_u64(vget_high_u64(q3)));
+
+    return vget_lane_u32(d5, 0);
+}
diff --git a/vp8/common/arm/neon/save_reg_neon.asm b/vp8/common/arm/neon/save_reg_neon.asm
deleted file mode 100644
index fd7002e..0000000
--- a/vp8/common/arm/neon/save_reg_neon.asm
+++ /dev/null
@@ -1,36 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_push_neon|
-    EXPORT  |vp8_pop_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|vp8_push_neon| PROC
-    vst1.i64            {d8, d9, d10, d11}, [r0]!
-    vst1.i64            {d12, d13, d14, d15}, [r0]!
-    bx              lr
-
-    ENDP
-
-|vp8_pop_neon| PROC
-    vld1.i64            {d8, d9, d10, d11}, [r0]!
-    vld1.i64            {d12, d13, d14, d15}, [r0]!
-    bx              lr
-
-    ENDP
-
-    END
-
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
deleted file mode 100644
index 67d2ab0..0000000
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ /dev/null
@@ -1,139 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_idct4x4llm_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;*************************************************************
-;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
-;                            unsigned char *dst, int stride)
-;r0 short * input
-;r1 short * pred
-;r2 int pitch
-;r3 unsigned char dst
-;sp int stride
-;*************************************************************
-
-; static const int cospi8sqrt2minus1=20091;
-; static const int sinpi8sqrt2      =35468;
-; static const int rounding = 0;
-
-; Optimization note: The resulted data from dequantization are signed
-; 13-bit data that is in the range of [-4096, 4095]. This allows to
-; use "vqdmulh"(neon) instruction since it won't go out of range
-; (13+16+1=30bits<32bits). This instruction gives the high half
-; result of the multiplication that is needed in IDCT.
-
-|vp8_short_idct4x4llm_neon| PROC
-    adr             r12, idct_coeff
-    vld1.16         {q1, q2}, [r0]
-    vld1.16         {d0}, [r12]
-
-    vswp            d3, d4                  ;q2(vp[4] vp[12])
-    ldr             r0, [sp]                ; stride
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
-    vqadd.s16       q4, q4, q2
-
-    ;d6 - c1:temp1
-    ;d7 - d1:temp2
-    ;d8 - d1:temp1
-    ;d9 - c1:temp2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vswp            d3, d4
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vrshr.s16       d2, d2, #3
-    vrshr.s16       d3, d3, #3
-    vrshr.s16       d4, d4, #3
-    vrshr.s16       d5, d5, #3
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    ; load prediction data
-    vld1.32         d6[0], [r1], r2
-    vld1.32         d6[1], [r1], r2
-    vld1.32         d7[0], [r1], r2
-    vld1.32         d7[1], [r1], r2
-
-    ; add prediction and residual
-    vaddw.u8        q1, q1, d6
-    vaddw.u8        q2, q2, d7
-
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-
-    ; store to destination
-    vst1.32         d1[0], [r3], r0
-    vst1.32         d1[1], [r3], r0
-    vst1.32         d2[0], [r3], r0
-    vst1.32         d2[1], [r3], r0
-
-    bx              lr
-
-    ENDP
-
-;-----------------
-
-idct_coeff
-    DCD     0x4e7b4e7b, 0x8a8c8a8c
-
-;20091, 20091, 35468, 35468
-
-    END
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.c b/vp8/common/arm/neon/shortidct4x4llm_neon.c
new file mode 100644
index 0000000..373afa6
--- /dev/null
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -0,0 +1,123 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2       = 35468;
+
+void vp8_short_idct4x4llm_neon(
+        int16_t *input,
+        unsigned char *pred_ptr,
+        int pred_stride,
+        unsigned char *dst_ptr,
+        int dst_stride) {
+    int i;
+    uint32x2_t d6u32 = vdup_n_u32(0);
+    uint8x8_t d1u8;
+    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+    uint16x8_t q1u16;
+    int16x8_t q1s16, q2s16, q3s16, q4s16;
+    int32x2x2_t v2tmp0, v2tmp1;
+    int16x4x2_t v2tmp2, v2tmp3;
+
+    d2 = vld1_s16(input);
+    d3 = vld1_s16(input + 4);
+    d4 = vld1_s16(input + 8);
+    d5 = vld1_s16(input + 12);
+
+    // 1st for loop
+    q1s16 = vcombine_s16(d2, d4);  // Swap d3 d4 here
+    q2s16 = vcombine_s16(d3, d5);
+
+    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
+    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
+
+    q3s16 = vshrq_n_s16(q3s16, 1);
+    q4s16 = vshrq_n_s16(q4s16, 1);
+
+    q3s16 = vqaddq_s16(q3s16, q2s16);
+    q4s16 = vqaddq_s16(q4s16, q2s16);
+
+    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
+    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+                      vreinterpret_s16_s32(v2tmp1.val[0]));
+    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+                      vreinterpret_s16_s32(v2tmp1.val[1]));
+
+    // 2nd for loop
+    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
+    q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
+
+    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
+    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
+
+    q3s16 = vshrq_n_s16(q3s16, 1);
+    q4s16 = vshrq_n_s16(q4s16, 1);
+
+    q3s16 = vqaddq_s16(q3s16, q2s16);
+    q4s16 = vqaddq_s16(q4s16, q2s16);
+
+    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
+    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    d2 = vrshr_n_s16(d2, 3);
+    d3 = vrshr_n_s16(d3, 3);
+    d4 = vrshr_n_s16(d4, 3);
+    d5 = vrshr_n_s16(d5, 3);
+
+    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+                      vreinterpret_s16_s32(v2tmp1.val[0]));
+    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+                      vreinterpret_s16_s32(v2tmp1.val[1]));
+
+    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
+    q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
+
+    // dc_only_idct_add
+    for (i = 0; i < 2; i++, q1s16 = q2s16) {
+        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
+        pred_ptr += pred_stride;
+        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
+        pred_ptr += pred_stride;
+
+        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
+                         vreinterpret_u8_u32(d6u32));
+        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
+        dst_ptr += dst_stride;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
+        dst_ptr += dst_stride;
+    }
+    return;
+}
diff --git a/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
deleted file mode 100644
index 9fdafd3..0000000
--- a/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ /dev/null
@@ -1,490 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict16x16_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter16_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0
-
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-;Note: To take advantage of 8-bit mulplication instruction in NEON. First apply abs() to
-; filter coeffs to make them u8. Then, use vmlsl for negtive coeffs. After multiplication,
-; the result can be negtive. So, I treat the result as s16. But, since it is also possible
-; that the result can be a large positive number (> 2^15-1), which could be confused as a
-; negtive number. To avoid that error, apply filter coeffs in the order of 0, 1, 4 ,5 ,2,
-; which ensures that the result stays in s16 range. Finally, saturated add the result by
-; applying 3rd filter coeff. Same applys to other filter functions.
-
-|vp8_sixtap_predict16x16_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, filter16_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter16x16_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter16x16_only
-
-    sub             sp, sp, #336            ;reserve space on stack for temporary storage
-    mov             lr, sp
-
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #7                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    sub             r0, r0, r1, lsl #1
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First Pass: output_height lines x output_width columns (21x16)
-filt_blk2d_fp16x16_loop_neon
-    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
-    vld1.u8         {d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q9, d7, d0
-    vmull.u8        q10, d9, d0
-    vmull.u8        q11, d10, d0
-    vmull.u8        q12, d12, d0
-    vmull.u8        q13, d13, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d9, d10, #1
-    vext.8          d30, d12, d13, #1
-
-    vmlsl.u8        q8, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q10, d29, d1
-    vmlsl.u8        q12, d30, d1
-
-    vext.8          d28, d7, d8, #1
-    vext.8          d29, d10, d11, #1
-    vext.8          d30, d13, d14, #1
-
-    vmlsl.u8        q9, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q11, d29, d1
-    vmlsl.u8        q13, d30, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d9, d10, #4
-    vext.8          d30, d12, d13, #4
-
-    vmlsl.u8        q8, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q10, d29, d4
-    vmlsl.u8        q12, d30, d4
-
-    vext.8          d28, d7, d8, #4
-    vext.8          d29, d10, d11, #4
-    vext.8          d30, d13, d14, #4
-
-    vmlsl.u8        q9, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q11, d29, d4
-    vmlsl.u8        q13, d30, d4
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d9, d10, #5
-    vext.8          d30, d12, d13, #5
-
-    vmlal.u8        q8, d28, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q10, d29, d5
-    vmlal.u8        q12, d30, d5
-
-    vext.8          d28, d7, d8, #5
-    vext.8          d29, d10, d11, #5
-    vext.8          d30, d13, d14, #5
-
-    vmlal.u8        q9, d28, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q11, d29, d5
-    vmlal.u8        q13, d30, d5
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d9, d10, #2
-    vext.8          d30, d12, d13, #2
-
-    vmlal.u8        q8, d28, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q10, d29, d2
-    vmlal.u8        q12, d30, d2
-
-    vext.8          d28, d7, d8, #2
-    vext.8          d29, d10, d11, #2
-    vext.8          d30, d13, d14, #2
-
-    vmlal.u8        q9, d28, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q11, d29, d2
-    vmlal.u8        q13, d30, d2
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d9, d10, #3
-    vext.8          d30, d12, d13, #3
-
-    vext.8          d15, d7, d8, #3
-    vext.8          d31, d10, d11, #3
-    vext.8          d6, d13, d14, #3
-
-    vmull.u8        q4, d28, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q5, d29, d3
-    vmull.u8        q6, d30, d3
-
-    vqadd.s16       q8, q4                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q10, q5
-    vqadd.s16       q12, q6
-
-    vmull.u8        q6, d15, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q7, d31, d3
-    vmull.u8        q3, d6, d3
-
-    subs            r2, r2, #1
-
-    vqadd.s16       q9, q6
-    vqadd.s16       q11, q7
-    vqadd.s16       q13, q3
-
-    vqrshrun.s16    d6, q8, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q9, #7
-    vqrshrun.s16    d8, q10, #7
-    vqrshrun.s16    d9, q11, #7
-    vqrshrun.s16    d10, q12, #7
-    vqrshrun.s16    d11, q13, #7
-
-    vst1.u8         {d6, d7, d8}, [lr]!     ;store result
-    vst1.u8         {d9, d10, d11}, [lr]!
-
-    bne             filt_blk2d_fp16x16_loop_neon
-
-;Second pass: 16x16
-;secondpass_filter - do first 8-columns and then second 8-columns
-    add             r3, r12, r3, lsl #5
-    sub             lr, lr, #336
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    mov             r3, #2                  ;loop counter
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    mov             r2, #16
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-filt_blk2d_sp16x16_outloop_neon
-    vld1.u8         {d18}, [lr], r2         ;load src data
-    vld1.u8         {d19}, [lr], r2
-    vld1.u8         {d20}, [lr], r2
-    vld1.u8         {d21}, [lr], r2
-    mov             r12, #4                 ;loop counter
-    vld1.u8         {d22}, [lr], r2
-
-secondpass_inner_loop_neon
-    vld1.u8         {d23}, [lr], r2         ;load src data
-    vld1.u8         {d24}, [lr], r2
-    vld1.u8         {d25}, [lr], r2
-    vld1.u8         {d26}, [lr], r2
-
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r12, r12, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q9, q11
-    vst1.u8         {d7}, [r4], r5
-    vmov            q10, q12
-    vst1.u8         {d8}, [r4], r5
-    vmov            d22, d26
-    vst1.u8         {d9}, [r4], r5
-
-    bne             secondpass_inner_loop_neon
-
-    subs            r3, r3, #1
-    sub             lr, lr, #336
-    add             lr, lr, #8
-
-    sub             r4, r4, r5, lsl #4
-    add             r4, r4, #8
-
-    bne filt_blk2d_sp16x16_outloop_neon
-
-    add             sp, sp, #336
-    pop             {r4-r5,pc}
-
-;--------------------
-firstpass_filter16x16_only
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #8                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (column-2)
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First Pass: output_height lines x output_width columns (16x16)
-filt_blk2d_fpo16x16_loop_neon
-    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
-    vld1.u8         {d9, d10, d11}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-
-    vmull.u8        q6, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q7, d7, d0
-    vmull.u8        q8, d9, d0
-    vmull.u8        q9, d10, d0
-
-    vext.8          d20, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d21, d9, d10, #1
-    vext.8          d22, d7, d8, #1
-    vext.8          d23, d10, d11, #1
-    vext.8          d24, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d25, d9, d10, #4
-    vext.8          d26, d7, d8, #4
-    vext.8          d27, d10, d11, #4
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d9, d10, #5
-
-    vmlsl.u8        q6, d20, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q8, d21, d1
-    vmlsl.u8        q7, d22, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q9, d23, d1
-    vmlsl.u8        q6, d24, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q8, d25, d4
-    vmlsl.u8        q7, d26, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q9, d27, d4
-    vmlal.u8        q6, d28, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q8, d29, d5
-
-    vext.8          d20, d7, d8, #5
-    vext.8          d21, d10, d11, #5
-    vext.8          d22, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d23, d9, d10, #2
-    vext.8          d24, d7, d8, #2
-    vext.8          d25, d10, d11, #2
-
-    vext.8          d26, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d27, d9, d10, #3
-    vext.8          d28, d7, d8, #3
-    vext.8          d29, d10, d11, #3
-
-    vmlal.u8        q7, d20, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q9, d21, d5
-    vmlal.u8        q6, d22, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q8, d23, d2
-    vmlal.u8        q7, d24, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q9, d25, d2
-
-    vmull.u8        q10, d26, d3            ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q11, d27, d3
-    vmull.u8        q12, d28, d3            ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q15, d29, d3
-
-    vqadd.s16       q6, q10                 ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q11
-    vqadd.s16       q7, q12
-    vqadd.s16       q9, q15
-
-    subs            r2, r2, #1
-
-    vqrshrun.s16    d6, q6, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q7, #7
-    vqrshrun.s16    d8, q8, #7
-    vqrshrun.s16    d9, q9, #7
-
-    vst1.u8         {q3}, [r4], r5              ;store result
-    vst1.u8         {q4}, [r4], r5
-
-    bne             filt_blk2d_fpo16x16_loop_neon
-
-    pop             {r4-r5,pc}
-
-;--------------------
-secondpass_filter16x16_only
-;Second pass: 16x16
-    add             r3, r12, r3, lsl #5
-    sub             r0, r0, r1, lsl #1
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    mov             r3, #2                  ;loop counter
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-filt_blk2d_spo16x16_outloop_neon
-    vld1.u8         {d18}, [r0], r1         ;load src data
-    vld1.u8         {d19}, [r0], r1
-    vld1.u8         {d20}, [r0], r1
-    vld1.u8         {d21}, [r0], r1
-    mov             r12, #4                 ;loop counter
-    vld1.u8         {d22}, [r0], r1
-
-secondpass_only_inner_loop_neon
-    vld1.u8         {d23}, [r0], r1         ;load src data
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r12, r12, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q9, q11
-    vst1.u8         {d7}, [r4], r5
-    vmov            q10, q12
-    vst1.u8         {d8}, [r4], r5
-    vmov            d22, d26
-    vst1.u8         {d9}, [r4], r5
-
-    bne             secondpass_only_inner_loop_neon
-
-    subs            r3, r3, #1
-    sub             r0, r0, r1, lsl #4
-    sub             r0, r0, r1, lsl #2
-    sub             r0, r0, r1
-    add             r0, r0, #8
-
-    sub             r4, r4, r5, lsl #4
-    add             r4, r4, #8
-
-    bne filt_blk2d_spo16x16_outloop_neon
-
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-    END
diff --git a/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
deleted file mode 100644
index a4222bc..0000000
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ /dev/null
@@ -1,422 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict4x4_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter4_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0
-
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(lr) int  dst_pitch
-
-|vp8_sixtap_predict4x4_neon| PROC
-    push            {r4, lr}
-
-    adr             r12, filter4_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter4x4_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter4x4_only
-
-    vabs.s32        q12, q14                ;get abs(filer_parameters)
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;go back 2 columns of src data
-    sub             r0, r0, r1, lsl #1      ;go back 2 lines of src data
-
-;First pass: output_height lines x output_width columns (9x4)
-    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d1, d24[4]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d2, d25[0]
-    vld1.u8         {q6}, [r0], r1
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d19, d8, d9, #5
-    vext.8          d20, d10, d11, #5
-    vext.8          d21, d12, d13, #5
-
-    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
-    vswp            d11, d12
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
-    vzip.32         d20, d21
-    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmull.u8        q8, d20, d5
-
-    vmov            q4, q3                  ;keep original src data in q4 q6
-    vmov            q6, q5
-
-    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
-    vshr.u64        q10, q6, #8
-    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp8_filter[0])
-    vmlal.u8        q8, d10, d0
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
-    vshr.u64        q5, q6, #32
-    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q8, d20, d1
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
-    vshr.u64        q10, q6, #16
-    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q8, d10, d4
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
-    vshr.u64        q5, q6, #24
-    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q8, d20, d2
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q10, d10, d3
-
-    vld1.u8         {q3}, [r0], r1          ;load rest 5-line src data
-    vld1.u8         {q4}, [r0], r1
-
-    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q10
-
-    vld1.u8         {q5}, [r0], r1
-    vld1.u8         {q6}, [r0], r1
-
-    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d28, q8, #7
-
-    ;First Pass on rest 5-line data
-    vld1.u8         {q11}, [r0], r1
-
-    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d19, d8, d9, #5
-    vext.8          d20, d10, d11, #5
-    vext.8          d21, d12, d13, #5
-
-    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
-    vswp            d11, d12
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
-    vzip.32         d20, d21
-    vext.8          d31, d22, d23, #5       ;construct src_ptr[3]
-    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmull.u8        q8, d20, d5
-    vmull.u8        q12, d31, d5            ;(src_ptr[3] * vp8_filter[5])
-
-    vmov            q4, q3                  ;keep original src data in q4 q6
-    vmov            q6, q5
-
-    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
-    vshr.u64        q10, q6, #8
-
-    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp8_filter[0])
-    vmlal.u8        q8, d10, d0
-    vmlal.u8        q12, d22, d0            ;(src_ptr[-2] * vp8_filter[0])
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
-    vshr.u64        q5, q6, #32
-    vext.8          d31, d22, d23, #1       ;construct src_ptr[-1]
-
-    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q8, d20, d1
-    vmlsl.u8        q12, d31, d1            ;-(src_ptr[-1] * vp8_filter[1])
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
-    vshr.u64        q10, q6, #16
-    vext.8          d31, d22, d23, #4       ;construct src_ptr[2]
-
-    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q8, d10, d4
-    vmlsl.u8        q12, d31, d4            ;-(src_ptr[2] * vp8_filter[4])
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
-    vshr.u64        q5, q6, #24
-    vext.8          d31, d22, d23, #2       ;construct src_ptr[0]
-
-    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q8, d20, d2
-    vmlal.u8        q12, d31, d2            ;(src_ptr[0] * vp8_filter[2])
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-    vext.8          d31, d22, d23, #3       ;construct src_ptr[1]
-    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q10, d10, d3
-    vmull.u8        q11, d31, d3            ;(src_ptr[1] * vp8_filter[3])
-
-    add             r3, r12, r3, lsl #5
-
-    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q10
-    vqadd.s16       q12, q11
-
-    vext.8          d23, d27, d28, #4
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-
-    vqrshrun.s16    d29, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d30, q8, #7
-    vqrshrun.s16    d31, q12, #7
-
-;Second pass: 4x4
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vext.8          d24, d28, d29, #4
-    vext.8          d25, d29, d30, #4
-    vext.8          d26, d30, d31, #4
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q4, d28, d0
-
-    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmull.u8        q6, d26, d5
-
-    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q4, d30, d4
-
-    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q6, d24, d1
-
-    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q4, d29, d2
-
-    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmlal.u8        q6, d25, d3
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q6, q4
-
-    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d4, q6, #7
-
-    vst1.32         {d3[0]}, [r4]           ;store result
-    vst1.32         {d3[1]}, [r0]
-    vst1.32         {d4[0]}, [r1]
-    vst1.32         {d4[1]}, [r2]
-
-    pop             {r4, pc}
-
-
-;---------------------
-firstpass_filter4x4_only
-    vabs.s32        q12, q14                ;get abs(filer_parameters)
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;go back 2 columns of src data
-
-;First pass: output_height lines x output_width columns (4x4)
-    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d1, d24[4]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d2, d25[0]
-    vld1.u8         {q6}, [r0], r1
-
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d19, d8, d9, #5
-    vext.8          d20, d10, d11, #5
-    vext.8          d21, d12, d13, #5
-
-    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
-    vswp            d11, d12
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
-    vzip.32         d20, d21
-    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmull.u8        q8, d20, d5
-
-    vmov            q4, q3                  ;keep original src data in q4 q6
-    vmov            q6, q5
-
-    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
-    vshr.u64        q10, q6, #8
-    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp8_filter[0])
-    vmlal.u8        q8, d10, d0
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
-    vshr.u64        q5, q6, #32
-    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q8, d20, d1
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
-    vshr.u64        q10, q6, #16
-    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q8, d10, d4
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
-    vshr.u64        q5, q6, #24
-    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q8, d20, d2
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q10, d10, d3
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q10
-
-    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d28, q8, #7
-
-    vst1.32         {d27[0]}, [r4]          ;store result
-    vst1.32         {d27[1]}, [r0]
-    vst1.32         {d28[0]}, [r1]
-    vst1.32         {d28[1]}, [r2]
-
-    pop             {r4, pc}
-
-
-;---------------------
-secondpass_filter4x4_only
-    sub             r0, r0, r1, lsl #1
-    add             r3, r12, r3, lsl #5
-
-    vld1.32         {d27[0]}, [r0], r1      ;load src data
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vld1.32         {d27[1]}, [r0], r1
-    vabs.s32        q7, q5
-    vld1.32         {d28[0]}, [r0], r1
-    vabs.s32        q8, q6
-    vld1.32         {d28[1]}, [r0], r1
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vld1.32         {d29[0]}, [r0], r1
-    vdup.8          d1, d14[4]
-    vld1.32         {d29[1]}, [r0], r1
-    vdup.8          d2, d15[0]
-    vld1.32         {d30[0]}, [r0], r1
-    vdup.8          d3, d15[4]
-    vld1.32         {d30[1]}, [r0], r1
-    vdup.8          d4, d16[0]
-    vld1.32         {d31[0]}, [r0], r1
-    vdup.8          d5, d16[4]
-
-    vext.8          d23, d27, d28, #4
-    vext.8          d24, d28, d29, #4
-    vext.8          d25, d29, d30, #4
-    vext.8          d26, d30, d31, #4
-
-    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q4, d28, d0
-
-    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmull.u8        q6, d26, d5
-
-    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q4, d30, d4
-
-    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q6, d24, d1
-
-    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q4, d29, d2
-
-    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmlal.u8        q6, d25, d3
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q6, q4
-
-    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d4, q6, #7
-
-    vst1.32         {d3[0]}, [r4]           ;store result
-    vst1.32         {d3[1]}, [r0]
-    vst1.32         {d4[0]}, [r1]
-    vst1.32         {d4[1]}, [r2]
-
-    pop             {r4, pc}
-
-    ENDP
-
-;-----------------
-
-    END
diff --git a/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
deleted file mode 100644
index a57ec01..0000000
--- a/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ /dev/null
@@ -1,473 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict8x4_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter8_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0
-
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-|vp8_sixtap_predict8x4_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, filter8_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter8x4_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter8x4_only
-
-    sub             sp, sp, #32             ;reserve space on stack for temporary storage
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    mov             lr, sp
-    sub             r0, r0, r1, lsl #1
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-
-;First pass: output_height lines x output_width columns (9x8)
-    vld1.u8         {q3}, [r0], r1          ;load src data
-    vdup.8          d3, d25[4]
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d4, d26[0]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d5, d26[4]
-    vld1.u8         {q6}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vld1.u8         {q3}, [r0], r1          ;load src data
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vld1.u8         {q4}, [r0], r1
-    vst1.u8         {d22}, [lr]!            ;store result
-    vld1.u8         {q5}, [r0], r1
-    vst1.u8         {d23}, [lr]!
-    vld1.u8         {q6}, [r0], r1
-    vst1.u8         {d24}, [lr]!
-    vld1.u8         {q7}, [r0], r1
-    vst1.u8         {d25}, [lr]!
-
-    ;first_pass filtering on the rest 5-line data
-    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-    vmull.u8        q11, d12, d0
-    vmull.u8        q12, d14, d0
-
-    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d28, d8, d9, #1
-    vext.8          d29, d10, d11, #1
-    vext.8          d30, d12, d13, #1
-    vext.8          d31, d14, d15, #1
-
-    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q9, d28, d1
-    vmlsl.u8        q10, d29, d1
-    vmlsl.u8        q11, d30, d1
-    vmlsl.u8        q12, d31, d1
-
-    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d28, d8, d9, #4
-    vext.8          d29, d10, d11, #4
-    vext.8          d30, d12, d13, #4
-    vext.8          d31, d14, d15, #4
-
-    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q9, d28, d4
-    vmlsl.u8        q10, d29, d4
-    vmlsl.u8        q11, d30, d4
-    vmlsl.u8        q12, d31, d4
-
-    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d28, d8, d9, #2
-    vext.8          d29, d10, d11, #2
-    vext.8          d30, d12, d13, #2
-    vext.8          d31, d14, d15, #2
-
-    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q9, d28, d2
-    vmlal.u8        q10, d29, d2
-    vmlal.u8        q11, d30, d2
-    vmlal.u8        q12, d31, d2
-
-    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d28, d8, d9, #5
-    vext.8          d29, d10, d11, #5
-    vext.8          d30, d12, d13, #5
-    vext.8          d31, d14, d15, #5
-
-    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q9, d28, d5
-    vmlal.u8        q10, d29, d5
-    vmlal.u8        q11, d30, d5
-    vmlal.u8        q12, d31, d5
-
-    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d28, d8, d9, #3
-    vext.8          d29, d10, d11, #3
-    vext.8          d30, d12, d13, #3
-    vext.8          d31, d14, d15, #3
-
-    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q4, d28, d3
-    vmull.u8        q5, d29, d3
-    vmull.u8        q6, d30, d3
-    vmull.u8        q7, d31, d3
-
-    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q9, q4
-    vqadd.s16       q10, q5
-    vqadd.s16       q11, q6
-    vqadd.s16       q12, q7
-
-    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d27, q9, #7
-    vqrshrun.s16    d28, q10, #7
-    vqrshrun.s16    d29, q11, #7                ;load intermediate data from stack
-    vqrshrun.s16    d30, q12, #7
-
-;Second pass: 8x4
-;secondpass_filter
-    add             r3, r12, r3, lsl #5
-    sub             lr, lr, #32
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vld1.u8         {q11}, [lr]!
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vld1.u8         {q12}, [lr]!
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-    vmull.u8        q3, d22, d0             ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q4, d23, d0
-    vmull.u8        q5, d24, d0
-    vmull.u8        q6, d25, d0
-
-    vmlsl.u8        q3, d23, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q4, d24, d1
-    vmlsl.u8        q5, d25, d1
-    vmlsl.u8        q6, d26, d1
-
-    vmlsl.u8        q3, d26, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q4, d27, d4
-    vmlsl.u8        q5, d28, d4
-    vmlsl.u8        q6, d29, d4
-
-    vmlal.u8        q3, d24, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q4, d25, d2
-    vmlal.u8        q5, d26, d2
-    vmlal.u8        q6, d27, d2
-
-    vmlal.u8        q3, d27, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q4, d28, d5
-    vmlal.u8        q5, d29, d5
-    vmlal.u8        q6, d30, d5
-
-    vmull.u8        q7, d25, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q8, d26, d3
-    vmull.u8        q9, d27, d3
-    vmull.u8        q10, d28, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vst1.u8         {d7}, [r4], r5
-    vst1.u8         {d8}, [r4], r5
-    vst1.u8         {d9}, [r4], r5
-
-    add             sp, sp, #32
-    pop             {r4-r5,pc}
-
-;--------------------
-firstpass_filter8x4_only
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    vld1.u8         {q3}, [r0], r1          ;load src data
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d1, d24[4]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d2, d25[0]
-    vld1.u8         {q6}, [r0], r1
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First pass: output_height lines x output_width columns (4x8)
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vst1.u8         {d22}, [r4], r5         ;store result
-    vst1.u8         {d23}, [r4], r5
-    vst1.u8         {d24}, [r4], r5
-    vst1.u8         {d25}, [r4], r5
-
-    pop             {r4-r5,pc}
-
-;---------------------
-secondpass_filter8x4_only
-;Second pass: 8x4
-    add             r3, r12, r3, lsl #5
-    sub             r0, r0, r1, lsl #1
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vld1.u8         {d22}, [r0], r1
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vld1.u8         {d25}, [r0], r1
-    vdup.8          d1, d14[4]
-    vld1.u8         {d26}, [r0], r1
-    vdup.8          d2, d15[0]
-    vld1.u8         {d27}, [r0], r1
-    vdup.8          d3, d15[4]
-    vld1.u8         {d28}, [r0], r1
-    vdup.8          d4, d16[0]
-    vld1.u8         {d29}, [r0], r1
-    vdup.8          d5, d16[4]
-    vld1.u8         {d30}, [r0], r1
-
-    vmull.u8        q3, d22, d0             ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q4, d23, d0
-    vmull.u8        q5, d24, d0
-    vmull.u8        q6, d25, d0
-
-    vmlsl.u8        q3, d23, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q4, d24, d1
-    vmlsl.u8        q5, d25, d1
-    vmlsl.u8        q6, d26, d1
-
-    vmlsl.u8        q3, d26, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q4, d27, d4
-    vmlsl.u8        q5, d28, d4
-    vmlsl.u8        q6, d29, d4
-
-    vmlal.u8        q3, d24, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q4, d25, d2
-    vmlal.u8        q5, d26, d2
-    vmlal.u8        q6, d27, d2
-
-    vmlal.u8        q3, d27, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q4, d28, d5
-    vmlal.u8        q5, d29, d5
-    vmlal.u8        q6, d30, d5
-
-    vmull.u8        q7, d25, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q8, d26, d3
-    vmull.u8        q9, d27, d3
-    vmull.u8        q10, d28, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vst1.u8         {d7}, [r4], r5
-    vst1.u8         {d8}, [r4], r5
-    vst1.u8         {d9}, [r4], r5
-
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-
-    END
diff --git a/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
deleted file mode 100644
index 00ed5ae..0000000
--- a/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ /dev/null
@@ -1,524 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict8x8_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter8_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0
-
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-|vp8_sixtap_predict8x8_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, filter8_coeff
-
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter8x8_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter8x8_only
-
-    sub             sp, sp, #64             ;reserve space on stack for temporary storage
-    mov             lr, sp
-
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #2                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    sub             r0, r0, r1, lsl #1
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-
-;First pass: output_height lines x output_width columns (13x8)
-    vld1.u8         {q3}, [r0], r1          ;load src data
-    vdup.8          d3, d25[4]
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d4, d26[0]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d5, d26[4]
-    vld1.u8         {q6}, [r0], r1
-
-filt_blk2d_fp8x8_loop_neon
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    subs            r2, r2, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vld1.u8         {q3}, [r0], r1          ;load src data
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vst1.u8         {d22}, [lr]!            ;store result
-    vld1.u8         {q4}, [r0], r1
-    vst1.u8         {d23}, [lr]!
-    vld1.u8         {q5}, [r0], r1
-    vst1.u8         {d24}, [lr]!
-    vld1.u8         {q6}, [r0], r1
-    vst1.u8         {d25}, [lr]!
-
-    bne             filt_blk2d_fp8x8_loop_neon
-
-    ;first_pass filtering on the rest 5-line data
-    ;vld1.u8            {q3}, [r0], r1          ;load src data
-    ;vld1.u8            {q4}, [r0], r1
-    ;vld1.u8            {q5}, [r0], r1
-    ;vld1.u8            {q6}, [r0], r1
-    vld1.u8         {q7}, [r0], r1
-
-    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-    vmull.u8        q11, d12, d0
-    vmull.u8        q12, d14, d0
-
-    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d28, d8, d9, #1
-    vext.8          d29, d10, d11, #1
-    vext.8          d30, d12, d13, #1
-    vext.8          d31, d14, d15, #1
-
-    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q9, d28, d1
-    vmlsl.u8        q10, d29, d1
-    vmlsl.u8        q11, d30, d1
-    vmlsl.u8        q12, d31, d1
-
-    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d28, d8, d9, #4
-    vext.8          d29, d10, d11, #4
-    vext.8          d30, d12, d13, #4
-    vext.8          d31, d14, d15, #4
-
-    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q9, d28, d4
-    vmlsl.u8        q10, d29, d4
-    vmlsl.u8        q11, d30, d4
-    vmlsl.u8        q12, d31, d4
-
-    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d28, d8, d9, #2
-    vext.8          d29, d10, d11, #2
-    vext.8          d30, d12, d13, #2
-    vext.8          d31, d14, d15, #2
-
-    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q9, d28, d2
-    vmlal.u8        q10, d29, d2
-    vmlal.u8        q11, d30, d2
-    vmlal.u8        q12, d31, d2
-
-    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d28, d8, d9, #5
-    vext.8          d29, d10, d11, #5
-    vext.8          d30, d12, d13, #5
-    vext.8          d31, d14, d15, #5
-
-    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q9, d28, d5
-    vmlal.u8        q10, d29, d5
-    vmlal.u8        q11, d30, d5
-    vmlal.u8        q12, d31, d5
-
-    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d28, d8, d9, #3
-    vext.8          d29, d10, d11, #3
-    vext.8          d30, d12, d13, #3
-    vext.8          d31, d14, d15, #3
-
-    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q4, d28, d3
-    vmull.u8        q5, d29, d3
-    vmull.u8        q6, d30, d3
-    vmull.u8        q7, d31, d3
-
-    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q9, q4
-    vqadd.s16       q10, q5
-    vqadd.s16       q11, q6
-    vqadd.s16       q12, q7
-
-    add             r3, r12, r3, lsl #5
-
-    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
-    sub             lr, lr, #64
-    vqrshrun.s16    d27, q9, #7
-    vld1.u8         {q9}, [lr]!             ;load intermediate data from stack
-    vqrshrun.s16    d28, q10, #7
-    vld1.u8         {q10}, [lr]!
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-
-    vqrshrun.s16    d29, q11, #7
-    vld1.u8         {q11}, [lr]!
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vqrshrun.s16    d30, q12, #7
-    vld1.u8         {q12}, [lr]!
-
-;Second pass: 8x8
-    mov             r3, #2                  ;loop counter
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-filt_blk2d_sp8x8_loop_neon
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r3, r3, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vmov            q9, q11
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q10, q12
-    vst1.u8         {d7}, [r4], r5
-    vmov            q11, q13
-    vst1.u8         {d8}, [r4], r5
-    vmov            q12, q14
-    vst1.u8         {d9}, [r4], r5
-    vmov            d26, d30
-
-    bne filt_blk2d_sp8x8_loop_neon
-
-    add             sp, sp, #64
-    pop             {r4-r5,pc}
-
-;---------------------
-firstpass_filter8x8_only
-    ;add                r2, r12, r2, lsl #5     ;calculate filter location
-    ;vld1.s32       {q14, q15}, [r2]        ;load first_pass filter
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #2                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First pass: output_height lines x output_width columns (8x8)
-filt_blk2d_fpo8x8_loop_neon
-    vld1.u8         {q3}, [r0], r1          ;load src data
-    vld1.u8         {q4}, [r0], r1
-    vld1.u8         {q5}, [r0], r1
-    vld1.u8         {q6}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
- ;
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    subs            r2, r2, #1
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vst1.u8         {d22}, [r4], r5         ;store result
-    vst1.u8         {d23}, [r4], r5
-    vst1.u8         {d24}, [r4], r5
-    vst1.u8         {d25}, [r4], r5
-
-    bne             filt_blk2d_fpo8x8_loop_neon
-
-    pop             {r4-r5,pc}
-
-;---------------------
-secondpass_filter8x8_only
-    sub             r0, r0, r1, lsl #1
-    add             r3, r12, r3, lsl #5
-
-    vld1.u8         {d18}, [r0], r1         ;load src data
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vld1.u8         {d19}, [r0], r1
-    vabs.s32        q7, q5
-    vld1.u8         {d20}, [r0], r1
-    vabs.s32        q8, q6
-    vld1.u8         {d21}, [r0], r1
-    mov             r3, #2                  ;loop counter
-    vld1.u8         {d22}, [r0], r1
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vld1.u8         {d23}, [r0], r1
-    vdup.8          d1, d14[4]
-    vld1.u8         {d24}, [r0], r1
-    vdup.8          d2, d15[0]
-    vld1.u8         {d25}, [r0], r1
-    vdup.8          d3, d15[4]
-    vld1.u8         {d26}, [r0], r1
-    vdup.8          d4, d16[0]
-    vld1.u8         {d27}, [r0], r1
-    vdup.8          d5, d16[4]
-    vld1.u8         {d28}, [r0], r1
-    vld1.u8         {d29}, [r0], r1
-    vld1.u8         {d30}, [r0], r1
-
-;Second pass: 8x8
-filt_blk2d_spo8x8_loop_neon
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r3, r3, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vmov            q9, q11
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q10, q12
-    vst1.u8         {d7}, [r4], r5
-    vmov            q11, q13
-    vst1.u8         {d8}, [r4], r5
-    vmov            q12, q14
-    vst1.u8         {d9}, [r4], r5
-    vmov            d26, d30
-
-    bne filt_blk2d_spo8x8_loop_neon
-
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-
-    END
diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c
new file mode 100644
index 0000000..7a4d9e0
--- /dev/null
+++ b/vp8/common/arm/neon/sixtappredict_neon.c
@@ -0,0 +1,1752 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#ifdef _MSC_VER
+#define __builtin_prefetch(x)
+#endif
+
+static const int8_t vp8_sub_pel_filters[8][8] = {
+    {0,  0,  128,   0,   0, 0, 0, 0},  /* note that 1/8 pel positionyys are */
+    {0, -6,  123,  12,  -1, 0, 0, 0},  /*    just as per alpha -0.5 bicubic */
+    {2, -11, 108,  36,  -8, 1, 0, 0},  /* New 1/4 pel 6 tap filter */
+    {0, -9,   93,  50,  -6, 0, 0, 0},
+    {3, -16,  77,  77, -16, 3, 0, 0},  /* New 1/2 pel 6 tap filter */
+    {0, -6,   50,  93,  -9, 0, 0, 0},
+    {1, -8,   36, 108, -11, 2, 0, 0},  /* New 1/4 pel 6 tap filter */
+    {0, -1,   12, 123,  -6, 0, 0, 0},
+};
+
+void vp8_sixtap_predict4x4_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    unsigned char *src;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
+    uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint32x2_t d27u32, d28u32, d29u32, d30u32, d31u32;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
+    uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
+    uint32x2x2_t d0u32x2, d1u32x2;
+
+    if (xoffset == 0) {  // secondpass_filter4x4_only
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+        // load src data
+        src = src_ptr - src_pixels_per_line * 2;
+        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
+        src += src_pixels_per_line;
+        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
+        src += src_pixels_per_line;
+        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
+        src += src_pixels_per_line;
+        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
+        src += src_pixels_per_line;
+        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
+        src += src_pixels_per_line;
+        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
+        src += src_pixels_per_line;
+        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
+        src += src_pixels_per_line;
+        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
+        src += src_pixels_per_line;
+        d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);
+
+        d27u8 = vreinterpret_u8_u32(d27u32);
+        d28u8 = vreinterpret_u8_u32(d28u32);
+        d29u8 = vreinterpret_u8_u32(d29u32);
+        d30u8 = vreinterpret_u8_u32(d30u32);
+        d31u8 = vreinterpret_u8_u32(d31u32);
+
+        d23u8 = vext_u8(d27u8, d28u8, 4);
+        d24u8 = vext_u8(d28u8, d29u8, 4);
+        d25u8 = vext_u8(d29u8, d30u8, 4);
+        d26u8 = vext_u8(d30u8, d31u8, 4);
+
+        q3u16 = vmull_u8(d27u8, d0u8);
+        q4u16 = vmull_u8(d28u8, d0u8);
+        q5u16 = vmull_u8(d25u8, d5u8);
+        q6u16 = vmull_u8(d26u8, d5u8);
+
+        q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
+        q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
+        q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
+        q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
+
+        q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
+        q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
+        q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
+        q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+
+        q5s16 = vqaddq_s16(q5s16, q3s16);
+        q6s16 = vqaddq_s16(q6s16, q4s16);
+
+        d3u8 = vqrshrun_n_s16(q5s16, 7);
+        d4u8 = vqrshrun_n_s16(q6s16, 7);
+
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
+        return;
+    }
+
+    // load first_pass filter
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass: output_height lines x output_width columns (9x4)
+
+    if (yoffset == 0)  // firstpass_filter4x4_only
+        src = src_ptr - 2;
+    else
+        src = src_ptr - 2 - (src_pixels_per_line * 2);
+
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+
+    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+    // vswp here
+    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
+    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
+                       vreinterpret_u32_u8(d19u8));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
+                       vreinterpret_u32_u8(d21u8));
+    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
+    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
+
+    // keep original src data in q4 q6
+    q4u64 = vreinterpretq_u64_u8(q3u8);
+    q6u64 = vreinterpretq_u64_u8(q5u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
+                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
+                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
+    q9u64 = vshrq_n_u64(q4u64, 8);
+    q10u64 = vshrq_n_u64(q6u64, 8);
+    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
+                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
+                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
+    q3u64 = vshrq_n_u64(q4u64, 32);
+    q5u64 = vshrq_n_u64(q6u64, 32);
+    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
+    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
+                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
+                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
+    q9u64 = vshrq_n_u64(q4u64, 16);
+    q10u64 = vshrq_n_u64(q6u64, 16);
+    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
+    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
+                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
+                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
+    q3u64 = vshrq_n_u64(q4u64, 24);
+    q5u64 = vshrq_n_u64(q6u64, 24);
+    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
+    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
+                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
+                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
+    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
+    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
+
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+    q7s16 = vqaddq_s16(q7s16, q9s16);
+    q8s16 = vqaddq_s16(q8s16, q10s16);
+
+    d27u8 = vqrshrun_n_s16(q7s16, 7);
+    d28u8 = vqrshrun_n_s16(q8s16, 7);
+
+    if (yoffset == 0) {  // firstpass_filter4x4_only
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
+        return;
+    }
+
+    // First Pass on rest 5-line data
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q11u8 = vld1q_u8(src);
+
+    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+    // vswp here
+    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
+    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
+                       vreinterpret_u32_u8(d19u8));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
+                       vreinterpret_u32_u8(d21u8));
+    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);
+    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
+    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
+    q12u16 = vmull_u8(d31u8, d5u8);
+
+    q4u64 = vreinterpretq_u64_u8(q3u8);
+    q6u64 = vreinterpretq_u64_u8(q5u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
+                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
+                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
+    q9u64 = vshrq_n_u64(q4u64, 8);
+    q10u64 = vshrq_n_u64(q6u64, 8);
+    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+    q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
+                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
+                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
+    q3u64 = vshrq_n_u64(q4u64, 32);
+    q5u64 = vshrq_n_u64(q6u64, 32);
+    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);
+    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
+    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
+                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
+                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
+    q9u64 = vshrq_n_u64(q4u64, 16);
+    q10u64 = vshrq_n_u64(q6u64, 16);
+    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);
+    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
+    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
+                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
+                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
+    q3u64 = vshrq_n_u64(q4u64, 24);
+    q5u64 = vshrq_n_u64(q6u64, 24);
+    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);
+    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
+    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
+                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
+                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
+    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);
+    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
+    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
+    q11u16 = vmull_u8(d31u8, d3u8);
+
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+    q11s16 = vreinterpretq_s16_u16(q11u16);
+    q12s16 = vreinterpretq_s16_u16(q12u16);
+    q7s16 = vqaddq_s16(q7s16, q9s16);
+    q8s16 = vqaddq_s16(q8s16, q10s16);
+    q12s16 = vqaddq_s16(q12s16, q11s16);
+
+    d29u8 = vqrshrun_n_s16(q7s16, 7);
+    d30u8 = vqrshrun_n_s16(q8s16, 7);
+    d31u8 = vqrshrun_n_s16(q12s16, 7);
+
+    // Second pass: 4x4
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    d23u8 = vext_u8(d27u8, d28u8, 4);
+    d24u8 = vext_u8(d28u8, d29u8, 4);
+    d25u8 = vext_u8(d29u8, d30u8, 4);
+    d26u8 = vext_u8(d30u8, d31u8, 4);
+
+    q3u16 = vmull_u8(d27u8, d0u8);
+    q4u16 = vmull_u8(d28u8, d0u8);
+    q5u16 = vmull_u8(d25u8, d5u8);
+    q6u16 = vmull_u8(d26u8, d5u8);
+
+    q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
+    q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
+    q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
+    q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
+
+    q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
+    q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
+    q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
+    q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+
+    q5s16 = vqaddq_s16(q5s16, q3s16);
+    q6s16 = vqaddq_s16(q6s16, q4s16);
+
+    d3u8 = vqrshrun_n_s16(q5s16, 7);
+    d4u8 = vqrshrun_n_s16(q6s16, 7);
+
+    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+    dst_ptr += dst_pitch;
+    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+    dst_ptr += dst_pitch;
+    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
+    dst_ptr += dst_pitch;
+    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
+    return;
+}
+
+void vp8_sixtap_predict8x4_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    unsigned char *src;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
+    uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
+
+    if (xoffset == 0) {  // secondpass_filter8x4_only
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+        // load src data
+        src = src_ptr - src_pixels_per_line * 2;
+        d22u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d23u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d24u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d25u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d26u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d27u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d28u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d29u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d30u8 = vld1_u8(src);
+
+        q3u16 = vmull_u8(d22u8, d0u8);
+        q4u16 = vmull_u8(d23u8, d0u8);
+        q5u16 = vmull_u8(d24u8, d0u8);
+        q6u16 = vmull_u8(d25u8, d0u8);
+
+        q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+        q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+        q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+        q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+        q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+        q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+        q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+        q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+        q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+        q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+        q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+        q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+        q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+        q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+        q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+        q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+        q7u16 = vmull_u8(d25u8, d3u8);
+        q8u16 = vmull_u8(d26u8, d3u8);
+        q9u16 = vmull_u8(d27u8, d3u8);
+        q10u16 = vmull_u8(d28u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+
+        q7s16 = vqaddq_s16(q7s16, q3s16);
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q9s16 = vqaddq_s16(q9s16, q5s16);
+        q10s16 = vqaddq_s16(q10s16, q6s16);
+
+        d6u8 = vqrshrun_n_s16(q7s16, 7);
+        d7u8 = vqrshrun_n_s16(q8s16, 7);
+        d8u8 = vqrshrun_n_s16(q9s16, 7);
+        d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+        vst1_u8(dst_ptr, d6u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d7u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d8u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d9u8);
+        return;
+    }
+
+    // load first_pass filter
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass: output_height lines x output_width columns (9x4)
+    if (yoffset == 0)  // firstpass_filter4x4_only
+        src = src_ptr - 2;
+    else
+        src = src_ptr - 2 - (src_pixels_per_line * 2);
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+
+    q7u16  = vmull_u8(vget_low_u8(q3u8), d0u8);
+    q8u16  = vmull_u8(vget_low_u8(q4u8), d0u8);
+    q9u16  = vmull_u8(vget_low_u8(q5u8), d0u8);
+    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+    q7u16  = vmlsl_u8(q7u16, d28u8, d1u8);
+    q8u16  = vmlsl_u8(q8u16, d29u8, d1u8);
+    q9u16  = vmlsl_u8(q9u16, d30u8, d1u8);
+    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+    q7u16  = vmlsl_u8(q7u16, d28u8, d4u8);
+    q8u16  = vmlsl_u8(q8u16, d29u8, d4u8);
+    q9u16  = vmlsl_u8(q9u16, d30u8, d4u8);
+    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+    q7u16  = vmlal_u8(q7u16, d28u8, d2u8);
+    q8u16  = vmlal_u8(q8u16, d29u8, d2u8);
+    q9u16  = vmlal_u8(q9u16, d30u8, d2u8);
+    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+    q3u16 = vmull_u8(d28u8, d3u8);
+    q4u16 = vmull_u8(d29u8, d3u8);
+    q5u16 = vmull_u8(d30u8, d3u8);
+    q6u16 = vmull_u8(d31u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+
+    q7s16 = vqaddq_s16(q7s16, q3s16);
+    q8s16 = vqaddq_s16(q8s16, q4s16);
+    q9s16 = vqaddq_s16(q9s16, q5s16);
+    q10s16 = vqaddq_s16(q10s16, q6s16);
+
+    d22u8 = vqrshrun_n_s16(q7s16, 7);
+    d23u8 = vqrshrun_n_s16(q8s16, 7);
+    d24u8 = vqrshrun_n_s16(q9s16, 7);
+    d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+    if (yoffset == 0) {  // firstpass_filter8x4_only
+        vst1_u8(dst_ptr, d22u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d23u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d24u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d25u8);
+        return;
+    }
+
+    // First Pass on rest 5-line data
+    src += src_pixels_per_line;
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q7u8 = vld1q_u8(src);
+
+    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+    q8u16  = vmlsl_u8(q8u16, d27u8, d1u8);
+    q9u16  = vmlsl_u8(q9u16, d28u8, d1u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+    q8u16  = vmlsl_u8(q8u16, d27u8, d4u8);
+    q9u16  = vmlsl_u8(q9u16, d28u8, d4u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+    q8u16  = vmlal_u8(q8u16, d27u8, d2u8);
+    q9u16  = vmlal_u8(q9u16, d28u8, d2u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+    q8u16  = vmlal_u8(q8u16, d27u8, d5u8);
+    q9u16  = vmlal_u8(q9u16, d28u8, d5u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+    q3u16 = vmull_u8(d27u8, d3u8);
+    q4u16 = vmull_u8(d28u8, d3u8);
+    q5u16 = vmull_u8(d29u8, d3u8);
+    q6u16 = vmull_u8(d30u8, d3u8);
+    q7u16 = vmull_u8(d31u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+    q11s16 = vreinterpretq_s16_u16(q11u16);
+    q12s16 = vreinterpretq_s16_u16(q12u16);
+
+    q8s16 = vqaddq_s16(q8s16, q3s16);
+    q9s16 = vqaddq_s16(q9s16, q4s16);
+    q10s16 = vqaddq_s16(q10s16, q5s16);
+    q11s16 = vqaddq_s16(q11s16, q6s16);
+    q12s16 = vqaddq_s16(q12s16, q7s16);
+
+    d26u8 = vqrshrun_n_s16(q8s16, 7);
+    d27u8 = vqrshrun_n_s16(q9s16, 7);
+    d28u8 = vqrshrun_n_s16(q10s16, 7);
+    d29u8 = vqrshrun_n_s16(q11s16, 7);
+    d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+    // Second pass: 8x4
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    q3u16 = vmull_u8(d22u8, d0u8);
+    q4u16 = vmull_u8(d23u8, d0u8);
+    q5u16 = vmull_u8(d24u8, d0u8);
+    q6u16 = vmull_u8(d25u8, d0u8);
+
+    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+    q7u16 = vmull_u8(d25u8, d3u8);
+    q8u16 = vmull_u8(d26u8, d3u8);
+    q9u16 = vmull_u8(d27u8, d3u8);
+    q10u16 = vmull_u8(d28u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+
+    q7s16 = vqaddq_s16(q7s16, q3s16);
+    q8s16 = vqaddq_s16(q8s16, q4s16);
+    q9s16 = vqaddq_s16(q9s16, q5s16);
+    q10s16 = vqaddq_s16(q10s16, q6s16);
+
+    d6u8 = vqrshrun_n_s16(q7s16, 7);
+    d7u8 = vqrshrun_n_s16(q8s16, 7);
+    d8u8 = vqrshrun_n_s16(q9s16, 7);
+    d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+    vst1_u8(dst_ptr, d6u8);
+    dst_ptr += dst_pitch;
+    vst1_u8(dst_ptr, d7u8);
+    dst_ptr += dst_pitch;
+    vst1_u8(dst_ptr, d8u8);
+    dst_ptr += dst_pitch;
+    vst1_u8(dst_ptr, d9u8);
+    return;
+}
+
+void vp8_sixtap_predict8x8_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    unsigned char *src, *tmpp;
+    unsigned char tmp[64];
+    int i;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
+    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
+
+    if (xoffset == 0) {  // secondpass_filter8x8_only
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+        // load src data
+        src = src_ptr - src_pixels_per_line * 2;
+        d18u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d19u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d20u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d21u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d22u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d23u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d24u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d25u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d26u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d27u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d28u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d29u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d30u8 = vld1_u8(src);
+
+        for (i = 2; i > 0; i--) {
+            q3u16 = vmull_u8(d18u8, d0u8);
+            q4u16 = vmull_u8(d19u8, d0u8);
+            q5u16 = vmull_u8(d20u8, d0u8);
+            q6u16 = vmull_u8(d21u8, d0u8);
+
+            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+            q7u16 = vmull_u8(d21u8, d3u8);
+            q8u16 = vmull_u8(d22u8, d3u8);
+            q9u16 = vmull_u8(d23u8, d3u8);
+            q10u16 = vmull_u8(d24u8, d3u8);
+
+            q3s16 = vreinterpretq_s16_u16(q3u16);
+            q4s16 = vreinterpretq_s16_u16(q4u16);
+            q5s16 = vreinterpretq_s16_u16(q5u16);
+            q6s16 = vreinterpretq_s16_u16(q6u16);
+            q7s16 = vreinterpretq_s16_u16(q7u16);
+            q8s16 = vreinterpretq_s16_u16(q8u16);
+            q9s16 = vreinterpretq_s16_u16(q9u16);
+            q10s16 = vreinterpretq_s16_u16(q10u16);
+
+            q7s16 = vqaddq_s16(q7s16, q3s16);
+            q8s16 = vqaddq_s16(q8s16, q4s16);
+            q9s16 = vqaddq_s16(q9s16, q5s16);
+            q10s16 = vqaddq_s16(q10s16, q6s16);
+
+            d6u8 = vqrshrun_n_s16(q7s16, 7);
+            d7u8 = vqrshrun_n_s16(q8s16, 7);
+            d8u8 = vqrshrun_n_s16(q9s16, 7);
+            d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+            d18u8 = d22u8;
+            d19u8 = d23u8;
+            d20u8 = d24u8;
+            d21u8 = d25u8;
+            d22u8 = d26u8;
+            d23u8 = d27u8;
+            d24u8 = d28u8;
+            d25u8 = d29u8;
+            d26u8 = d30u8;
+
+            vst1_u8(dst_ptr, d6u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d7u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d8u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d9u8);
+            dst_ptr += dst_pitch;
+        }
+        return;
+    }
+
+    // load first_pass filter
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass: output_height lines x output_width columns (9x4)
+    if (yoffset == 0)  // firstpass_filter4x4_only
+        src = src_ptr - 2;
+    else
+        src = src_ptr - 2 - (src_pixels_per_line * 2);
+
+    tmpp = tmp;
+    for (i = 2; i > 0; i--) {
+        q3u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+        q4u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+        q5u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+        q6u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+
+        __builtin_prefetch(src);
+        __builtin_prefetch(src + src_pixels_per_line);
+        __builtin_prefetch(src + src_pixels_per_line * 2);
+
+        q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+        q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+        q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+        q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+        q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
+        q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+        q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+        q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+        q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
+        q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+        q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+        q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+        q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
+        q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+        q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+        q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+        q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+        q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+        q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+        q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+        q3u16 = vmull_u8(d28u8, d3u8);
+        q4u16 = vmull_u8(d29u8, d3u8);
+        q5u16 = vmull_u8(d30u8, d3u8);
+        q6u16 = vmull_u8(d31u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+
+        q7s16 = vqaddq_s16(q7s16, q3s16);
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q9s16 = vqaddq_s16(q9s16, q5s16);
+        q10s16 = vqaddq_s16(q10s16, q6s16);
+
+        d22u8 = vqrshrun_n_s16(q7s16, 7);
+        d23u8 = vqrshrun_n_s16(q8s16, 7);
+        d24u8 = vqrshrun_n_s16(q9s16, 7);
+        d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+        if (yoffset == 0) {  // firstpass_filter8x4_only
+            vst1_u8(dst_ptr, d22u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d23u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d24u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d25u8);
+            dst_ptr += dst_pitch;
+        } else {
+            vst1_u8(tmpp, d22u8);
+            tmpp += 8;
+            vst1_u8(tmpp, d23u8);
+            tmpp += 8;
+            vst1_u8(tmpp, d24u8);
+            tmpp += 8;
+            vst1_u8(tmpp, d25u8);
+            tmpp += 8;
+        }
+    }
+    if (yoffset == 0)
+        return;
+
+    // First Pass on rest 5-line data
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q7u8 = vld1q_u8(src);
+
+    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+    q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
+    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+    q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
+    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+    q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
+    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+    q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
+    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+    q3u16 = vmull_u8(d27u8, d3u8);
+    q4u16 = vmull_u8(d28u8, d3u8);
+    q5u16 = vmull_u8(d29u8, d3u8);
+    q6u16 = vmull_u8(d30u8, d3u8);
+    q7u16 = vmull_u8(d31u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+    q11s16 = vreinterpretq_s16_u16(q11u16);
+    q12s16 = vreinterpretq_s16_u16(q12u16);
+
+    q8s16 = vqaddq_s16(q8s16, q3s16);
+    q9s16 = vqaddq_s16(q9s16, q4s16);
+    q10s16 = vqaddq_s16(q10s16, q5s16);
+    q11s16 = vqaddq_s16(q11s16, q6s16);
+    q12s16 = vqaddq_s16(q12s16, q7s16);
+
+    d26u8 = vqrshrun_n_s16(q8s16, 7);
+    d27u8 = vqrshrun_n_s16(q9s16, 7);
+    d28u8 = vqrshrun_n_s16(q10s16, 7);
+    d29u8 = vqrshrun_n_s16(q11s16, 7);
+    d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+    // Second pass: 8x8
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    tmpp = tmp;
+    q9u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    q10u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    q11u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    q12u8 = vld1q_u8(tmpp);
+
+    d18u8 = vget_low_u8(q9u8);
+    d19u8 = vget_high_u8(q9u8);
+    d20u8 = vget_low_u8(q10u8);
+    d21u8 = vget_high_u8(q10u8);
+    d22u8 = vget_low_u8(q11u8);
+    d23u8 = vget_high_u8(q11u8);
+    d24u8 = vget_low_u8(q12u8);
+    d25u8 = vget_high_u8(q12u8);
+
+    for (i = 2; i > 0; i--) {
+        q3u16 = vmull_u8(d18u8, d0u8);
+        q4u16 = vmull_u8(d19u8, d0u8);
+        q5u16 = vmull_u8(d20u8, d0u8);
+        q6u16 = vmull_u8(d21u8, d0u8);
+
+        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+        q7u16 = vmull_u8(d21u8, d3u8);
+        q8u16 = vmull_u8(d22u8, d3u8);
+        q9u16 = vmull_u8(d23u8, d3u8);
+        q10u16 = vmull_u8(d24u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+
+        q7s16 = vqaddq_s16(q7s16, q3s16);
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q9s16 = vqaddq_s16(q9s16, q5s16);
+        q10s16 = vqaddq_s16(q10s16, q6s16);
+
+        d6u8 = vqrshrun_n_s16(q7s16, 7);
+        d7u8 = vqrshrun_n_s16(q8s16, 7);
+        d8u8 = vqrshrun_n_s16(q9s16, 7);
+        d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+        d18u8 = d22u8;
+        d19u8 = d23u8;
+        d20u8 = d24u8;
+        d21u8 = d25u8;
+        d22u8 = d26u8;
+        d23u8 = d27u8;
+        d24u8 = d28u8;
+        d25u8 = d29u8;
+        d26u8 = d30u8;
+
+        vst1_u8(dst_ptr, d6u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d7u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d8u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d9u8);
+        dst_ptr += dst_pitch;
+    }
+    return;
+}
+
+void vp8_sixtap_predict16x16_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    unsigned char *src, *src_tmp, *dst, *tmpp;
+    unsigned char tmp[336];
+    int i, j;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
+    uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
+    uint8x8_t d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint8x16_t q3u8, q4u8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
+    uint16x8_t q11u16, q12u16, q13u16, q15u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
+    int16x8_t q11s16, q12s16, q13s16, q15s16;
+
+    if (xoffset == 0) {  // secondpass_filter8x8_only
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+        // load src data
+        src_tmp = src_ptr - src_pixels_per_line * 2;
+        for (i = 0; i < 2; i++) {
+            src = src_tmp + i * 8;
+            dst = dst_ptr + i * 8;
+            d18u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d19u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d20u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d21u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d22u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            for (j = 0; j < 4; j++) {
+                d23u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+                d24u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+                d25u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+                d26u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+
+                q3u16 = vmull_u8(d18u8, d0u8);
+                q4u16 = vmull_u8(d19u8, d0u8);
+                q5u16 = vmull_u8(d20u8, d0u8);
+                q6u16 = vmull_u8(d21u8, d0u8);
+
+                q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+                q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+                q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+                q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+                q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+                q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+                q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+                q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+                q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+                q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+                q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+                q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+                q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+                q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+                q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+                q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+                q7u16 = vmull_u8(d21u8, d3u8);
+                q8u16 = vmull_u8(d22u8, d3u8);
+                q9u16 = vmull_u8(d23u8, d3u8);
+                q10u16 = vmull_u8(d24u8, d3u8);
+
+                q3s16 = vreinterpretq_s16_u16(q3u16);
+                q4s16 = vreinterpretq_s16_u16(q4u16);
+                q5s16 = vreinterpretq_s16_u16(q5u16);
+                q6s16 = vreinterpretq_s16_u16(q6u16);
+                q7s16 = vreinterpretq_s16_u16(q7u16);
+                q8s16 = vreinterpretq_s16_u16(q8u16);
+                q9s16 = vreinterpretq_s16_u16(q9u16);
+                q10s16 = vreinterpretq_s16_u16(q10u16);
+
+                q7s16 = vqaddq_s16(q7s16, q3s16);
+                q8s16 = vqaddq_s16(q8s16, q4s16);
+                q9s16 = vqaddq_s16(q9s16, q5s16);
+                q10s16 = vqaddq_s16(q10s16, q6s16);
+
+                d6u8 = vqrshrun_n_s16(q7s16, 7);
+                d7u8 = vqrshrun_n_s16(q8s16, 7);
+                d8u8 = vqrshrun_n_s16(q9s16, 7);
+                d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+                d18u8 = d22u8;
+                d19u8 = d23u8;
+                d20u8 = d24u8;
+                d21u8 = d25u8;
+                d22u8 = d26u8;
+
+                vst1_u8(dst, d6u8);
+                dst += dst_pitch;
+                vst1_u8(dst, d7u8);
+                dst += dst_pitch;
+                vst1_u8(dst, d8u8);
+                dst += dst_pitch;
+                vst1_u8(dst, d9u8);
+                dst += dst_pitch;
+            }
+        }
+        return;
+    }
+
+    // load first_pass filter
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass: output_height lines x output_width columns (9x4)
+    if (yoffset == 0) {  // firstpass_filter4x4_only
+        src = src_ptr - 2;
+        dst = dst_ptr;
+        for (i = 0; i < 8; i++) {
+            d6u8 = vld1_u8(src);
+            d7u8 = vld1_u8(src + 8);
+            d8u8 = vld1_u8(src + 16);
+            src += src_pixels_per_line;
+            d9u8 = vld1_u8(src);
+            d10u8 = vld1_u8(src + 8);
+            d11u8 = vld1_u8(src + 16);
+            src += src_pixels_per_line;
+
+            __builtin_prefetch(src);
+            __builtin_prefetch(src + src_pixels_per_line);
+
+            q6u16 = vmull_u8(d6u8, d0u8);
+            q7u16 = vmull_u8(d7u8, d0u8);
+            q8u16 = vmull_u8(d9u8, d0u8);
+            q9u16 = vmull_u8(d10u8, d0u8);
+
+            d20u8 = vext_u8(d6u8, d7u8, 1);
+            d21u8 = vext_u8(d9u8, d10u8, 1);
+            d22u8 = vext_u8(d7u8, d8u8, 1);
+            d23u8 = vext_u8(d10u8, d11u8, 1);
+            d24u8 = vext_u8(d6u8, d7u8, 4);
+            d25u8 = vext_u8(d9u8, d10u8, 4);
+            d26u8 = vext_u8(d7u8, d8u8, 4);
+            d27u8 = vext_u8(d10u8, d11u8, 4);
+            d28u8 = vext_u8(d6u8, d7u8, 5);
+            d29u8 = vext_u8(d9u8, d10u8, 5);
+
+            q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
+            q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
+            q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
+            q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
+            q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
+            q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
+            q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
+            q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
+            q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
+            q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+
+            d20u8 = vext_u8(d7u8, d8u8, 5);
+            d21u8 = vext_u8(d10u8, d11u8, 5);
+            d22u8 = vext_u8(d6u8, d7u8, 2);
+            d23u8 = vext_u8(d9u8, d10u8, 2);
+            d24u8 = vext_u8(d7u8, d8u8, 2);
+            d25u8 = vext_u8(d10u8, d11u8, 2);
+            d26u8 = vext_u8(d6u8, d7u8, 3);
+            d27u8 = vext_u8(d9u8, d10u8, 3);
+            d28u8 = vext_u8(d7u8, d8u8, 3);
+            d29u8 = vext_u8(d10u8, d11u8, 3);
+
+            q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
+            q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
+            q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
+            q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
+            q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
+            q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
+
+            q10u16 = vmull_u8(d26u8, d3u8);
+            q11u16 = vmull_u8(d27u8, d3u8);
+            q12u16 = vmull_u8(d28u8, d3u8);
+            q15u16 = vmull_u8(d29u8, d3u8);
+
+            q6s16 = vreinterpretq_s16_u16(q6u16);
+            q7s16 = vreinterpretq_s16_u16(q7u16);
+            q8s16 = vreinterpretq_s16_u16(q8u16);
+            q9s16 = vreinterpretq_s16_u16(q9u16);
+            q10s16 = vreinterpretq_s16_u16(q10u16);
+            q11s16 = vreinterpretq_s16_u16(q11u16);
+            q12s16 = vreinterpretq_s16_u16(q12u16);
+            q15s16 = vreinterpretq_s16_u16(q15u16);
+
+            q6s16 = vqaddq_s16(q6s16, q10s16);
+            q8s16 = vqaddq_s16(q8s16, q11s16);
+            q7s16 = vqaddq_s16(q7s16, q12s16);
+            q9s16 = vqaddq_s16(q9s16, q15s16);
+
+            d6u8 = vqrshrun_n_s16(q6s16, 7);
+            d7u8 = vqrshrun_n_s16(q7s16, 7);
+            d8u8 = vqrshrun_n_s16(q8s16, 7);
+            d9u8 = vqrshrun_n_s16(q9s16, 7);
+
+            q3u8 = vcombine_u8(d6u8, d7u8);
+            q4u8 = vcombine_u8(d8u8, d9u8);
+            vst1q_u8(dst, q3u8);
+            dst += dst_pitch;
+            vst1q_u8(dst, q4u8);
+            dst += dst_pitch;
+        }
+        return;
+    }
+
+    src = src_ptr - 2 - src_pixels_per_line * 2;
+    tmpp = tmp;
+    for (i = 0; i < 7; i++) {
+        d6u8 = vld1_u8(src);
+        d7u8 = vld1_u8(src + 8);
+        d8u8 = vld1_u8(src + 16);
+        src += src_pixels_per_line;
+        d9u8 = vld1_u8(src);
+        d10u8 = vld1_u8(src + 8);
+        d11u8 = vld1_u8(src + 16);
+        src += src_pixels_per_line;
+        d12u8 = vld1_u8(src);
+        d13u8 = vld1_u8(src + 8);
+        d14u8 = vld1_u8(src + 16);
+        src += src_pixels_per_line;
+
+        __builtin_prefetch(src);
+        __builtin_prefetch(src + src_pixels_per_line);
+        __builtin_prefetch(src + src_pixels_per_line * 2);
+
+        q8u16 = vmull_u8(d6u8, d0u8);
+        q9u16 = vmull_u8(d7u8, d0u8);
+        q10u16 = vmull_u8(d9u8, d0u8);
+        q11u16 = vmull_u8(d10u8, d0u8);
+        q12u16 = vmull_u8(d12u8, d0u8);
+        q13u16 = vmull_u8(d13u8, d0u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 1);
+        d29u8 = vext_u8(d9u8, d10u8, 1);
+        d30u8 = vext_u8(d12u8, d13u8, 1);
+        q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
+        q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+        q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
+        d28u8 = vext_u8(d7u8, d8u8, 1);
+        d29u8 = vext_u8(d10u8, d11u8, 1);
+        d30u8 = vext_u8(d13u8, d14u8, 1);
+        q9u16  = vmlsl_u8(q9u16, d28u8, d1u8);
+        q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
+        q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 4);
+        d29u8 = vext_u8(d9u8, d10u8, 4);
+        d30u8 = vext_u8(d12u8, d13u8, 4);
+        q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
+        q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+        q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
+        d28u8 = vext_u8(d7u8, d8u8, 4);
+        d29u8 = vext_u8(d10u8, d11u8, 4);
+        d30u8 = vext_u8(d13u8, d14u8, 4);
+        q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+        q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
+        q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 5);
+        d29u8 = vext_u8(d9u8, d10u8, 5);
+        d30u8 = vext_u8(d12u8, d13u8, 5);
+        q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
+        q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+        q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
+        d28u8 = vext_u8(d7u8, d8u8, 5);
+        d29u8 = vext_u8(d10u8, d11u8, 5);
+        d30u8 = vext_u8(d13u8, d14u8, 5);
+        q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+        q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
+        q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 2);
+        d29u8 = vext_u8(d9u8, d10u8, 2);
+        d30u8 = vext_u8(d12u8, d13u8, 2);
+        q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
+        q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+        q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
+        d28u8 = vext_u8(d7u8, d8u8, 2);
+        d29u8 = vext_u8(d10u8, d11u8, 2);
+        d30u8 = vext_u8(d13u8, d14u8, 2);
+        q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+        q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
+        q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 3);
+        d29u8 = vext_u8(d9u8, d10u8, 3);
+        d30u8 = vext_u8(d12u8, d13u8, 3);
+        d15u8 = vext_u8(d7u8, d8u8, 3);
+        d31u8 = vext_u8(d10u8, d11u8, 3);
+        d6u8  = vext_u8(d13u8, d14u8, 3);
+        q4u16 = vmull_u8(d28u8, d3u8);
+        q5u16 = vmull_u8(d29u8, d3u8);
+        q6u16 = vmull_u8(d30u8, d3u8);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+        q12s16 = vreinterpretq_s16_u16(q12u16);
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q10s16 = vqaddq_s16(q10s16, q5s16);
+        q12s16 = vqaddq_s16(q12s16, q6s16);
+
+        q6u16 = vmull_u8(d15u8, d3u8);
+        q7u16 = vmull_u8(d31u8, d3u8);
+        q3u16 = vmull_u8(d6u8, d3u8);
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q11s16 = vreinterpretq_s16_u16(q11u16);
+        q13s16 = vreinterpretq_s16_u16(q13u16);
+        q9s16 = vqaddq_s16(q9s16, q6s16);
+        q11s16 = vqaddq_s16(q11s16, q7s16);
+        q13s16 = vqaddq_s16(q13s16, q3s16);
+
+        d6u8 = vqrshrun_n_s16(q8s16, 7);
+        d7u8 = vqrshrun_n_s16(q9s16, 7);
+        d8u8 = vqrshrun_n_s16(q10s16, 7);
+        d9u8 = vqrshrun_n_s16(q11s16, 7);
+        d10u8 = vqrshrun_n_s16(q12s16, 7);
+        d11u8 = vqrshrun_n_s16(q13s16, 7);
+
+        vst1_u8(tmpp, d6u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d7u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d8u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d9u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d10u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d11u8);
+        tmpp += 8;
+    }
+
+    // Second pass: 16x16
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    for (i = 0; i < 2; i++) {
+        dst = dst_ptr + 8 * i;
+        tmpp = tmp + 8 * i;
+        d18u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d19u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d20u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d21u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d22u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        for (j = 0; j < 4; j++) {
+            d23u8 = vld1_u8(tmpp);
+            tmpp += 16;
+            d24u8 = vld1_u8(tmpp);
+            tmpp += 16;
+            d25u8 = vld1_u8(tmpp);
+            tmpp += 16;
+            d26u8 = vld1_u8(tmpp);
+            tmpp += 16;
+
+            q3u16 = vmull_u8(d18u8, d0u8);
+            q4u16 = vmull_u8(d19u8, d0u8);
+            q5u16 = vmull_u8(d20u8, d0u8);
+            q6u16 = vmull_u8(d21u8, d0u8);
+
+            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+            q7u16 = vmull_u8(d21u8, d3u8);
+            q8u16 = vmull_u8(d22u8, d3u8);
+            q9u16 = vmull_u8(d23u8, d3u8);
+            q10u16 = vmull_u8(d24u8, d3u8);
+
+            q3s16 = vreinterpretq_s16_u16(q3u16);
+            q4s16 = vreinterpretq_s16_u16(q4u16);
+            q5s16 = vreinterpretq_s16_u16(q5u16);
+            q6s16 = vreinterpretq_s16_u16(q6u16);
+            q7s16 = vreinterpretq_s16_u16(q7u16);
+            q8s16 = vreinterpretq_s16_u16(q8u16);
+            q9s16 = vreinterpretq_s16_u16(q9u16);
+            q10s16 = vreinterpretq_s16_u16(q10u16);
+
+            q7s16 = vqaddq_s16(q7s16, q3s16);
+            q8s16 = vqaddq_s16(q8s16, q4s16);
+            q9s16 = vqaddq_s16(q9s16, q5s16);
+            q10s16 = vqaddq_s16(q10s16, q6s16);
+
+            d6u8 = vqrshrun_n_s16(q7s16, 7);
+            d7u8 = vqrshrun_n_s16(q8s16, 7);
+            d8u8 = vqrshrun_n_s16(q9s16, 7);
+            d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+            d18u8 = d22u8;
+            d19u8 = d23u8;
+            d20u8 = d24u8;
+            d21u8 = d25u8;
+            d22u8 = d26u8;
+
+            vst1_u8(dst, d6u8);
+            dst += dst_pitch;
+            vst1_u8(dst, d7u8);
+            dst += dst_pitch;
+            vst1_u8(dst, d8u8);
+            dst += dst_pitch;
+            vst1_u8(dst, d9u8);
+            dst += dst_pitch;
+        }
+    }
+    return;
+}
diff --git a/vp8/common/arm/neon/variance_neon.asm b/vp8/common/arm/neon/variance_neon.asm
deleted file mode 100644
index e3b4832..0000000
--- a/vp8/common/arm/neon/variance_neon.asm
+++ /dev/null
@@ -1,276 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_variance16x16_neon|
-    EXPORT  |vp8_variance16x8_neon|
-    EXPORT  |vp8_variance8x16_neon|
-    EXPORT  |vp8_variance8x8_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp8_variance16x16_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #8
-
-variance16x16_neon_loop
-    vld1.8          {q0}, [r0], r1              ;Load up source and reference
-    vld1.8          {q2}, [r2], r3
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
-    ;the results into the elements of the destination vector. The explanation
-    ;in ARM guide is wrong.
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             variance16x16_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
-    ;vmov.32        r1, d1[0]
-    ;mul            r0, r0, r0
-    ;str            r1, [r12]
-    ;sub            r0, r1, r0, lsr #8
-
-    ; while sum is signed, sum * sum is always positive and must be treated as
-    ; unsigned to avoid propagating the sign bit.
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-;================================
-;unsigned int vp8_variance16x8_c(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;   unsigned int *sse)
-|vp8_variance16x8_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #4
-
-variance16x8_neon_loop
-    vld1.8          {q0}, [r0], r1              ;Load up source and reference
-    vld1.8          {q2}, [r2], r3
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             variance16x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.u32        d10, d10, #7
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-;=================================
-;unsigned int vp8_variance8x16_c(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;   unsigned int *sse)
-
-|vp8_variance8x16_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #8
-
-variance8x16_neon_loop
-    vld1.8          {d0}, [r0], r1              ;Load up source and reference
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d6}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d2, d6
-
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-
-    bne             variance8x16_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.u32        d10, d10, #7
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-;==================================
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp8_variance8x8_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #2
-
-variance8x8_neon_loop
-    vld1.8          {d0}, [r0], r1              ;Load up source and reference
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d1}, [r0], r1
-    vld1.8          {d5}, [r2], r3
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d6}, [r2], r3
-    vld1.8          {d3}, [r0], r1
-    vld1.8          {d7}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             variance8x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.u32        d10, d10, #6
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/variance_neon.c b/vp8/common/arm/neon/variance_neon.c
new file mode 100644
index 0000000..afd2dc3
--- /dev/null
+++ b/vp8/common/arm/neon/variance_neon.c
@@ -0,0 +1,323 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#ifdef _MSC_VER
+#define __builtin_prefetch(x)
+#endif
+
+unsigned int vp8_variance16x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance16x8_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance8x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d2u8, d4u8, d6u8;
+    int16x4_t d22s16, d23s16, d24s16, d25s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint16x8_t q11u16, q12u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
+        d0u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d2u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        d4u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d6u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(d0u8, d4u8);
+        q12u16 = vsubl_u8(d2u8, d6u8);
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance8x8_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
+        d0u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d1u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d2u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d3u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+
+        d4u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d5u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d6u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d7u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+
+        q11u16 = vsubl_u8(d0u8, d4u8);
+        q12u16 = vsubl_u8(d1u8, d5u8);
+        q13u16 = vsubl_u8(d2u8, d6u8);
+        q14u16 = vsubl_u8(d3u8, d7u8);
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
index 9d22c52..adc5b7e 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -31,11 +31,12 @@
 
 |vp8_sub_pixel_variance16x16_neon_func| PROC
     push            {r4-r6, lr}
+    vpush           {d8-d15}
 
     adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #16]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #20]           ;load dst_pixels_per_line from stack
-    ldr             r6, [sp, #24]           ;load *sse from stack
+    ldr             r4, [sp, #80]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #84]           ;load dst_pixels_per_line from stack
+    ldr             r6, [sp, #88]           ;load *sse from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_bfilter16x16_only
@@ -416,6 +417,7 @@
     add             sp, sp, #528
     vmov.32         r0, d0[0]                   ;return
 
+    vpop            {d8-d15}
     pop             {r4-r6,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
index 155be4f..b0829af 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -31,9 +31,10 @@
 ;================================================
 |vp8_variance_halfpixvar16x16_h_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
 
     mov             r12, #4                  ;loop counter
-    ldr             lr, [sp, #4]           ;load *sse from stack
+    ldr             lr, [sp, #68]            ;load *sse from stack
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -116,6 +117,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
 
@@ -131,11 +134,12 @@
 ;================================================
 |vp8_variance_halfpixvar16x16_v_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
 
     mov             r12, #4                     ;loop counter
 
     vld1.u8         {q0}, [r0], r1              ;load src data
-    ldr             lr, [sp, #4]                ;load *sse from stack
+    ldr             lr, [sp, #68]               ;load *sse from stack
 
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
@@ -212,6 +216,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
 
@@ -227,10 +233,11 @@
 ;================================================
 |vp8_variance_halfpixvar16x16_hv_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
 
     vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
 
-    ldr             lr, [sp, #4]           ;load *sse from stack
+    ldr             lr, [sp, #68]           ;load *sse from stack
     vmov.i8         q13, #0                      ;q8 - sum
     vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
 
@@ -331,6 +338,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
 
@@ -349,10 +358,11 @@
 
 |vp8_sub_pixel_variance16x16s_neon| PROC
     push            {r4, lr}
+    vpush           {d8-d15}
 
-    ldr             r4, [sp, #8]            ;load *dst_ptr from stack
-    ldr             r12, [sp, #12]          ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #16]           ;load *sse from stack
+    ldr             r4, [sp, #72]           ;load *dst_ptr from stack
+    ldr             r12, [sp, #76]          ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #80]           ;load *sse from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_bfilter16x16s_only
@@ -566,6 +576,7 @@
     add             sp, sp, #256
     vmov.32         r0, d0[0]                   ;return
 
+    vpop            {d8-d15}
     pop             {r4, pc}
     ENDP
 
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
index f6b6847..9d9f9e0 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -26,11 +26,12 @@
 
 |vp8_sub_pixel_variance8x8_neon| PROC
     push            {r4-r5, lr}
+    vpush           {d8-d15}
 
     adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #20]           ;load *sse from stack
+    ldr             r4, [sp, #76]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #80]           ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #84]           ;load *sse from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             skip_firstpass_filter
@@ -210,6 +211,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {r4-r5, pc}
 
     ENDP
diff --git a/vp8/common/arm/reconintra_arm.c b/vp8/common/arm/reconintra_arm.c
index 2874896..765fc3a 100644
--- a/vp8/common/arm/reconintra_arm.c
+++ b/vp8/common/arm/reconintra_arm.c
@@ -14,7 +14,7 @@
 #include "vp8/common/blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
-#if HAVE_NEON
+#if HAVE_NEON_ARM
 extern void vp8_build_intra_predictors_mby_neon_func(
     unsigned char *y_buffer,
     unsigned char *ypred_ptr,
diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c
index 467a509..e3f7083 100644
--- a/vp8/common/arm/variance_arm.c
+++ b/vp8/common/arm/variance_arm.c
@@ -95,7 +95,7 @@
 #endif /* HAVE_MEDIA */
 
 
-#if HAVE_NEON
+#if HAVE_NEON_ASM
 
 extern unsigned int vp8_sub_pixel_variance16x16_neon_func
 (
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 7d0fbf6..8e546d5 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -310,6 +310,7 @@
     }
 }
 
+#if CONFIG_POSTPROC
 static void vp8_de_mblock(YV12_BUFFER_CONFIG         *post,
                           int                         q)
 {
@@ -382,6 +383,7 @@
         vp8_yv12_copy_frame(source, post);
     }
 }
+#endif
 
 #if !(CONFIG_TEMPORAL_DENOISING)
 void vp8_de_noise(VP8_COMMON                 *cm,
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 130d965..dcf5b8e 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -29,8 +29,9 @@
 # Dequant
 #
 add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
-specialize qw/vp8_dequantize_b mmx media neon/;
+specialize qw/vp8_dequantize_b mmx media neon_asm/;
 $vp8_dequantize_b_media=vp8_dequantize_b_v6;
+$vp8_dequantize_b_neon_asm=vp8_dequantize_b_neon;
 
 add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
 specialize qw/vp8_dequant_idct_add mmx media neon dspr2/;
@@ -38,70 +39,76 @@
 $vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
 
 add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2/;
+specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon_asm dspr2/;
 $vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
+$vp8_dequant_idct_add_y_block_neon_asm=vp8_dequant_idct_add_y_block_neon;
 $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
 
 add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2/;
+specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon_asm dspr2/;
 $vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
+$vp8_dequant_idct_add_uv_block_neon_asm=vp8_dequant_idct_add_uv_block_neon;
 $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
 
 #
 # Loopfilter
 #
 add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2/;
+specialize qw/vp8_loop_filter_mbv mmx sse2 media neon_asm dspr2/;
 $vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
+$vp8_loop_filter_mbv_neon_asm=vp8_loop_filter_mbv_neon;
 $vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
 
 add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2/;
+specialize qw/vp8_loop_filter_bv mmx sse2 media neon_asm dspr2/;
 $vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
+$vp8_loop_filter_bv_neon_asm=vp8_loop_filter_bv_neon;
 $vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
 
 add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2/;
+specialize qw/vp8_loop_filter_mbh mmx sse2 media neon_asm dspr2/;
 $vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
+$vp8_loop_filter_mbh_neon_asm=vp8_loop_filter_mbh_neon;
 $vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
 
 add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2/;
+specialize qw/vp8_loop_filter_bh mmx sse2 media neon_asm dspr2/;
 $vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
+$vp8_loop_filter_bh_neon_asm=vp8_loop_filter_bh_neon;
 $vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
 
 
 add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon/;
+specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon_asm/;
 $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
 $vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
 $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
 $vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
-$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
+$vp8_loop_filter_simple_mbv_neon_asm=vp8_loop_filter_mbvs_neon;
 
 add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/;
+specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon_asm/;
 $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
 $vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
 $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
 $vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
-$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
+$vp8_loop_filter_simple_mbh_neon_asm=vp8_loop_filter_mbhs_neon;
 
 add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon/;
+specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon_asm/;
 $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
 $vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
 $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
 $vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
-$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
+$vp8_loop_filter_simple_bv_neon_asm=vp8_loop_filter_bvs_neon;
 
 add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/;
+specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon_asm/;
 $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
 $vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
 $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
 $vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6;
-$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
+$vp8_loop_filter_simple_bh_neon_asm=vp8_loop_filter_bhs_neon;
 
 #
 # IDCT
@@ -269,9 +276,10 @@
 $vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt;
 
 add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media neon/;
+specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media neon_asm/;
 $vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt;
 $vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6;
+$vp8_sub_pixel_variance8x8_neon_asm=vp8_sub_pixel_variance8x8_neon;
 
 add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
 specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/;
@@ -282,24 +290,28 @@
 $vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt;
 
 add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon/;
+specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon_asm/;
 $vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt;
 $vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6;
+$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;
 
 add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/;
+specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon_asm/;
 $vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
 $vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
+$vp8_variance_halfpixvar16x16_h_neon_asm=vp8_variance_halfpixvar16x16_h_neon;
 
 add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/;
+specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon_asm/;
 $vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
 $vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
+$vp8_variance_halfpixvar16x16_v_neon_asm=vp8_variance_halfpixvar16x16_v_neon;
 
 add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
+specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon_asm/;
 $vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
 $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
+$vp8_variance_halfpixvar16x16_hv_neon_asm=vp8_variance_halfpixvar16x16_hv_neon;
 
 #
 # Single block SAD
@@ -402,12 +414,14 @@
 $vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;
 
 add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_mse16x16 mmx sse2 media neon/;
+specialize qw/vp8_mse16x16 mmx sse2 media neon_asm/;
 $vp8_mse16x16_sse2=vp8_mse16x16_wmt;
 $vp8_mse16x16_media=vp8_mse16x16_armv6;
+$vp8_mse16x16_neon_asm=vp8_mse16x16_neon;
 
 add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
-specialize qw/vp8_get4x4sse_cs mmx neon/;
+specialize qw/vp8_get4x4sse_cs mmx neon_asm/;
+$vp8_get4x4sse_cs_neon_asm=vp8_get4x4sse_cs_neon;
 
 #
 # Block copy
@@ -434,16 +448,19 @@
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 mmx sse2 media neon/;
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon_asm/;
 $vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
+$vp8_short_fdct4x4_neon_asm=vp8_short_fdct4x4_neon;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/;
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon_asm/;
 $vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
+$vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 media neon/;
+specialize qw/vp8_short_walsh4x4 sse2 media neon_asm/;
 $vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
+$vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
 
 #
 # Quantizer
@@ -454,14 +471,16 @@
 #$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4;
 
 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon/;
+specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
 $vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6;
+$vp8_fast_quantize_b_neon_asm=vp8_fast_quantize_b_neon;
 
 add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
 # no asm yet
 
 add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
-specialize qw/vp8_fast_quantize_b_pair neon/;
+specialize qw/vp8_fast_quantize_b_pair neon_asm/;
+$vp8_fast_quantize_b_pair_neon_asm=vp8_fast_quantize_b_pair_neon;
 
 add_proto qw/void vp8_quantize_mb/, "struct macroblock *";
 specialize qw/vp8_quantize_mb neon/;
@@ -488,16 +507,19 @@
 $vp8_mbuverror_sse2=vp8_mbuverror_xmm;
 
 add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
-specialize qw/vp8_subtract_b mmx sse2 media neon/;
+specialize qw/vp8_subtract_b mmx sse2 media neon_asm/;
 $vp8_subtract_b_media=vp8_subtract_b_armv6;
+$vp8_subtract_b_neon_asm=vp8_subtract_b_neon;
 
 add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
-specialize qw/vp8_subtract_mby mmx sse2 media neon/;
+specialize qw/vp8_subtract_mby mmx sse2 media neon_asm/;
 $vp8_subtract_mby_media=vp8_subtract_mby_armv6;
+$vp8_subtract_mby_neon_asm=vp8_subtract_mby_neon;
 
 add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
-specialize qw/vp8_subtract_mbuv mmx sse2 media neon/;
+specialize qw/vp8_subtract_mbuv mmx sse2 media neon_asm/;
 $vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6;
+$vp8_subtract_mbuv_neon_asm=vp8_subtract_mbuv_neon;
 
 #
 # Motion search
@@ -526,7 +548,8 @@
 # Pick Loopfilter
 #
 add_proto qw/void vp8_yv12_copy_partial_frame/, "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vp8_yv12_copy_partial_frame neon/;
+specialize qw/vp8_yv12_copy_partial_frame neon_asm/;
+$vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
 
 #
 # Denoiser filter
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 2d9e343..29fea61 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -178,12 +178,6 @@
    return pbi->common.error.error_code;
 }
 
-/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/
-#if HAVE_NEON
-extern void vp8_push_neon(int64_t *store);
-extern void vp8_pop_neon(int64_t *store);
-#endif
-
 static int get_free_fb (VP8_COMMON *cm)
 {
     int i;
@@ -307,9 +301,6 @@
                                   const uint8_t *source,
                                   int64_t time_stamp)
 {
-#if HAVE_NEON
-    int64_t dx_store_reg[8];
-#endif
     VP8_COMMON *cm = &pbi->common;
     int retcode = -1;
 
@@ -319,15 +310,6 @@
     if(retcode <= 0)
         return retcode;
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->cpu_caps & HAS_NEON)
-#endif
-    {
-        vp8_push_neon(dx_store_reg);
-    }
-#endif
-
     cm->new_fb_idx = get_free_fb (cm);
 
     /* setup reference frames for vp8_decode_frame */
@@ -403,15 +385,6 @@
     pbi->last_time_stamp = time_stamp;
 
 decode_exit:
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->cpu_caps & HAS_NEON)
-#endif
-    {
-        vp8_pop_neon(dx_store_reg);
-    }
-#endif
-
     pbi->common.error.setjmp = 0;
     return retcode;
 }
diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c
index 3f85397..23dc0a9 100644
--- a/vp8/encoder/arm/neon/denoising_neon.c
+++ b/vp8/encoder/arm/neon/denoising_neon.c
@@ -68,14 +68,11 @@
     int            mc_running_avg_y_stride = mc_running_avg->y_stride;
     unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
     int            running_avg_y_stride = running_avg->y_stride;
+    int64x2_t v_sum_diff_total = vdupq_n_s64(0);
 
     /* Go over lines. */
     int i;
-    int sum_diff = 0;
     for (i = 0; i < 16; ++i) {
-        int8x16_t v_sum_diff = vdupq_n_s8(0);
-        uint8x16_t v_running_avg_y;
-
         /* Load inputs. */
         const uint8x16_t v_sig = vld1q_u8(sig);
         const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
@@ -117,12 +114,9 @@
                                                      v_abs_adjustment);
         const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
                                                      v_abs_adjustment);
-        v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+
+        uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
         v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
-        v_sum_diff = vqaddq_s8(v_sum_diff,
-                               vreinterpretq_s8_u8(v_pos_adjustment));
-        v_sum_diff = vqsubq_s8(v_sum_diff,
-                               vreinterpretq_s8_u8(v_neg_adjustment));
 
         /* Store results. */
         vst1q_u8(running_avg_y, v_running_avg_y);
@@ -131,23 +125,19 @@
          * for this macroblock.
          */
         {
-            int s0 = vgetq_lane_s8(v_sum_diff,  0) +
-                     vgetq_lane_s8(v_sum_diff,  1) +
-                     vgetq_lane_s8(v_sum_diff,  2) +
-                     vgetq_lane_s8(v_sum_diff,  3);
-            int s1 = vgetq_lane_s8(v_sum_diff,  4) +
-                     vgetq_lane_s8(v_sum_diff,  5) +
-                     vgetq_lane_s8(v_sum_diff,  6) +
-                     vgetq_lane_s8(v_sum_diff,  7);
-            int s2 = vgetq_lane_s8(v_sum_diff,  8) +
-                     vgetq_lane_s8(v_sum_diff,  9) +
-                     vgetq_lane_s8(v_sum_diff, 10) +
-                     vgetq_lane_s8(v_sum_diff, 11);
-            int s3 = vgetq_lane_s8(v_sum_diff, 12) +
-                     vgetq_lane_s8(v_sum_diff, 13) +
-                     vgetq_lane_s8(v_sum_diff, 14) +
-                     vgetq_lane_s8(v_sum_diff, 15);
-            sum_diff += s0 + s1+ s2 + s3;
+            const int8x16_t v_sum_diff =
+                vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+                          vreinterpretq_s8_u8(v_neg_adjustment));
+
+            const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
+
+            const int32x4_t fedc_ba98_7654_3210 =
+                vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+
+            const int64x2_t fedcba98_76543210 =
+                vpaddlq_s32(fedc_ba98_7654_3210);
+
+            v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
         }
 
         /* Update pointers for next iteration. */
@@ -157,11 +147,20 @@
     }
 
     /* Too much adjustments => copy block. */
-    if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
-        return COPY_BLOCK;
+    {
+        const int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+                                      vget_low_s64(v_sum_diff_total));
+        const int s0 = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+
+        if (s0 > SUM_DIFF_THRESHOLD)
+            return COPY_BLOCK;
+    }
 
     /* Tell above level that block was filtered. */
-    vp8_copy_mem16x16(running_avg->y_buffer + y_offset, running_avg_y_stride,
-                      signal->thismb, sig_stride);
+    running_avg_y -= running_avg_y_stride * 16;
+    sig -= sig_stride * 16;
+
+    vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride);
+
     return FILTER_BLOCK;
 }
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
index 5bda786..840cb33 100644
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ b/vp8/encoder/arm/neon/subtract_neon.asm
@@ -65,8 +65,10 @@
 ;                           unsigned char *pred, int pred_stride)
 |vp8_subtract_mby_neon| PROC
     push            {r4-r7}
+    vpush           {d8-d15}
+
     mov             r12, #4
-    ldr             r4, [sp, #16]           ; pred_stride
+    ldr             r4, [sp, #80]           ; pred_stride
     mov             r6, #32                 ; "diff" stride x2
     add             r5, r0, #16             ; second diff pointer
 
@@ -101,6 +103,7 @@
     subs            r12, r12, #1
     bne             subtract_mby_loop
 
+    vpop            {d8-d15}
     pop             {r4-r7}
     bx              lr
     ENDP
@@ -112,9 +115,11 @@
 
 |vp8_subtract_mbuv_neon| PROC
     push            {r4-r7}
-    ldr             r4, [sp, #16]       ; upred
-    ldr             r5, [sp, #20]       ; vpred
-    ldr             r6, [sp, #24]       ; pred_stride
+    vpush           {d8-d15}
+
+    ldr             r4, [sp, #80]       ; upred
+    ldr             r5, [sp, #84]       ; vpred
+    ldr             r6, [sp, #88]       ; pred_stride
     add             r0, r0, #512        ; short *udiff = diff + 256;
     mov             r12, #32            ; "diff" stride x2
     add             r7, r0, #16         ; second diff pointer
@@ -191,6 +196,7 @@
     vst1.16         {q14}, [r0], r12
     vst1.16         {q15}, [r7], r12
 
+    vpop            {d8-d15}
     pop             {r4-r7}
     bx              lr
 
diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
index 5b9f11e..d219e2d 100644
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -21,6 +21,7 @@
 ;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
 ;                             int sz);
 |vp8_memcpy_partial_neon| PROC
+    vpush               {d8-d15}
     ;pld                [r1]                        ;preload pred data
     ;pld                [r1, #128]
     ;pld                [r1, #256]
@@ -64,6 +65,7 @@
     bne             extra_copy_neon_loop
 
 done_copy_neon_loop
+    vpop            {d8-d15}
     bx              lr
     ENDP
 
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
index 55edbf5..f82af3e 100644
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -27,6 +27,8 @@
 ;from vp8_variance().
 
 |vp8_mse16x16_neon| PROC
+    vpush           {q7}
+
     vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
     vmov.i8         q8, #0
     vmov.i8         q9, #0
@@ -62,7 +64,7 @@
     vadd.u32        q7, q7, q8
     vadd.u32        q9, q9, q10
 
-    ldr             r12, [sp]               ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack
 
     vadd.u32        q10, q7, q9
     vpaddl.u32      q1, q10
@@ -71,6 +73,7 @@
     vst1.32         {d0[0]}, [r12]
     vmov.32         r0, d0[0]
 
+    vpop            {q7}
     bx              lr
 
     ENDP
@@ -82,6 +85,8 @@
 ; r2    unsigned char *ref_ptr,
 ; r3    int  recon_stride
 |vp8_get4x4sse_cs_neon| PROC
+    vpush           {q7}
+
     vld1.8          {d0}, [r0], r1              ;Load up source and reference
     vld1.8          {d4}, [r2], r3
     vld1.8          {d1}, [r0], r1
@@ -109,6 +114,8 @@
     vadd.u64        d0, d2, d3
 
     vmov.32         r0, d0[0]
+
+    vpop            {q7}
     bx              lr
 
     ENDP
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 32c5997..e95e44f 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -4820,33 +4820,11 @@
 }
 #endif
 
-/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
-#if HAVE_NEON
-extern void vp8_push_neon(int64_t *store);
-extern void vp8_pop_neon(int64_t *store);
-#endif
-
-
 int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time)
 {
-#if HAVE_NEON
-    int64_t store_reg[8];
-#if CONFIG_RUNTIME_CPU_DETECT
-    VP8_COMMON            *cm = &cpi->common;
-#endif
-#endif
     struct vpx_usec_timer  timer;
     int                    res = 0;
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->cpu_caps & HAS_NEON)
-#endif
-    {
-        vp8_push_neon(store_reg);
-    }
-#endif
-
     vpx_usec_timer_start(&timer);
 
     /* Reinit the lookahead buffer if the frame size changes */
@@ -4863,15 +4841,6 @@
     vpx_usec_timer_mark(&timer);
     cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->cpu_caps & HAS_NEON)
-#endif
-    {
-        vp8_pop_neon(store_reg);
-    }
-#endif
-
     return res;
 }
 
@@ -4892,9 +4861,6 @@
 
 int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush)
 {
-#if HAVE_NEON
-    int64_t store_reg[8];
-#endif
     VP8_COMMON *cm;
     struct vpx_usec_timer  tsctimer;
     struct vpx_usec_timer  ticktimer;
@@ -4914,15 +4880,6 @@
 
     cpi->common.error.setjmp = 1;
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->cpu_caps & HAS_NEON)
-#endif
-    {
-        vp8_push_neon(store_reg);
-    }
-#endif
-
     vpx_usec_timer_start(&cmptimer);
 
     cpi->source = NULL;
@@ -5005,14 +4962,6 @@
 
 #endif
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-        if (cm->cpu_caps & HAS_NEON)
-#endif
-        {
-            vp8_pop_neon(store_reg);
-        }
-#endif
         return -1;
     }
 
@@ -5278,7 +5227,7 @@
                 int y_samples = orig->y_height * orig->y_width ;
                 int uv_samples = orig->uv_height * orig->uv_width ;
                 int t_samples = y_samples + 2 * uv_samples;
-                double sq_error, sq_error2;
+                double sq_error;
 
                 ye = calc_plane_error(orig->y_buffer, orig->y_stride,
                   recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height);
@@ -5301,6 +5250,7 @@
 #if CONFIG_POSTPROC
                 {
                     YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
+                    double sq_error2;
                     double frame_psnr2, frame_ssim2 = 0;
                     double weight = 0;
 
@@ -5416,15 +5366,6 @@
 #endif
 #endif
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->cpu_caps & HAS_NEON)
-#endif
-    {
-        vp8_pop_neon(store_reg);
-    }
-#endif
-
     cpi->common.error.setjmp = 0;
 
     return 0;
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 513b2bf..4dc0d95 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -98,6 +98,7 @@
     unsigned int i, j, k;
     int modifier;
     int byte = 0;
+    const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
 
     for (i = 0,k = 0; i < block_size; i++)
     {
@@ -114,7 +115,7 @@
              */
             modifier  *= modifier;
             modifier  *= 3;
-            modifier  += 1 << (strength - 1);
+            modifier  += rounding;
             modifier >>= strength;
 
             if (modifier > 16)
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
deleted file mode 100644
index 7b1dc11..0000000
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_fast_quantize_b_ssse3 | arg
-;  (BLOCK  *b,                   |  0
-;   BLOCKD *d)                   |  1
-;
-
-global sym(vp8_fast_quantize_b_ssse3) PRIVATE
-sym(vp8_fast_quantize_b_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp8_block_coeff]
-    mov         rcx, [rdi + vp8_block_round]
-    mov         rdx, [rdi + vp8_block_quant_fast]
-
-    ; coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; round
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    pabsw       xmm1, xmm1
-    pabsw       xmm5, xmm5
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    ; quant_fast
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    mov         rax, [rsi + vp8_blockd_qcoeff]
-    mov         rdi, [rsi + vp8_blockd_dequant]
-    mov         rcx, [rsi + vp8_blockd_dqcoeff]
-
-    movdqa      xmm2, xmm1                  ;store y for getting eob
-    movdqa      xmm3, xmm5
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    movdqa      xmm0, [rdi]
-    movdqa      xmm4, [rdi + 16]
-
-    pmullw      xmm0, xmm1
-    pmullw      xmm4, xmm5
-    pxor        xmm1, xmm1
-
-    pcmpgtw     xmm2, xmm1                  ;calculate eob
-    pcmpgtw     xmm3, xmm1
-    packsswb    xmm2, xmm3
-    pshufb      xmm2, [GLOBAL(zz_shuf)]
-
-    pmovmskb    edx, xmm2
-
-    movdqa      [rcx], xmm0                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm4            ;store dqcoeff
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    bsr         eax, edx                    ;count 0
-    add         eax, 1
-
-    cmp         edx, 0                      ;if all 0, eob=0
-    cmove       eax, edx
-
-    mov         BYTE PTR [rcx], al          ;store eob
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/vp8/encoder/x86/quantize_ssse3.c b/vp8/encoder/x86/quantize_ssse3.c
new file mode 100644
index 0000000..448217f
--- /dev/null
+++ b/vp8/encoder/x86/quantize_ssse3.c
@@ -0,0 +1,114 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h> /* SSSE3 */
+
+#include "vp8/encoder/block.h"
+
+/* bitscan reverse (bsr) */
+#if defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+static int bsr(int mask) {
+  int eob;
+  _BitScanReverse(&eob, mask);
+  eob++;
+  if (mask == 0)
+    eob = 0;
+  return eob;
+}
+#else
+static int bsr(int mask) {
+  int eob;
+#if defined(__GNUC__) && __GNUC__
+  __asm__ __volatile__("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  asm volatile("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
+#endif
+  eob++;
+  if (mask == 0)
+    eob = 0;
+  return eob;
+}
+#endif
+
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
+  int eob, mask;
+
+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+  __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1;
+
+  DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) =
+    { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
+  __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask);
+
+  /* sign of z: z >> 15 */
+  sz0 = _mm_srai_epi16(z0, 15);
+  sz1 = _mm_srai_epi16(z1, 15);
+
+  /* x = abs(z) */
+  x0 = _mm_abs_epi16(z0);
+  x1 = _mm_abs_epi16(z1);
+
+  /* x += round */
+  x0 = _mm_add_epi16(x0, round0);
+  x1 = _mm_add_epi16(x1, round1);
+
+  /* y = (x * quant) >> 16 */
+  y0 = _mm_mulhi_epi16(x0, quant_fast0);
+  y1 = _mm_mulhi_epi16(x1, quant_fast1);
+
+  /* ASM saves Y for EOB */
+  /* I think we can ignore that because adding the sign doesn't change anything
+   * and multiplying 0 by dequant is OK as well */
+  abs0 = y0;
+  abs1 = y1;
+
+  /* Restore the sign bit. */
+  y0 = _mm_xor_si128(y0, sz0);
+  y1 = _mm_xor_si128(y1, sz1);
+  x0 = _mm_sub_epi16(y0, sz0);
+  x1 = _mm_sub_epi16(y1, sz1);
+
+  /* qcoeff = x */
+  _mm_store_si128((__m128i *)(d->qcoeff), x0);
+  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
+
+  /* x * dequant */
+  x0 = _mm_mullo_epi16(x0, dequant0);
+  x1 = _mm_mullo_epi16(x1, dequant1);
+
+  /* dqcoeff = x * dequant */
+  _mm_store_si128((__m128i *)(d->dqcoeff), x0);
+  _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1);
+
+  zeros = _mm_setzero_si128();
+
+  x0 = _mm_cmpgt_epi16(abs0, zeros);
+  x1 = _mm_cmpgt_epi16(abs1, zeros);
+
+  x = _mm_packs_epi16(x0, x1);
+
+  x = _mm_shuffle_epi8(x, zig_zag);
+
+  mask = _mm_movemask_epi8(x);
+
+  eob = bsr(mask);
+
+  *d->eob = 0xFF & eob;
+}
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index dfb54a5..e58cdad 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -129,7 +129,6 @@
 # common (c)
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/reconintra_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/variance_arm.c
 
@@ -159,27 +158,14 @@
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
 
 # common (neon)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/iwalsh_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/mbloopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/shortidct4x4llm_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sad8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sad16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict4x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/save_reg_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_blk_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/variance_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
+VP8_COMMON_SRCS-$(ARCH_NEON_ASM)  += common/arm/reconintra_arm.c
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/loopfilter_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
 
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c
@@ -187,6 +173,14 @@
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon.c
-
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/iwalsh_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/mbloopfilter_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sad_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/shortidct4x4llm_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_0_2x_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/variance_neon.c
 
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index d7c6dd1..607382b 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -88,6 +88,7 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
 
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
@@ -96,7 +97,6 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 398172a..5733048 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -35,11 +35,12 @@
 
 #File list for neon
 # encoder
-VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/shortfdct_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/subtract_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
+
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
-VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/subtract_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm b/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
index 5476400..ab5bb69 100644
--- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
@@ -9,7 +9,7 @@
 ;
 
     EXPORT  |vp9_idct8x8_64_add_neon|
-    EXPORT  |vp9_idct8x8_10_add_neon|
+    EXPORT  |vp9_idct8x8_12_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -310,13 +310,13 @@
     bx              lr
     ENDP  ; |vp9_idct8x8_64_add_neon|
 
-;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_idct8x8_10_add_neon| PROC
+|vp9_idct8x8_12_add_neon| PROC
     push            {r4-r9}
     vpush           {d8-d15}
     vld1.s16        {q8,q9}, [r0]!
@@ -514,6 +514,6 @@
     vpop            {d8-d15}
     pop             {r4-r9}
     bx              lr
-    ENDP  ; |vp9_idct8x8_10_add_neon|
+    ENDP  ; |vp9_idct8x8_12_add_neon|
 
     END
diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
index acccaea..fc44ffa 100644
--- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -617,7 +617,7 @@
   }
 }
 
-void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
+void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
   DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
   int16_t *outptr = out;
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 20b78bf..856d41e 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -421,7 +421,7 @@
   }
 }
 
-void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[8 * 8] = { 0 };
   int16_t *outptr = out;
   int i, j;
@@ -1348,8 +1348,8 @@
   if (eob == 1)
     // DC only DCT coefficient
     vp9_idct8x8_1_add(input, dest, stride);
-  else if (eob <= 10)
-    vp9_idct8x8_10_add(input, dest, stride);
+  else if (eob <= 12)
+    vp9_idct8x8_12_add(input, dest, stride);
   else
     vp9_idct8x8_64_add(input, dest, stride);
 }
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index ceca795..d868776 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -33,9 +33,6 @@
 #define pair_set_epi16(a, b) \
   _mm_set_epi16(b, a, b, a, b, a, b, a)
 
-#define pair_set_epi32(a, b) \
-  _mm_set_epi32(b, a, b, a)
-
 // Constants:
 //  for (int i = 1; i< 32; ++i)
 //    printf("static const int cospi_%d_64 = %.0f;\n", i,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 295dd78..7754763 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -12,7 +12,7 @@
 /* Encoder forward decls */
 struct macroblock;
 struct vp9_variance_vtable;
-
+struct search_site_config;
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
@@ -58,7 +58,8 @@
 specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_4x4 neon_asm dspr2/, "$ssse3_x86inc";
+$vp9_h_predictor_4x4_neon_asm=vp9_h_predictor_4x4_neon;
 
 add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_4x4/;
@@ -70,10 +71,12 @@
 specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc";
+specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc";
+$vp9_v_predictor_4x4_neon_asm=vp9_v_predictor_4x4_neon;
 
 add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc";
+specialize qw/vp9_tm_predictor_4x4 neon_asm dspr2/, "$sse_x86inc";
+$vp9_tm_predictor_4x4_neon_asm=vp9_tm_predictor_4x4_neon;
 
 add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc";
@@ -97,7 +100,8 @@
 specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_8x8 neon_asm dspr2/, "$ssse3_x86inc";
+$vp9_h_predictor_8x8_neon_asm=vp9_h_predictor_8x8_neon;
 
 add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_8x8/;
@@ -109,10 +113,12 @@
 specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc";
+specialize qw/vp9_v_predictor_8x8 neon_asm/, "$sse_x86inc";
+$vp9_v_predictor_8x8_neon_asm=vp9_v_predictor_8x8_neon;
 
 add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
+specialize qw/vp9_tm_predictor_8x8 neon_asm dspr2/, "$sse2_x86inc";
+$vp9_tm_predictor_8x8_neon_asm=vp9_tm_predictor_8x8_neon;
 
 add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
@@ -136,7 +142,8 @@
 specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_16x16 neon_asm dspr2/, "$ssse3_x86inc";
+$vp9_h_predictor_16x16_neon_asm=vp9_h_predictor_16x16_neon;
 
 add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_16x16/;
@@ -148,10 +155,12 @@
 specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_v_predictor_16x16 neon_asm/, "$sse2_x86inc";
+$vp9_v_predictor_16x16_neon_asm=vp9_v_predictor_16x16_neon;
 
 add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_tm_predictor_16x16 neon_asm/, "$sse2_x86inc";
+$vp9_tm_predictor_16x16_neon_asm=vp9_tm_predictor_16x16_neon;
 
 add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc";
@@ -175,7 +184,8 @@
 specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_32x32 neon_asm/, "$ssse3_x86inc";
+$vp9_h_predictor_32x32_neon_asm=vp9_h_predictor_32x32_neon;
 
 add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_32x32/;
@@ -187,10 +197,12 @@
 specialize qw/vp9_d153_predictor_32x32/;
 
 add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc";
+specialize qw/vp9_v_predictor_32x32 neon_asm/, "$sse2_x86inc";
+$vp9_v_predictor_32x32_neon_asm=vp9_v_predictor_32x32_neon;
 
 add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64";
+specialize qw/vp9_tm_predictor_32x32 neon_asm/, "$sse2_x86_64";
+$vp9_tm_predictor_32x32_neon_asm=vp9_tm_predictor_32x32_neon;
 
 add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc";
@@ -208,37 +220,48 @@
 # Loopfilter
 #
 add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vp9_lpf_vertical_16 sse2 neon dspr2/;
+specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2/;
+$vp9_lpf_vertical_16_neon_asm=vp9_lpf_vertical_16_neon;
 
 add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vp9_lpf_vertical_16_dual sse2 neon dspr2/;
+specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2/;
+$vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon;
 
 add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_8 sse2 neon dspr2/;
+specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/;
+$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon;
 
 add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_8_dual sse2 neon dspr2/;
+specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/;
+$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon;
 
 add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/;
+specialize qw/vp9_lpf_vertical_4 mmx neon_asm dspr2/;
+$vp9_lpf_vertical_4_neon_asm=vp9_lpf_vertical_4_neon;
 
 add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/;
+specialize qw/vp9_lpf_vertical_4_dual sse2 neon_asm dspr2/;
+$vp9_lpf_vertical_4_dual_neon_asm=vp9_lpf_vertical_4_dual_neon;
 
 add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon dspr2/;
+specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/;
+$vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon;
 
 add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_8 sse2 neon dspr2/;
+specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/;
+$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon;
 
 add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_8_dual sse2 neon dspr2/;
+specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/;
+$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon;
 
 add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/;
+specialize qw/vp9_lpf_horizontal_4 mmx neon_asm dspr2/;
+$vp9_lpf_horizontal_4_neon_asm=vp9_lpf_horizontal_4_neon;
 
 add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/;
+specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/;
+$vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon;
 
 #
 # post proc
@@ -274,71 +297,91 @@
 # Sub Pixel Filters
 #
 add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_copy neon dspr2/, "$sse2_x86inc";
+specialize qw/vp9_convolve_copy neon_asm dspr2/, "$sse2_x86inc";
+$vp9_convolve_copy_neon_asm=vp9_convolve_copy_neon;
 
 add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_avg neon dspr2/, "$sse2_x86inc";
+specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
+$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
 
 add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 avx2 neon dspr2/;
+specialize qw/vp9_convolve8 sse2 ssse3 avx2 neon_asm dspr2/;
+$vp9_convolve8_neon_asm=vp9_convolve8_neon;
 
 add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2/;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 avx2 neon_asm dspr2/;
+$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
 
 add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2/;
+specialize qw/vp9_convolve8_vert sse2 ssse3 avx2 neon_asm dspr2/;
+$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
 
 add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/;
+specialize qw/vp9_convolve8_avg sse2 ssse3 neon_asm dspr2/;
+$vp9_convolve8_avg_neon_asm=vp9_convolve8_avg_neon;
 
 add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2/;
+specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon_asm dspr2/;
+$vp9_convolve8_avg_horiz_neon_asm=vp9_convolve8_avg_horiz_neon;
 
 add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2/;
+specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon_asm dspr2/;
+$vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon;
 
 #
 # dct
 #
 add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_1_add sse2 neon dspr2/;
+specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
+$vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
 
 add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/;
+specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
+$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
 
 add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
+specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
+$vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
 
 add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/;
+specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+$vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
 
-add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/;
+add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+$vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
 
 add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
+specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
+$vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
 
 add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_256_add sse2 neon dspr2/;
+specialize qw/vp9_idct16x16_256_add sse2 neon_asm dspr2/;
+$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
 
 add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_10_add sse2 neon dspr2/;
+specialize qw/vp9_idct16x16_10_add sse2 neon_asm dspr2/;
+$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
 
 add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2/;
+specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
+$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
 
 add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_34_add sse2 neon dspr2/;
-$vp9_idct32x32_34_add_neon=vp9_idct32x32_1024_add_neon;
+specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
+$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
 
 add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/;
+specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
+$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
 
 add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/;
+specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
+$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
 
 add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/;
+specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
+$vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
 
 add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type";
 specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
@@ -379,10 +422,6 @@
 add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc";
 
-add_proto qw/void vp9_get_sse_sum_16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get_sse_sum_16x16 sse2/;
-$vp9_get_sse_sum_16x16_sse2=vp9_get16x16var_sse2;
-
 add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
 
@@ -392,10 +431,6 @@
 add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc";
 
-add_proto qw/void vp9_get_sse_sum_8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get_sse_sum_8x8 sse2/;
-$vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2;
-
 add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance8x4/, "$sse2_x86inc";
 
@@ -563,33 +598,6 @@
 add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad4x4_avg/, "$sse_x86inc";
 
-add_proto qw/unsigned int vp9_variance_halfpixvar16x16_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar16x16_h/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance_halfpixvar16x16_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar16x16_v/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance_halfpixvar16x16_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar16x16_hv/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance_halfpixvar64x64_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar64x64_h/;
-
-add_proto qw/unsigned int vp9_variance_halfpixvar64x64_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar64x64_v/;
-
-add_proto qw/unsigned int vp9_variance_halfpixvar64x64_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar64x64_hv/;
-
-add_proto qw/unsigned int vp9_variance_halfpixvar32x32_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar32x32_h/;
-
-add_proto qw/unsigned int vp9_variance_halfpixvar32x32_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar32x32_v/;
-
-add_proto qw/unsigned int vp9_variance_halfpixvar32x32_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar32x32_hv/;
-
 add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad64x64x3/;
 
@@ -728,13 +736,13 @@
 specialize qw/vp9_fht16x16 sse2 avx2/;
 
 add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fwht4x4/;
+specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
 
 add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fdct4x4 sse2 avx2/;
 
 add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8 sse2 avx2/;
+specialize qw/vp9_fdct8x8 sse2 avx2/, "$ssse3_x86_64";
 
 add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fdct16x16 sse2 avx2/;
@@ -757,11 +765,11 @@
 specialize qw/vp9_refining_search_sad sse3/;
 $vp9_refining_search_sad_sse3=vp9_refining_search_sadx4;
 
-add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
+add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
 specialize qw/vp9_diamond_search_sad sse3/;
 $vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4;
 
-add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
+add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
 specialize qw/vp9_full_range_search/;
 
 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
diff --git a/vp9/common/vp9_tapify.py b/vp9/common/vp9_tapify.py
deleted file mode 100644
index 99529cf..0000000
--- a/vp9/common/vp9_tapify.py
+++ /dev/null
@@ -1,106 +0,0 @@
-"""
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
-"""
-#!/usr/bin/env python
-import sys,string,os,re,math,numpy
-scale = 2**16
-def dist(p1,p2):
-  x1,y1 = p1
-  x2,y2 = p2
-  if x1==x2 and y1==y2 :
-    return 1.0 
-  return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2))
-
-def gettaps(p):
-  def l(b):
-    return int(math.floor(b))
-  def h(b):
-    return int(math.ceil(b))
-  def t(b,p,s):
-    return int((scale*dist(b,p)+s/2)/s)
-  r,c = p
-  ul=[l(r),l(c)]
-  ur=[l(r),h(c)]
-  ll=[h(r),l(c)]
-  lr=[h(r),h(c)]
-  sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p)
-  t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum);
-  return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)],
-          [ll,t(ll,p,sum)],[lr,t4]]
-
-def print_mb_taps(angle,blocksize):
-  theta = angle / 57.2957795;
-  affine = [[math.cos(theta),-math.sin(theta)],
-            [math.sin(theta),math.cos(theta)]]
-  radius = (float(blocksize)-1)/2
-  print " // angle of",angle,"degrees"
-  for y in range(blocksize) :
-    for x in range(blocksize) :
-      r,c = numpy.dot(affine,[y-radius, x-radius])
-      tps = gettaps([r+radius,c+radius])
-      for t in tps :
-        p,t = t
-        tr,tc = p
-        print " %2d, %2d, %5d, " % (tr,tc,t,),
-      print " // %2d,%2d " % (y,x)
-
-i=float(sys.argv[1])
-while  i <= float(sys.argv[2]) :
-  print_mb_taps(i,float(sys.argv[4]))
-  i=i+float(sys.argv[3])
-"""
-
-taps = []
-pt=dict()
-ptr=dict()
-for y in range(16) :
-  for x in range(16) :
-    r,c = numpy.dot(affine,[y-7.5, x-7.5])
-    tps = gettaps([r+7.5,c+7.5])
-    j=0
-    for tp in tps : 
-      p,i = tp
-      r,c = p
-      pt[y,x,j]= [p,i]
-      try: 
-        ptr[r,j,c].append([y,x])
-      except:
-        ptr[r,j,c]=[[y,x]]
-      j = j+1 
-
-for key in sorted(pt.keys()) :
-  print key,pt[key]
-
-lr = -99
-lj = -99 
-lc = 0
-
-shuf=""
-mask=""
-for r,j,c in sorted(ptr.keys()) :
-  for y,x in ptr[r,j,c] :
-    if lr != r or lj != j :
-      print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc
-      shuf=""
-      lc = 0
-    for i in range(lc,c-1) :
-      shuf = shuf +"0"
-    shuf = shuf + hex(x)[2]
-    lc =c
-    break
-  lr = r
-  lj = j
-#  print r,j,c,ptr[r,j,c]    
-#  print 
-
-for r,j,c in sorted(ptr.keys()) :
-  for y,x in ptr[r,j,c] :
-    print r,j,c,y,x 
-    break
-"""
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 13a5b5a..0231726 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -995,7 +995,7 @@
   RECON_AND_STORE(dest, in[7]);
 }
 
-void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
diff --git a/vp9/common/x86/vp9_idct_ssse3.asm b/vp9/common/x86/vp9_idct_ssse3.asm
new file mode 100644
index 0000000..2c10607
--- /dev/null
+++ b/vp9/common/x86/vp9_idct_ssse3.asm
@@ -0,0 +1,300 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides SSSE3 version of the inverse transformation. Part
+; of the functions are originally derived from the ffmpeg project.
+; Note that the current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192:    times 4 dd 8192
+pw_16:      times 8 dw 16
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
+pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
+%endmacro
+
+TRANSFORM_COEFFS    6270, 15137
+TRANSFORM_COEFFS    3196, 16069
+TRANSFORM_COEFFS   13623,  9102
+
+%macro PAIR_PP_COEFFS 2
+dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MP_COEFFS 2
+dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MM_COEFFS 2
+dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
+%endmacro
+
+PAIR_PP_COEFFS     30274, 12540
+PAIR_PP_COEFFS      6392, 32138
+PAIR_MP_COEFFS     18204, 27246
+
+PAIR_PP_COEFFS     12540, 12540
+PAIR_PP_COEFFS     30274, 30274
+PAIR_PP_COEFFS      6392,  6392
+PAIR_PP_COEFFS     32138, 32138
+PAIR_MM_COEFFS     18204, 18204
+PAIR_PP_COEFFS     27246, 27246
+
+SECTION .text
+
+%if ARCH_X86_64
+%macro SUM_SUB 3
+  psubw  m%3, m%1, m%2
+  paddw  m%1, m%2
+  SWAP    %2, %3
+%endmacro
+
+; butterfly operation
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+  pmaddwd            m%1, m%3, %5
+  pmaddwd            m%2, m%3, %6
+  paddd              m%1,  %4
+  paddd              m%2,  %4
+  psrad              m%1,  14
+  psrad              m%2,  14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
+  punpcklwd          m%2, m%1
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
+  packssdw           m%1, m%7
+  packssdw           m%2, m%6
+%endmacro
+
+; matrix transpose
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+%macro IDCT8_1D 0
+  SUM_SUB          0,    4,    9
+  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10
+  pmulhrsw        m0,  m12
+  pmulhrsw        m4,  m12
+  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10
+  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10
+
+  SUM_SUB          1,    5,    9
+  SUM_SUB          7,    3,    9
+  SUM_SUB          0,    6,    9
+  SUM_SUB          4,    2,    9
+  SUM_SUB          3,    5,    9
+  pmulhrsw        m3,  m12
+  pmulhrsw        m5,  m12
+
+  SUM_SUB          0,    7,    9
+  SUM_SUB          4,    3,    9
+  SUM_SUB          2,    5,    9
+  SUM_SUB          6,    1,    9
+
+  SWAP             3,    6
+  SWAP             1,    4
+%endmacro
+
+; This macro handles 8 pixels per line
+%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero
+  paddw           m%1, m11
+  paddw           m%2, m11
+  psraw           m%1, 5
+  psraw           m%2, 5
+
+  movh            m%3, [outputq]
+  movh            m%4, [outputq + strideq]
+  punpcklbw       m%3, m%5
+  punpcklbw       m%4, m%5
+  paddw           m%3, m%1
+  paddw           m%4, m%2
+  packuswb        m%3, m%5
+  packuswb        m%4, m%5
+  movh               [outputq], m%3
+  movh     [outputq + strideq], m%4
+%endmacro
+
+INIT_XMM ssse3
+; full inverse 8x8 2D-DCT transform
+cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
+  mova     m8, [pd_8192]
+  mova    m11, [pw_16]
+  mova    m12, [pw_11585x2]
+
+  lea      r3, [2 * strideq]
+
+  mova     m0, [inputq +   0]
+  mova     m1, [inputq +  16]
+  mova     m2, [inputq +  32]
+  mova     m3, [inputq +  48]
+  mova     m4, [inputq +  64]
+  mova     m5, [inputq +  80]
+  mova     m6, [inputq +  96]
+  mova     m7, [inputq + 112]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+  IDCT8_1D
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+  IDCT8_1D
+
+  pxor    m12, m12
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+
+; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
+cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
+  mova       m8, [pd_8192]
+  mova      m11, [pw_16]
+  mova      m12, [pw_11585x2]
+
+  lea        r3, [2 * strideq]
+
+  mova       m0, [inputq +  0]
+  mova       m1, [inputq + 16]
+  mova       m2, [inputq + 32]
+  mova       m3, [inputq + 48]
+
+  punpcklwd  m0, m1
+  punpcklwd  m2, m3
+  punpckhdq  m9, m0, m2
+  punpckldq  m0, m2
+  SWAP       2, 9
+
+  ; m0 -> [0], [0]
+  ; m1 -> [1], [1]
+  ; m2 -> [2], [2]
+  ; m3 -> [3], [3]
+  punpckhqdq m10, m0, m0
+  punpcklqdq m0,  m0
+  punpckhqdq m9,  m2, m2
+  punpcklqdq m2,  m2
+  SWAP       1, 10
+  SWAP       3,  9
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m2, [dpw_30274_12540]
+  pmulhrsw   m1, [dpw_6392_32138]
+  pmulhrsw   m3, [dpw_m18204_27246]
+
+  SUM_SUB    0, 2, 9
+  SUM_SUB    1, 3, 9
+
+  punpcklqdq m9, m3, m3
+  punpckhqdq m5, m3, m9
+
+  SUM_SUB    3, 5, 9
+  punpckhqdq m5, m3
+  pmulhrsw   m5, m12
+
+  punpckhqdq m9, m1, m5
+  punpcklqdq m1, m5
+  SWAP       5, 9
+
+  SUM_SUB    0, 5, 9
+  SUM_SUB    2, 1, 9
+
+  punpckhqdq m3, m0, m0
+  punpckhqdq m4, m1, m1
+  punpckhqdq m6, m5, m5
+  punpckhqdq m7, m2, m2
+
+  punpcklwd  m0, m3
+  punpcklwd  m7, m2
+  punpcklwd  m1, m4
+  punpcklwd  m6, m5
+
+  punpckhdq  m4, m0, m7
+  punpckldq  m0, m7
+  punpckhdq  m10, m1, m6
+  punpckldq  m5, m1, m6
+
+  punpckhqdq m1, m0, m5
+  punpcklqdq m0, m5
+  punpckhqdq m3, m4, m10
+  punpcklqdq m2, m4, m10
+
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m6, m2, [dpw_30274_30274]
+  pmulhrsw   m4, m2, [dpw_12540_12540]
+
+  pmulhrsw   m7, m1, [dpw_32138_32138]
+  pmulhrsw   m1, [dpw_6392_6392]
+  pmulhrsw   m5, m3, [dpw_m18204_m18204]
+  pmulhrsw   m3, [dpw_27246_27246]
+
+  mova       m2, m0
+  SUM_SUB    0, 6, 9
+  SUM_SUB    2, 4, 9
+  SUM_SUB    1, 5, 9
+  SUM_SUB    7, 3, 9
+
+  SUM_SUB    3, 5, 9
+  pmulhrsw   m3, m12
+  pmulhrsw   m5, m12
+
+  SUM_SUB    0, 7, 9
+  SUM_SUB    2, 3, 9
+  SUM_SUB    4, 5, 9
+  SUM_SUB    6, 1, 9
+
+  SWAP       3, 6
+  SWAP       1, 2
+  SWAP       2, 4
+
+
+  pxor    m12, m12
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+
+%endif
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 1cc7ab7..9dd0c44 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -677,7 +677,7 @@
 
 static void decode_tile(VP9Decoder *pbi, const TileInfo *const tile,
                         vp9_reader *r) {
-  const int num_threads = pbi->oxcf.max_threads;
+  const int num_threads = pbi->max_threads;
   VP9_COMMON *const cm = &pbi->common;
   int mi_row, mi_col;
   MACROBLOCKD *xd = &pbi->mb;
@@ -828,8 +828,7 @@
   // Decode tiles using data from tile_buffers
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      const int col = pbi->oxcf.inv_tile_order ? tile_cols - tile_col - 1
-                                               : tile_col;
+      const int col = pbi->inv_tile_order ? tile_cols - tile_col - 1 : tile_col;
       const int last_tile = tile_row == tile_rows - 1 &&
                                  col == tile_cols - 1;
       const TileBuffer *const buf = &tile_buffers[tile_row][col];
@@ -887,7 +886,7 @@
   const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
-  const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols);
+  const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
   TileBuffer tile_buffers[1 << 6];
   int n;
   int final_worker = -1;
@@ -899,7 +898,7 @@
   // TODO(jzern): See if we can remove the restriction of passing in max
   // threads to the decoder.
   if (pbi->num_tile_workers == 0) {
-    const int num_threads = pbi->oxcf.max_threads & ~1;
+    const int num_threads = pbi->max_threads & ~1;
     int i;
     // TODO(jzern): Allocate one less worker, as in the current code we only
     // use num_threads - 1 workers.
@@ -1328,7 +1327,7 @@
     CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
                     vpx_memalign(32, sizeof(LFWorkerData)));
     pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
-    if (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
+    if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
       vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                          "Loop filter thread creation failed");
     }
@@ -1353,7 +1352,7 @@
 
   // TODO(jzern): remove frame_parallel_decoding_mode restriction for
   // single-frame tile decoding.
-  if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
+  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
       cm->frame_parallel_decoding_mode) {
     *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
   } else {
@@ -1370,15 +1369,17 @@
                          "A stream must start with a complete key frame");
   }
 
-  if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
-    vp9_adapt_coef_probs(cm);
+  if (!new_fb->corrupted) {
+    if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
+      vp9_adapt_coef_probs(cm);
 
-    if (!frame_is_intra_only(cm)) {
-      vp9_adapt_mode_probs(cm);
-      vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+      if (!frame_is_intra_only(cm)) {
+        vp9_adapt_mode_probs(cm);
+        vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+      }
+    } else {
+      debug_check_frame_counts(cm);
     }
-  } else {
-    debug_check_frame_counts(cm);
   }
 
   if (cm->refresh_frame_context)
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index faf710c..8f72aa4 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -32,74 +32,6 @@
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/decoder/vp9_dthread.h"
 
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER == 1
-static void recon_write_yuv_frame(const char *name,
-                                  const YV12_BUFFER_CONFIG *s,
-                                  int w, int _h) {
-  FILE *yuv_file = fopen(name, "ab");
-  const uint8_t *src = s->y_buffer;
-  int h = _h;
-
-  do {
-    fwrite(src, w, 1,  yuv_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = (_h + 1) >> 1;
-  w = (w + 1) >> 1;
-
-  do {
-    fwrite(src, w, 1,  yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = (_h + 1) >> 1;
-
-  do {
-    fwrite(src, w, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  fclose(yuv_file);
-}
-#endif
-#if WRITE_RECON_BUFFER == 2
-void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-  // write the frame
-  FILE *yframe;
-  int i;
-  char filename[255];
-
-  snprintf(filename, sizeof(filename)-1, "dx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename)-1, "dx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename)-1, "dx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-}
-#endif
-
 void vp9_initialize_dec() {
   static int init_done = 0;
 
@@ -110,7 +42,7 @@
   }
 }
 
-VP9Decoder *vp9_decoder_create(const VP9DecoderConfig *oxcf) {
+VP9Decoder *vp9_decoder_create() {
   VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi));
   VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
 
@@ -134,7 +66,6 @@
   vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
 
   cm->current_video_frame = 0;
-  pbi->oxcf = *oxcf;
   pbi->ready_for_new_data = 1;
   pbi->decoded_key_frame = 0;
 
@@ -348,15 +279,6 @@
 
   swap_frame_buffers(pbi);
 
-#if WRITE_RECON_BUFFER == 2
-  if (cm->show_frame)
-    write_dx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame);
-  else
-    write_dx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 1000);
-#endif
-
   if (!pbi->do_loopfilter_inline) {
     // If multiple threads are used to decode tiles, then we use those threads
     // to do parallel loopfiltering.
@@ -367,21 +289,6 @@
     }
   }
 
-#if WRITE_RECON_BUFFER == 2
-  if (cm->show_frame)
-    write_dx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 2000);
-  else
-    write_dx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 3000);
-#endif
-
-#if WRITE_RECON_BUFFER == 1
-  if (cm->show_frame)
-    recon_write_yuv_frame("recon.yuv", cm->frame_to_show,
-                          cm->width, cm->height);
-#endif
-
   vp9_clear_system_state();
 
   cm->last_width = cm->width;
@@ -423,10 +330,6 @@
   ret = vp9_post_proc_frame(&pbi->common, sd, flags);
 #else
     *sd = *pbi->common.frame_to_show;
-    sd->y_width = pbi->common.width;
-    sd->y_height = pbi->common.height;
-    sd->uv_width = sd->y_width >> pbi->common.subsampling_x;
-    sd->uv_height = sd->y_height >> pbi->common.subsampling_y;
     ret = 0;
 #endif /*!CONFIG_POSTPROC*/
   vp9_clear_system_state();
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index ebcbb90..66b0f15 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -27,21 +27,11 @@
 extern "C" {
 #endif
 
-typedef struct VP9DecoderConfig {
-  int width;
-  int height;
-  int version;
-  int max_threads;
-  int inv_tile_order;
-} VP9DecoderConfig;
-
 typedef struct VP9Decoder {
   DECLARE_ALIGNED(16, MACROBLOCKD, mb);
 
   DECLARE_ALIGNED(16, VP9_COMMON, common);
 
-  VP9DecoderConfig oxcf;
-
   int64_t last_time_stamp;
   int ready_for_new_data;
 
@@ -59,6 +49,9 @@
 
   vpx_decrypt_cb decrypt_cb;
   void *decrypt_state;
+
+  int max_threads;
+  int inv_tile_order;
 } VP9Decoder;
 
 void vp9_initialize_dec();
@@ -83,8 +76,7 @@
 int vp9_get_reference_dec(struct VP9Decoder *pbi,
                           int index, YV12_BUFFER_CONFIG **fb);
 
-
-struct VP9Decoder *vp9_decoder_create(const VP9DecoderConfig *oxcf);
+struct VP9Decoder *vp9_decoder_create();
 
 void vp9_decoder_remove(struct VP9Decoder *pbi);
 
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index 9098063..019b0ff 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -140,7 +140,7 @@
   // Number of superblock rows and cols
   const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
   const int tile_cols = 1 << cm->log2_tile_cols;
-  const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols);
+  const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
   int i;
 
   // Allocate memory used in thread synchronization.
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index c5a85c9..35d2ecf 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1006,9 +1006,10 @@
     found = cm->width == cfg->y_crop_width &&
             cm->height == cfg->y_crop_height;
 
-    // TODO(ivan): This prevents a bug while more than 3 buffers are used. Do it
-    // in a better way.
-    if (cpi->use_svc) {
+    // Set "found" to 0 for temporal svc and for spatial svc key frame
+    if (cpi->use_svc &&
+        (cpi->svc.number_spatial_layers == 1 ||
+         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame)) {
       found = 0;
     }
     vp9_wb_write_bit(wb, found);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index fcf2a04..2ccf4f8 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -20,12 +20,6 @@
 extern "C" {
 #endif
 
-// motion search site
-typedef struct {
-  MV mv;
-  int offset;
-} search_site;
-
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
   MODE_INFO mic;
@@ -108,10 +102,6 @@
   int skip_optimize;
   int q_index;
 
-  search_site *ss;
-  int ss_count;
-  int searches_per_step;
-
   int errorperbit;
   int sadperbit16;
   int sadperbit4;
@@ -139,12 +129,6 @@
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
-  int mbmode_cost[INTRA_MODES];
-  unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
-  int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
-  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
-  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
   int mv_col_min;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 2e44f7d..fba9465 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -70,6 +70,18 @@
   128, 128, 128, 128, 128, 128, 128, 128
 };
 
+static void get_sse_sum_8x8(const uint8_t *src, int src_stride,
+                            const uint8_t *ref, int ref_stride,
+                            unsigned int *sse, int *sum) {
+  variance(src, src_stride, ref, ref_stride, 8, 8, sse, sum);
+}
+
+static void get_sse_sum_16x16(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse, int *sum) {
+  variance(src, src_stride, ref, ref_stride, 16, 16, sse, sum);
+}
+
 static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
                                               const struct buf_2d *ref,
                                               BLOCK_SIZE bs) {
@@ -242,7 +254,6 @@
 }
 
 static void set_block_size(VP9_COMP * const cpi,
-                           const TileInfo *const tile,
                            int mi_row, int mi_col,
                            BLOCK_SIZE bsize) {
   if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
@@ -365,11 +376,9 @@
 
 static int set_vt_partitioning(VP9_COMP *cpi,
                                void *data,
-                               const TileInfo *const tile,
                                BLOCK_SIZE bsize,
                                int mi_row,
-                               int mi_col,
-                               int mi_size) {
+                               int mi_col) {
   VP9_COMMON * const cm = &cpi->common;
   variance_node vt;
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
@@ -386,7 +395,7 @@
   if (mi_col + block_width / 2 < cm->mi_cols &&
       mi_row + block_height / 2 < cm->mi_rows &&
       vt.part_variances->none.variance < threshold) {
-    set_block_size(cpi, tile, mi_row, mi_col, bsize);
+    set_block_size(cpi, mi_row, mi_col, bsize);
     return 1;
   }
 
@@ -395,8 +404,8 @@
       vt.part_variances->vert[0].variance < threshold &&
       vt.part_variances->vert[1].variance < threshold) {
     BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
-    set_block_size(cpi, tile, mi_row, mi_col, subsize);
-    set_block_size(cpi, tile, mi_row, mi_col + block_width / 2, subsize);
+    set_block_size(cpi, mi_row, mi_col, subsize);
+    set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize);
     return 1;
   }
 
@@ -405,8 +414,8 @@
       vt.part_variances->horz[0].variance < threshold &&
       vt.part_variances->horz[1].variance < threshold) {
     BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
-    set_block_size(cpi, tile, mi_row, mi_col, subsize);
-    set_block_size(cpi, tile, mi_row + block_height / 2, mi_col, subsize);
+    set_block_size(cpi, mi_row, mi_col, subsize);
+    set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize);
     return 1;
   }
   return 0;
@@ -475,8 +484,8 @@
         unsigned int sse = 0;
         int sum = 0;
         if (x_idx < pixels_wide && y_idx < pixels_high)
-          vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
-                              d + y_idx * dp + x_idx, dp, &sse, &sum);
+          get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
+                          d + y_idx * dp + x_idx, dp, &sse, &sum);
         fill_variance(sse, sum, 64, &vst->split[k].part_variances.none);
       }
     }
@@ -493,13 +502,13 @@
   // Now go through the entire structure,  splitting every block size until
   // we get to one that's got a variance lower than our threshold,  or we
   // hit 8x8.
-  if (!set_vt_partitioning(cpi, &vt, tile, BLOCK_64X64,
-                           mi_row, mi_col, 8)) {
+  if (!set_vt_partitioning(cpi, &vt, BLOCK_64X64,
+                           mi_row, mi_col)) {
     for (i = 0; i < 4; ++i) {
       const int x32_idx = ((i & 1) << 2);
       const int y32_idx = ((i >> 1) << 2);
-      if (!set_vt_partitioning(cpi, &vt.split[i], tile, BLOCK_32X32,
-                               (mi_row + y32_idx), (mi_col + x32_idx), 4)) {
+      if (!set_vt_partitioning(cpi, &vt.split[i], BLOCK_32X32,
+                               (mi_row + y32_idx), (mi_col + x32_idx))) {
         for (j = 0; j < 4; ++j) {
           const int x16_idx = ((j & 1) << 1);
           const int y16_idx = ((j >> 1) << 1);
@@ -509,7 +518,7 @@
 #ifdef DISABLE_8X8_VAR_BASED_PARTITION
           if (mi_row + y32_idx + y16_idx + 1 < cm->mi_rows &&
               mi_row + x32_idx + x16_idx + 1 < cm->mi_cols) {
-            set_block_size(cpi, tile,
+            set_block_size(cpi,
                            (mi_row + y32_idx + y16_idx),
                            (mi_col + x32_idx + x16_idx),
                            BLOCK_16X16);
@@ -517,7 +526,7 @@
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
-              set_block_size(cpi, tile,
+              set_block_size(cpi,
                              (mi_row + y32_idx + y16_idx + y8_idx),
                              (mi_col + x32_idx + x16_idx + x8_idx),
                              BLOCK_8X8);
@@ -531,7 +540,7 @@
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
-              set_block_size(cpi, tile,
+              set_block_size(cpi,
                              (mi_row + y32_idx + y16_idx + y8_idx),
                              (mi_col + x32_idx + x16_idx + x8_idx),
                              BLOCK_8X8);
@@ -560,11 +569,6 @@
   return act < (8 << 12) ? MIN(act, 5 << 12) : act;
 }
 
-// Stub for alternative experimental activity measures.
-static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) {
-  return vp9_encode_intra(x, use_dc_pred);
-}
-
 static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
                          int mi_row, int mi_col, BLOCK_SIZE bsize,
                          int output_enabled) {
@@ -1216,10 +1220,9 @@
         int b_offset = b_mi_row * MI_SIZE * src_stride +
                        b_mi_col * MI_SIZE;
 
-        vp9_get_sse_sum_16x16(src + b_offset,
-                              src_stride,
-                              pre_src + b_offset,
-                              pre_stride, &d16[j].sse, &d16[j].sum);
+        get_sse_sum_16x16(src + b_offset, src_stride,
+                          pre_src + b_offset, pre_stride,
+                          &d16[j].sse, &d16[j].sum);
 
         d16[j].var = d16[j].sse -
             (((uint32_t)d16[j].sum * d16[j].sum) >> 8);
@@ -1450,8 +1453,7 @@
                              MODE_INFO **mi_8x8,
                              TOKENEXTRA **tp, int mi_row, int mi_col,
                              BLOCK_SIZE bsize, int *rate, int64_t *dist,
-                             int do_recon, PC_TREE *pc_tree,
-                             int block) {
+                             int do_recon, PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1618,7 +1620,7 @@
 
         rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
-                         i != 3, pc_tree->split[i], i);
+                         i != 3, pc_tree->split[i]);
         if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
           last_part_dist = INT64_MAX;
@@ -1865,6 +1867,67 @@
   *max_block_size = max_size;
 }
 
+static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+                                 int mi_row, int mi_col,
+                                 BLOCK_SIZE *min_block_size,
+                                 BLOCK_SIZE *max_block_size) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MODE_INFO **mi_8x8 = xd->mi;
+  const int left_in_image = xd->left_available && mi_8x8[-1];
+  const int above_in_image = xd->up_available &&
+                             mi_8x8[-xd->mi_stride];
+  int row8x8_remaining = tile->mi_row_end - mi_row;
+  int col8x8_remaining = tile->mi_col_end - mi_col;
+  int bh, bw;
+  BLOCK_SIZE min_size = BLOCK_32X32;
+  BLOCK_SIZE max_size = BLOCK_8X8;
+  int bsl = mi_width_log2_lookup[BLOCK_64X64];
+  int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
+                           cpi->sf.chessboard_index) & 0x01;
+  // Trap case where we do not have a prediction.
+  if (search_range_ctrl &&
+      (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
+    int block;
+    MODE_INFO **mi;
+    BLOCK_SIZE sb_type;
+
+    // Find the min and max partition sizes used in the left SB64.
+    if (left_in_image) {
+      MODE_INFO *cur_mi;
+      mi = &mi_8x8[-1];
+      for (block = 0; block < MI_BLOCK_SIZE; ++block) {
+        cur_mi = mi[block * xd->mi_stride];
+        sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0;
+        min_size = MIN(min_size, sb_type);
+        max_size = MAX(max_size, sb_type);
+      }
+    }
+    // Find the min and max partition sizes used in the above SB64.
+    if (above_in_image) {
+      mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE];
+      for (block = 0; block < MI_BLOCK_SIZE; ++block) {
+        sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0;
+        min_size = MIN(min_size, sb_type);
+        max_size = MAX(max_size, sb_type);
+      }
+    }
+
+    min_size = min_partition_size[min_size];
+    max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining,
+                                   &bh, &bw);
+    min_size = MIN(min_size, max_size);
+    min_size = MAX(min_size, BLOCK_8X8);
+    max_size = MIN(max_size, BLOCK_32X32);
+  } else {
+    min_size = BLOCK_8X8;
+    max_size = BLOCK_32X32;
+  }
+
+  *min_block_size = min_size;
+  *max_block_size = max_size;
+}
+
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
   vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
 }
@@ -1880,7 +1943,7 @@
                               TOKENEXTRA **tp, int mi_row,
                               int mi_col, BLOCK_SIZE bsize, int *rate,
                               int64_t *dist, int do_recon, int64_t best_rd,
-                              PC_TREE *pc_tree, int block) {
+                              PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2032,7 +2095,7 @@
 
         rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
                           subsize, &this_rate, &this_dist, i != 3,
-                          best_rd - sum_rd, pc_tree->split[i], i);
+                          best_rd - sum_rd, pc_tree->split[i]);
 
         if (this_rate == INT_MAX) {
           sum_rd = INT64_MAX;
@@ -2241,18 +2304,18 @@
         set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
                                sf->always_this_block_size);
         rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1, x->pc_root, 0);
+                         &dummy_rate, &dummy_dist, 1, x->pc_root);
       } else if (sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
         BLOCK_SIZE bsize;
         set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
         bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
         set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
         rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1, x->pc_root, 0);
+                         &dummy_rate, &dummy_dist, 1, x->pc_root);
       } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
         choose_partitioning(cpi, tile, mi_row, mi_col);
         rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1, x->pc_root, 0);
+                         &dummy_rate, &dummy_dist, 1, x->pc_root);
       } else {
         if ((cm->current_video_frame
             % sf->last_partitioning_redo_frequency) == 0
@@ -2271,8 +2334,7 @@
                                     &sf->max_partition_size);
           }
           rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                            &dummy_rate, &dummy_dist, 1, INT64_MAX, x->pc_root,
-                            0);
+                            &dummy_rate, &dummy_dist, 1, INT64_MAX, x->pc_root);
         } else {
           if (sf->constrain_copy_partition &&
               sb_has_motion(cm, prev_mi_8x8))
@@ -2281,7 +2343,7 @@
           else
             copy_partitioning(cm, mi_8x8, prev_mi_8x8);
           rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                           &dummy_rate, &dummy_dist, 1, x->pc_root, 0);
+                           &dummy_rate, &dummy_dist, 1, x->pc_root);
         }
       }
     } else {
@@ -2293,7 +2355,7 @@
                                 &sf->max_partition_size);
       }
       rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                        &dummy_rate, &dummy_dist, 1, INT64_MAX, x->pc_root, 0);
+                        &dummy_rate, &dummy_dist, 1, INT64_MAX, x->pc_root);
     }
   }
 }
@@ -2617,9 +2679,7 @@
 
       if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
-
       load_pred_mv(x, ctx);
-
       nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
                            subsize, &this_rate, &this_dist, 0,
                            best_rd - sum_rd, pc_tree->split[i]);
@@ -2909,6 +2969,10 @@
       case REFERENCE_PARTITION:
         if (cpi->sf.partition_check ||
             !is_background(cpi, tile, mi_row, mi_col)) {
+          set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+          auto_partition_range(cpi, tile, mi_row, mi_col,
+                               &cpi->sf.min_partition_size,
+                               &cpi->sf.max_partition_size);
           nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
                                &dummy_rate, &dummy_dist, 1, INT64_MAX,
                                x->pc_root);
@@ -3057,6 +3121,23 @@
 #endif
 }
 
+static INTERP_FILTER get_interp_filter(
+    const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) {
+  if (!is_alt_ref &&
+      threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP] &&
+      threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP_SHARP] &&
+      threshes[EIGHTTAP_SMOOTH] > threshes[SWITCHABLE - 1]) {
+    return EIGHTTAP_SMOOTH;
+  } else if (threshes[EIGHTTAP_SHARP] > threshes[EIGHTTAP] &&
+             threshes[EIGHTTAP_SHARP] > threshes[SWITCHABLE - 1]) {
+    return EIGHTTAP_SHARP;
+  } else if (threshes[EIGHTTAP] > threshes[SWITCHABLE - 1]) {
+    return EIGHTTAP;
+  } else {
+    return SWITCHABLE;
+  }
+}
+
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RD_OPT *const rd_opt = &cpi->rd;
@@ -3092,59 +3173,41 @@
     // that for subsequent frames.
     // It does the same analysis for transform size selection also.
     const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
-    const int64_t *mode_thresh = rd_opt->prediction_type_threshes[frame_type];
-    const int64_t *filter_thresh = rd_opt->filter_threshes[frame_type];
+    int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
+    int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type];
+    int *const tx_thrs = rd_opt->tx_select_threshes[frame_type];
+    const int is_alt_ref = frame_type == ALTREF_FRAME;
 
     /* prediction (compound, single or hybrid) mode selection */
-    if (frame_type == ALTREF_FRAME || !cm->allow_comp_inter_inter)
+    if (is_alt_ref || !cm->allow_comp_inter_inter)
       cm->reference_mode = SINGLE_REFERENCE;
-    else if (mode_thresh[COMPOUND_REFERENCE] > mode_thresh[SINGLE_REFERENCE] &&
-             mode_thresh[COMPOUND_REFERENCE] >
-                 mode_thresh[REFERENCE_MODE_SELECT] &&
+    else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
+             mode_thrs[COMPOUND_REFERENCE] >
+                 mode_thrs[REFERENCE_MODE_SELECT] &&
              check_dual_ref_flags(cpi) &&
              cpi->static_mb_pct == 100)
       cm->reference_mode = COMPOUND_REFERENCE;
-    else if (mode_thresh[SINGLE_REFERENCE] > mode_thresh[REFERENCE_MODE_SELECT])
+    else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT])
       cm->reference_mode = SINGLE_REFERENCE;
     else
       cm->reference_mode = REFERENCE_MODE_SELECT;
 
-    if (cm->interp_filter == SWITCHABLE) {
-      if (frame_type != ALTREF_FRAME &&
-          filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[EIGHTTAP] &&
-          filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[EIGHTTAP_SHARP] &&
-          filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[SWITCHABLE - 1]) {
-        cm->interp_filter = EIGHTTAP_SMOOTH;
-      } else if (filter_thresh[EIGHTTAP_SHARP] > filter_thresh[EIGHTTAP] &&
-          filter_thresh[EIGHTTAP_SHARP] > filter_thresh[SWITCHABLE - 1]) {
-        cm->interp_filter = EIGHTTAP_SHARP;
-      } else if (filter_thresh[EIGHTTAP] > filter_thresh[SWITCHABLE - 1]) {
-        cm->interp_filter = EIGHTTAP;
-      }
-    }
+    if (cm->interp_filter == SWITCHABLE)
+      cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
 
     encode_frame_internal(cpi);
 
-    for (i = 0; i < REFERENCE_MODES; ++i) {
-      const int diff = (int) (rd_opt->comp_pred_diff[i] / cm->MBs);
-      rd_opt->prediction_type_threshes[frame_type][i] += diff;
-      rd_opt->prediction_type_threshes[frame_type][i] >>= 1;
-    }
+    for (i = 0; i < REFERENCE_MODES; ++i)
+      mode_thrs[i] = (mode_thrs[i] + rd_opt->comp_pred_diff[i] / cm->MBs) / 2;
 
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-      const int64_t diff = rd_opt->filter_diff[i] / cm->MBs;
-      rd_opt->filter_threshes[frame_type][i] =
-          (rd_opt->filter_threshes[frame_type][i] + diff) / 2;
-    }
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+      filter_thrs[i] = (filter_thrs[i] + rd_opt->filter_diff[i] / cm->MBs) / 2;
 
     for (i = 0; i < TX_MODES; ++i) {
       int64_t pd = rd_opt->tx_select_diff[i];
-      int diff;
       if (i == TX_MODE_SELECT)
         pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZES - 1), 0);
-      diff = (int) (pd / cm->MBs);
-      rd_opt->tx_select_threshes[frame_type][i] += diff;
-      rd_opt->tx_select_threshes[frame_type][i] /= 2;
+      tx_thrs[i] = (tx_thrs[i] + (int)(pd / cm->MBs)) / 2;
     }
 
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index baa4665..3b231b7 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -99,7 +99,7 @@
 }
 
 static int optimize_b(MACROBLOCK *mb, int plane, int block,
-                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int ctx) {
+                      TX_SIZE tx_size, int ctx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *const p = &mb->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -381,7 +381,7 @@
 
   if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
     const int ctx = combine_entropy_contexts(*a, *l);
-    *a = *l = optimize_b(x, plane, block, plane_bsize, tx_size, ctx) > 0;
+    *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
   } else {
     *a = *l = p->eobs[block] > 0;
   }
@@ -601,15 +601,3 @@
   vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra,
                                          &arg);
 }
-
-int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) {
-  MB_MODE_INFO * mbmi = &x->e_mbd.mi[0]->mbmi;
-  x->skip_encode = 0;
-  mbmi->mode = DC_PRED;
-  mbmi->ref_frame[0] = INTRA_FRAME;
-  mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
-                                                                 : TX_8X8)
-                                   : TX_4X4;
-  vp9_encode_intra_block_plane(x, mbmi->sb_type, 0);
-  return vp9_get_mb_ss(x->plane[0].src_diff);
-}
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index edef1e2..8021459 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -34,8 +34,6 @@
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
-int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 6816f55..5d38447 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -115,22 +115,6 @@
   }
 }
 
-static void setup_key_frame(VP9_COMP *cpi) {
-  vp9_setup_past_independence(&cpi->common);
-
-  // All buffers are implicitly updated on key frames.
-  cpi->refresh_golden_frame = 1;
-  cpi->refresh_alt_ref_frame = 1;
-}
-
-static void setup_inter_frame(VP9_COMMON *cm) {
-  if (cm->error_resilient_mode || cm->intra_only)
-    vp9_setup_past_independence(cm);
-
-  assert(cm->frame_context_idx < FRAME_CONTEXTS);
-  cm->fc = cm->frame_contexts[cm->frame_context_idx];
-}
-
 static void setup_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   // Set up entropy context depending on frame type. The decoder mandates
@@ -138,17 +122,21 @@
   // frames where the error_resilient_mode or intra_only flag is set. For
   // other inter-frames the encoder currently uses only two contexts;
   // context 1 for ALTREF frames and context 0 for the others.
-  if (cm->frame_type == KEY_FRAME) {
-    setup_key_frame(cpi);
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+    vp9_setup_past_independence(cm);
   } else {
-    if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc)
-        cm->frame_context_idx = cpi->refresh_alt_ref_frame;
-     setup_inter_frame(cm);
+    if (!cpi->use_svc)
+      cm->frame_context_idx = cpi->refresh_alt_ref_frame;
+  }
+
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->refresh_golden_frame = 1;
+    cpi->refresh_alt_ref_frame = 1;
+  } else {
+    cm->fc = cm->frame_contexts[cm->frame_context_idx];
   }
 }
 
-
-
 void vp9_initialize_enc() {
   static int init_done = 0;
 
@@ -162,6 +150,7 @@
     vp9_rc_init_minq_luts();
     vp9_entropy_mv_init();
     vp9_entropy_mode_init();
+    vp9_temporal_filter_init();
     init_done = 1;
   }
 }
@@ -501,9 +490,9 @@
     int y_stride = cpi->scaled_source.y_stride;
 
     if (cpi->sf.search_method == NSTEP) {
-      vp9_init3smotion_compensation(&cpi->mb, y_stride);
+      vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
     } else if (cpi->sf.search_method == DIAMOND) {
-      vp9_init_dsmotion_compensation(&cpi->mb, y_stride);
+      vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
     }
   }
 
@@ -536,7 +525,6 @@
 
 static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
-  int i;
 
   cpi->oxcf = *oxcf;
 
@@ -571,10 +559,6 @@
   cpi->alt_fb_idx = 2;
 
   set_tile_limits(cpi);
-
-  cpi->fixed_divide[0] = 0;
-  for (i = 1; i < 512; i++)
-    cpi->fixed_divide[i] = 0x80000 / i;
 }
 
 static int get_pass(MODE mode) {
@@ -765,7 +749,7 @@
 
 
 VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
-  int i, j;
+  unsigned int i, j;
   VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP));
   VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL;
 
@@ -782,9 +766,6 @@
 
   cm->error.setjmp = 1;
 
-  CHECK_MEM_ERROR(cm, cpi->mb.ss, vpx_calloc(sizeof(search_site),
-                                             (MAX_MVSEARCH_STEPS * 8) + 1));
-
   vp9_rtcd();
 
   cpi->use_svc = 0;
@@ -973,95 +954,73 @@
       cpi->rd.thresh_freq_fact[i][j] = 32;
   }
 
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
-            SDX3F, SDX8F, SDX4DF)\
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
     cpi->fn_ptr[BT].sdaf           = SDAF; \
     cpi->fn_ptr[BT].vf             = VF; \
     cpi->fn_ptr[BT].svf            = SVF; \
     cpi->fn_ptr[BT].svaf           = SVAF; \
-    cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
-    cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
-    cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
     cpi->fn_ptr[BT].sdx3f          = SDX3F; \
     cpi->fn_ptr[BT].sdx8f          = SDX8F; \
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
   BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
       vp9_variance32x16, vp9_sub_pixel_variance32x16,
-      vp9_sub_pixel_avg_variance32x16, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad32x16x4d)
+      vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d)
 
   BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
       vp9_variance16x32, vp9_sub_pixel_variance16x32,
-      vp9_sub_pixel_avg_variance16x32, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad16x32x4d)
+      vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d)
 
   BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
       vp9_variance64x32, vp9_sub_pixel_variance64x32,
-      vp9_sub_pixel_avg_variance64x32, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad64x32x4d)
+      vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d)
 
   BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
       vp9_variance32x64, vp9_sub_pixel_variance32x64,
-      vp9_sub_pixel_avg_variance32x64, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad32x64x4d)
+      vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d)
 
   BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
       vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
-      vp9_variance_halfpixvar32x32_v,
-      vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
+      vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
 
   BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
       vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
-      vp9_variance_halfpixvar64x64_v,
-      vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
+      vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8,
       vp9_sad64x64x4d)
 
   BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
       vp9_variance16x16, vp9_sub_pixel_variance16x16,
-      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
-      vp9_variance_halfpixvar16x16_v,
-      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+      vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8,
       vp9_sad16x16x4d)
 
   BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
       vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
+      vp9_sub_pixel_avg_variance16x8,
       vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
 
   BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
       vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
+      vp9_sub_pixel_avg_variance8x16,
       vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
 
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
       vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
+      vp9_sub_pixel_avg_variance8x8,
       vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
   BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
       vp9_variance8x4, vp9_sub_pixel_variance8x4,
-      vp9_sub_pixel_avg_variance8x4, NULL, NULL,
-      NULL, NULL, vp9_sad8x4x8,
-      vp9_sad8x4x4d)
+      vp9_sub_pixel_avg_variance8x4, NULL, vp9_sad8x4x8, vp9_sad8x4x4d)
 
   BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
       vp9_variance4x8, vp9_sub_pixel_variance4x8,
-      vp9_sub_pixel_avg_variance4x8, NULL, NULL,
-      NULL, NULL, vp9_sad4x8x8,
-      vp9_sad4x8x4d)
+      vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d)
 
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
       vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
+      vp9_sub_pixel_avg_variance4x4,
       vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
   cpi->full_search_sad = vp9_full_search_sad;
@@ -1079,15 +1038,11 @@
 
   cm->error.setjmp = 0;
 
-#ifdef MODE_TEST_HIT_STATS
-  vp9_zero(cpi->mode_test_hits);
-#endif
-
   return cpi;
 }
 
 void vp9_remove_compressor(VP9_COMP *cpi) {
-  int i;
+  unsigned int i;
 
   if (!cpi)
     return;
@@ -1141,34 +1096,6 @@
 
 #endif
 
-#ifdef MODE_TEST_HIT_STATS
-    if (cpi->pass != 1) {
-      double norm_per_pixel_mode_tests = 0;
-      double norm_counts[BLOCK_SIZES];
-      int i;
-      int sb64_per_frame;
-      int norm_factors[BLOCK_SIZES] =
-        {256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1};
-      FILE *f = fopen("mode_hit_stats.stt", "a");
-
-      // On average, how many mode tests do we do
-      for (i = 0; i < BLOCK_SIZES; ++i) {
-        norm_counts[i] = (double)cpi->mode_test_hits[i] /
-                         (double)norm_factors[i];
-        norm_per_pixel_mode_tests += norm_counts[i];
-      }
-      // Convert to a number per 64x64 and per frame
-      sb64_per_frame = ((cpi->common.height + 63) / 64) *
-                       ((cpi->common.width + 63) / 64);
-      norm_per_pixel_mode_tests =
-        norm_per_pixel_mode_tests /
-        (double)(cpi->common.current_video_frame * sb64_per_frame);
-
-      fprintf(f, "%6.4f\n", norm_per_pixel_mode_tests);
-      fclose(f);
-    }
-#endif
-
 #if 0
     {
       printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
@@ -1182,7 +1109,6 @@
   }
 
   dealloc_compressor_data(cpi);
-  vpx_free(cpi->mb.ss);
   vpx_free(cpi->tok);
 
   for (i = 0; i < sizeof(cpi->mbgraph_stats) /
@@ -1444,77 +1370,67 @@
 }
 #endif
 
-static void scale_and_extend_frame_nonnormative(YV12_BUFFER_CONFIG *src_fb,
-                                                YV12_BUFFER_CONFIG *dst_fb) {
-  const int in_w = src_fb->y_crop_width;
-  const int in_h = src_fb->y_crop_height;
-  const int out_w = dst_fb->y_crop_width;
-  const int out_h = dst_fb->y_crop_height;
-  const int in_w_uv = src_fb->uv_crop_width;
-  const int in_h_uv = src_fb->uv_crop_height;
-  const int out_w_uv = dst_fb->uv_crop_width;
-  const int out_h_uv = dst_fb->uv_crop_height;
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                                YV12_BUFFER_CONFIG *dst) {
+  // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
   int i;
+  const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                  src->alpha_buffer};
+  const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                              src->alpha_stride};
+  const int src_widths[4] = {src->y_crop_width, src->uv_crop_width,
+                             src->uv_crop_width, src->y_crop_width};
+  const int src_heights[4] = {src->y_crop_height, src->uv_crop_height,
+                              src->uv_crop_height, src->y_crop_height};
+  uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer,
+                            dst->alpha_buffer};
+  const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
+                              dst->alpha_stride};
+  const int dst_widths[4] = {dst->y_crop_width, dst->uv_crop_width,
+                             dst->uv_crop_width, dst->y_crop_width};
+  const int dst_heights[4] = {dst->y_crop_height, dst->uv_crop_height,
+                              dst->uv_crop_height, dst->y_crop_height};
 
-  uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer,
-    src_fb->alpha_buffer};
-  int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride,
-    src_fb->alpha_stride};
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+                     dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
 
-  uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer,
-    dst_fb->alpha_buffer};
-  int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride,
-    dst_fb->alpha_stride};
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    if (i == 0 || i == 3) {
-      // Y and alpha planes
-      vp9_resize_plane(srcs[i], in_h, in_w, src_strides[i],
-                       dsts[i], out_h, out_w, dst_strides[i]);
-    } else {
-      // Chroma planes
-      vp9_resize_plane(srcs[i], in_h_uv, in_w_uv, src_strides[i],
-                       dsts[i], out_h_uv, out_w_uv, dst_strides[i]);
-    }
-  }
   // TODO(hkuang): Call C version explicitly
   // as neon version only expand border size 32.
-  vp8_yv12_extend_frame_borders_c(dst_fb);
+  vp8_yv12_extend_frame_borders_c(dst);
 }
 
-static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
-                                   YV12_BUFFER_CONFIG *dst_fb) {
-  const int in_w = src_fb->y_crop_width;
-  const int in_h = src_fb->y_crop_height;
-  const int out_w = dst_fb->y_crop_width;
-  const int out_h = dst_fb->y_crop_height;
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                  src->alpha_buffer};
+  const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                              src->alpha_stride};
+  uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer,
+                            dst->alpha_buffer};
+  const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
+                              dst->alpha_stride};
   int x, y, i;
 
-  uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer,
-                      src_fb->alpha_buffer};
-  int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride,
-                        src_fb->alpha_stride};
-
-  uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer,
-                      dst_fb->alpha_buffer};
-  int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride,
-                        dst_fb->alpha_stride};
-
-  for (y = 0; y < out_h; y += 16) {
-    for (x = 0; x < out_w; x += 16) {
+  for (y = 0; y < dst_h; y += 16) {
+    for (x = 0; x < dst_w; x += 16) {
       for (i = 0; i < MAX_MB_PLANE; ++i) {
         const int factor = (i == 0 || i == 3 ? 1 : 2);
-        const int x_q4 = x * (16 / factor) * in_w / out_w;
-        const int y_q4 = y * (16 / factor) * in_h / out_h;
+        const int x_q4 = x * (16 / factor) * src_w / dst_w;
+        const int y_q4 = y * (16 / factor) * src_h / dst_h;
         const int src_stride = src_strides[i];
         const int dst_stride = dst_strides[i];
-        uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride +
-                                 x / factor * in_w / out_w;
-        uint8_t *dst = dsts[i] + y / factor * dst_stride + x / factor;
+        const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h *
+                                     src_stride + (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
 
-        vp9_convolve8(src, src_stride, dst, dst_stride,
-                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+        vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * src_w / dst_w,
+                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * src_h / dst_h,
                       16 / factor, 16 / factor);
       }
     }
@@ -1522,7 +1438,7 @@
 
   // TODO(hkuang): Call C version explicitly
   // as neon version only expand border size 32.
-  vp8_yv12_extend_frame_borders_c(dst_fb);
+  vp8_yv12_extend_frame_borders_c(dst);
 }
 
 static int find_fp_qindex() {
@@ -1701,7 +1617,7 @@
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
-    YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
+    const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
 
     if (ref->y_crop_width != cm->width ||
         ref->y_crop_height != cm->height) {
@@ -2133,7 +2049,7 @@
   cm->lf.mode_ref_delta_update = 0;
 
   // Initialize cpi->mv_step_param to default based on max resolution.
-  cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
+  cpi->mv_step_param = vp9_init_search_range(sf, max_mv_def);
   // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
   if (sf->auto_mv_step_size) {
     if (frame_is_intra_only(cm)) {
@@ -2145,7 +2061,7 @@
         // Allow mv_steps to correspond to twice the max mv magnitude found
         // in the previous frame, capped by the default max_mv_magnitude based
         // on resolution.
-        cpi->mv_step_param = vp9_init_search_range(cpi, MIN(max_mv_def, 2 *
+        cpi->mv_step_param = vp9_init_search_range(sf, MIN(max_mv_def, 2 *
                                  cpi->max_mv_magnitude));
       cpi->max_mv_magnitude = 0;
     }
@@ -2308,9 +2224,6 @@
     }
   }
 
-#if 0
-  output_frame_level_debug_stats(cpi);
-#endif
   if (cpi->refresh_golden_frame == 1)
     cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
   else
@@ -2326,6 +2239,10 @@
   cm->last_frame_type = cm->frame_type;
   vp9_rc_postencode_update(cpi, *size);
 
+#if 0
+  output_frame_level_debug_stats(cpi);
+#endif
+
   if (cm->frame_type == KEY_FRAME) {
     // Tell the caller that the frame was coded as a key frame
     *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 8f32494..f48909e 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -40,8 +40,6 @@
 extern "C" {
 #endif
 
-// #define MODE_TEST_HIT_STATS
-
 #define DEFAULT_GF_INTERVAL         10
 
 #define MAX_MODES 30
@@ -276,6 +274,7 @@
 
   int arnr_max_frames;
   int arnr_strength;
+  int arnr_type;
 
   int tile_columns;
   int tile_rows;
@@ -443,7 +442,6 @@
 
   YV12_BUFFER_CONFIG alt_ref_buffer;
   YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
-  int fixed_divide[512];
 
 #if CONFIG_INTERNAL_STATS
   unsigned int mode_chosen_counts[MAX_MODES];
@@ -497,6 +495,14 @@
 
   int frame_flags;
 
+  search_site_config ss_cfg;
+
+  int mbmode_cost[INTRA_MODES];
+  unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+  int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
+  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+
 #if CONFIG_MULTIPLE_ARF
   // ARF tracking variables.
   int multi_arf_enabled;
@@ -509,11 +515,6 @@
   int this_frame_weight;
   int max_arf_level;
 #endif
-
-#ifdef MODE_TEST_HIT_STATS
-  // Debug / test stats
-  int64_t mode_test_hits[BLOCK_SIZES];
-#endif
 } VP9_COMP;
 
 void vp9_initialize_enc();
@@ -592,7 +593,8 @@
 // alt ref frames tend to be coded at a higher than ambient quality
 static INLINE int frame_is_boosted(const VP9_COMP *cpi) {
   return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
-         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) ||
+         vp9_is_upper_layer_key_frame(cpi);
 }
 
 static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 3d4b962..43ae7c1 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -61,7 +61,7 @@
 #define MIN_GF_INTERVAL             4
 #endif
 
-#define LONG_TERM_VBR_CORRECTION
+// #define LONG_TERM_VBR_CORRECTION
 
 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
   YV12_BUFFER_CONFIG temp = *a;
@@ -418,7 +418,7 @@
   v_fn_ptr.vf = get_block_variance_fn(bsize);
 
   // Center the initial step/diamond search on best mv.
-  tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
+  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
                                     step_param,
                                     x->sadperbit16, &num00, &v_fn_ptr, ref_mv);
   if (tmp_err < INT_MAX)
@@ -441,7 +441,7 @@
     if (num00) {
       --num00;
     } else {
-      tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
+      tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
                                         step_param + n, x->sadperbit16,
                                         &num00, &v_fn_ptr, ref_mv);
       if (tmp_err < INT_MAX)
@@ -604,7 +604,13 @@
       }
 
       // Do intra 16x16 prediction.
-      this_error = vp9_encode_intra(x, use_dc_pred);
+      x->skip_encode = 0;
+      xd->mi[0]->mbmi.mode = DC_PRED;
+      xd->mi[0]->mbmi.tx_size = use_dc_pred ?
+         (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+      vp9_encode_intra_block_plane(x, bsize, 0);
+      this_error = vp9_get_mb_ss(x->plane[0].src_diff);
+
       if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
         vp9_clear_system_state();
         this_error = (int)(this_error * error_weight);
@@ -920,12 +926,19 @@
     const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
                                             BPER_MB_NORMBITS) / num_mbs;
     int q;
+    int is_svc_upper_layer = 0;
+    if (cpi->use_svc && cpi->svc.number_temporal_layers == 1 &&
+        cpi->svc.spatial_layer_id > 0) {
+      is_svc_upper_layer = 1;
+    }
 
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
-      const double factor = calc_correction_factor(err_per_mb, ERR_DIVISOR,
-                                                   0.5, 0.90, q);
+      const double factor =
+          calc_correction_factor(err_per_mb, ERR_DIVISOR,
+                                 is_svc_upper_layer ? 0.8 : 0.5,
+                                 is_svc_upper_layer ? 1.0 : 0.90, q);
       const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q,
                                                  factor * speed_term);
       if (bits_per_mb <= target_norm_bits_per_mb)
@@ -1404,6 +1417,58 @@
 }
 #endif
 
+// Calculate a section intra ratio used in setting max loop filter.
+static void calculate_section_intra_ratio(struct twopass_rc *twopass,
+                                          const FIRSTPASS_STATS *start_pos,
+                                          int section_length) {
+  FIRSTPASS_STATS next_frame = { 0 };
+  FIRSTPASS_STATS sectionstats = { 0 };
+  int i;
+
+  reset_fpf_position(twopass, start_pos);
+
+  for (i = 0; i < section_length; ++i) {
+    input_stats(twopass, &next_frame);
+    accumulate_stats(&sectionstats, &next_frame);
+  }
+
+  avg_stats(&sectionstats);
+
+  twopass->section_intra_rating =
+    (int)(sectionstats.intra_error /
+          DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+
+  reset_fpf_position(twopass, start_pos);
+}
+
+// Calculate the total bits to allocate in this GF/ARF group.
+static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
+                                             double gf_group_err) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const struct twopass_rc *const twopass = &cpi->twopass;
+  const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+  int64_t total_group_bits;
+
+  // Calculate the bits to be allocated to the group as a whole.
+  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+    total_group_bits = (int64_t)(twopass->kf_group_bits *
+                                 (gf_group_err / twopass->kf_group_error_left));
+  } else {
+    total_group_bits = 0;
+  }
+
+  // Clamp odd edge cases.
+  total_group_bits = (total_group_bits < 0) ?
+     0 : (total_group_bits > twopass->kf_group_bits) ?
+     twopass->kf_group_bits : total_group_bits;
+
+  // Clip based on user supplied data rate variability limit.
+  if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
+    total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+
+  return total_group_bits;
+}
+
 // Analyse and define a gf/arf group.
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   RATE_CONTROL *const rc = &cpi->rc;
@@ -1429,8 +1494,6 @@
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
   double mv_ratio_accumulator_thresh;
-  // Max bits for a single frame.
-  const int max_bits = frame_max_bits(rc, oxcf);
   unsigned int allow_alt_ref = oxcf->play_alternate && oxcf->lag_in_frames;
 
   int f_boost = 0;
@@ -1645,21 +1708,8 @@
 #endif
 #endif
 
-  // Calculate the bits to be allocated to the group as a whole.
-  if (twopass->kf_group_bits > 0 && twopass->kf_group_error_left > 0) {
-    twopass->gf_group_bits = (int64_t)(twopass->kf_group_bits *
-                (gf_group_err / twopass->kf_group_error_left));
-  } else {
-    twopass->gf_group_bits = 0;
-  }
-  twopass->gf_group_bits = (twopass->gf_group_bits < 0) ?
-     0 : (twopass->gf_group_bits > twopass->kf_group_bits) ?
-     twopass->kf_group_bits : twopass->gf_group_bits;
-
-  // Clip cpi->twopass.gf_group_bits based on user supplied data rate
-  // variability limit, cpi->oxcf.two_pass_vbrmax_section.
-  if (twopass->gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
-    twopass->gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+  // Calculate the bits to be allocated to the gf/arf group as a whole
+  twopass->gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
 
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
@@ -1727,8 +1777,8 @@
       twopass->gf_bits = gf_bits;
     }
     if (i == 1 ||
-        (!rc->source_alt_ref_pending &&
-         cpi->common.frame_type != KEY_FRAME)) {
+        (!rc->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME &&
+         !vp9_is_upper_layer_key_frame(cpi))) {
       // Calculate the per frame bit target for this frame.
       vp9_rc_set_frame_target(cpi, gf_bits);
     }
@@ -1769,24 +1819,9 @@
     }
   }
 
+  // Calculate a section intra ratio used in setting max loop filter.
   if (cpi->common.frame_type != KEY_FRAME) {
-    FIRSTPASS_STATS sectionstats;
-
-    zero_stats(&sectionstats);
-    reset_fpf_position(twopass, start_pos);
-
-    for (i = 0; i < rc->baseline_gf_interval; ++i) {
-      input_stats(twopass, &next_frame);
-      accumulate_stats(&sectionstats, &next_frame);
-    }
-
-    avg_stats(&sectionstats);
-
-    twopass->section_intra_rating = (int)
-      (sectionstats.intra_error /
-      DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
-
-    reset_fpf_position(twopass, start_pos);
+    calculate_section_intra_ratio(twopass, start_pos, rc->baseline_gf_interval);
   }
 }
 
@@ -2082,25 +2117,8 @@
     }
   }
 
-  {
-    FIRSTPASS_STATS sectionstats;
-
-    zero_stats(&sectionstats);
-    reset_fpf_position(twopass, start_position);
-
-    for (i = 0; i < rc->frames_to_key; ++i) {
-      input_stats(twopass, &next_frame);
-      accumulate_stats(&sectionstats, &next_frame);
-    }
-
-    avg_stats(&sectionstats);
-
-    twopass->section_intra_rating = (int) (sectionstats.intra_error /
-        DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
-  }
-
-  // Reset the first pass file position.
-  reset_fpf_position(twopass, start_position);
+  // Calculate a section intra ratio used in setting max loop filter.
+  calculate_section_intra_ratio(twopass, start_position, rc->frames_to_key);
 
   // Work out how many bits to allocate for the key frame itself.
   if (1) {
@@ -2290,11 +2308,16 @@
     this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
     // Don't place key frame in any enhancement layers in spatial svc
-    if (cpi->use_svc && cpi->svc.number_temporal_layers == 1 &&
-        cpi->svc.spatial_layer_id > 0) {
-      cm->frame_type = INTER_FRAME;
+    if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
+      lc->is_key_frame = 1;
+      if (cpi->svc.spatial_layer_id > 0) {
+        cm->frame_type = INTER_FRAME;
+      }
     }
   } else {
+    if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
+      lc->is_key_frame = 0;
+    }
     cm->frame_type = INTER_FRAME;
   }
 
@@ -2384,17 +2407,19 @@
   const double progress =
       (double)(cpi->twopass.stats_in - cpi->twopass.stats_in_start) /
               (cpi->twopass.stats_in_end - cpi->twopass.stats_in_start);
-  const int bits_used = progress * cpi->rc.this_frame_target +
-                        (1.0 - progress) * cpi->rc.projected_frame_size;
+  const int bits_used = (int)(progress * rc->this_frame_target +
+                             (1.0 - progress) * rc->projected_frame_size);
 #endif
 
   cpi->twopass.bits_left -= bits_used;
   cpi->twopass.bits_left = MAX(cpi->twopass.bits_left, 0);
 
 #ifdef LONG_TERM_VBR_CORRECTION
-  if (cpi->common.frame_type != KEY_FRAME) {
+  if (cpi->common.frame_type != KEY_FRAME &&
+      !vp9_is_upper_layer_key_frame(cpi)) {
 #else
-  if (cpi->common.frame_type == KEY_FRAME) {
+  if (cpi->common.frame_type == KEY_FRAME ||
+      vp9_is_upper_layer_key_frame(cpi)) {
     // For key frames kf_group_bits already had the target bits subtracted out.
     // So now update to the correct value based on the actual bits used.
     cpi->twopass.kf_group_bits += cpi->rc.this_frame_target - bits_used;
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index a298f1c..abe71e6 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -18,6 +18,9 @@
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_lookahead.h"
 
+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
 struct lookahead_ctx {
   unsigned int max_sz;         /* Absolute size of the queue */
   unsigned int sz;             /* Number of buffers currently in the queue */
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 046c533..ff63c0d 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -20,9 +20,6 @@
 
 #define MAX_LAG_BUFFERS 25
 
-// The max of past frames we want to keep in the queue.
-#define MAX_PRE_FRAMES 1
-
 struct lookahead_entry {
   YV12_BUFFER_CONFIG  img;
   int64_t             ts_start;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 07a04cd..43c8ab8 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -51,7 +51,7 @@
     x->mv_row_max = row_max;
 }
 
-int vp9_init_search_range(VP9_COMP *cpi, int size) {
+int vp9_init_search_range(const SPEED_FEATURES *sf, int size) {
   int sr = 0;
 
   // Minimum search size no matter what the passed in value.
@@ -60,8 +60,8 @@
   while ((size << sr) < MAX_FULL_PEL_VAL)
     sr++;
 
-  sr += cpi->sf.reduce_first_step_size;
-  sr = MIN(sr, (cpi->sf.max_step_search_steps - 2));
+  sr += sf->reduce_first_step_size;
+  sr = MIN(sr, (sf->max_step_search_steps - 2));
   return sr;
 }
 
@@ -101,32 +101,32 @@
   return 0;
 }
 
-void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
+void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
   int len, ss_count = 1;
 
-  x->ss[0].mv.col = x->ss[0].mv.row = 0;
-  x->ss[0].offset = 0;
+  cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
+  cfg->ss[0].offset = 0;
 
   for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
     // Generate offsets for 4 search sites per step.
     const MV ss_mvs[] = {{-len, 0}, {len, 0}, {0, -len}, {0, len}};
     int i;
     for (i = 0; i < 4; ++i) {
-      search_site *const ss = &x->ss[ss_count++];
+      search_site *const ss = &cfg->ss[ss_count++];
       ss->mv = ss_mvs[i];
       ss->offset = ss->mv.row * stride + ss->mv.col;
     }
   }
 
-  x->ss_count = ss_count;
-  x->searches_per_step = 4;
+  cfg->ss_count = ss_count;
+  cfg->searches_per_step = 4;
 }
 
-void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
+void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
   int len, ss_count = 1;
 
-  x->ss[0].mv.col = x->ss[0].mv.row = 0;
-  x->ss[0].offset = 0;
+  cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
+  cfg->ss[0].offset = 0;
 
   for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
     // Generate offsets for 8 search sites per step.
@@ -136,14 +136,14 @@
     };
     int i;
     for (i = 0; i < 8; ++i) {
-      search_site *const ss = &x->ss[ss_count++];
+      search_site *const ss = &cfg->ss[ss_count++];
       ss->mv = ss_mvs[i];
       ss->offset = ss->mv.row * stride + ss->mv.col;
     }
   }
 
-  x->ss_count = ss_count;
-  x->searches_per_step = 8;
+  cfg->ss_count = ss_count;
+  cfg->searches_per_step = 8;
 }
 
 /*
@@ -871,97 +871,78 @@
 
 #undef CHECK_BETTER
 
-int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
+int vp9_full_range_search_c(const MACROBLOCK *x,
+                            const search_site_config *cfg,
+                            MV *ref_mv, MV *best_mv,
                             int search_param, int sad_per_bit, int *num00,
                             const vp9_variance_fn_ptr_t *fn_ptr,
                             const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *in_what;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-
-  unsigned int bestsad = INT_MAX;
-  int ref_row, ref_col;
-
-  unsigned int thissad;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const int range = 64;
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-  int tr, tc;
-  int best_tr = 0;
-  int best_tc = 0;
-  int range = 64;
-
-  int start_col, end_col;
-  int start_row, end_row;
-  int i;
+  unsigned int best_sad = INT_MAX;
+  int r, c, i;
+  int start_col, end_col, start_row, end_row;
 
   clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  ref_row = ref_mv->row;
-  ref_col = ref_mv->col;
+  *best_mv = *ref_mv;
   *num00 = 11;
-  best_mv->row = ref_row;
-  best_mv->col = ref_col;
+  best_sad = fn_ptr->sdf(what->buf, what->stride,
+                         get_buf_from_mv(in_what, ref_mv), in_what->stride,
+                         0x7fffffff) +
+                 mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  start_row = MAX(-range, x->mv_row_min - ref_mv->row);
+  start_col = MAX(-range, x->mv_col_min - ref_mv->col);
+  end_row = MIN(range, x->mv_row_max - ref_mv->row);
+  end_col = MIN(range, x->mv_col_max - ref_mv->col);
 
-  // Work out the start point for the search
-  in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+  for (r = start_row; r <= end_row; ++r) {
+    for (c = start_col; c <= end_col; c += 4) {
+      if (c + 3 <= end_col) {
+        unsigned int sads[4];
+        const uint8_t *addrs[4];
+        for (i = 0; i < 4; ++i) {
+          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
+          addrs[i] = get_buf_from_mv(in_what, &mv);
+        }
 
-  // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
-                + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
-
-  start_row = MAX(-range, x->mv_row_min - ref_row);
-  start_col = MAX(-range, x->mv_col_min - ref_col);
-  end_row = MIN(range, x->mv_row_max - ref_row);
-  end_col = MIN(range, x->mv_col_max - ref_col);
-
-  for (tr = start_row; tr <= end_row; ++tr) {
-    for (tc = start_col; tc <= end_col; tc += 4) {
-      if ((tc + 3) <= end_col) {
-        unsigned int sad_array[4];
-        unsigned char const *addr_ref[4];
-        for (i = 0; i < 4; ++i)
-          addr_ref[i] = in_what + tr * in_what_stride + tc + i;
-
-        fn_ptr->sdx4df(what, what_stride, addr_ref, in_what_stride, sad_array);
+        fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
 
         for (i = 0; i < 4; ++i) {
-          if (sad_array[i] < bestsad) {
-            const MV this_mv = {ref_row + tr, ref_col + tc + i};
-            thissad = sad_array[i] +
-                      mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-            if (thissad < bestsad) {
-              bestsad = thissad;
-              best_tr = tr;
-              best_tc = tc + i;
+          if (sads[i] < best_sad) {
+            const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
+            const unsigned int sad = sads[i] +
+                mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
             }
           }
         }
       } else {
-        for (i = 0; i < end_col - tc; ++i) {
-          const uint8_t *check_here = in_what + tr * in_what_stride + tc + i;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                                bestsad);
-
-          if (thissad < bestsad) {
-            const MV this_mv = {ref_row + tr, ref_col + tc + i};
-            thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-
-            if (thissad < bestsad) {
-              bestsad = thissad;
-              best_tr = tr;
-              best_tc = tc + i;
+        for (i = 0; i < end_col - c; ++i) {
+          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
+          unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+              get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
+          if (sad < best_sad) {
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
             }
           }
         }
       }
     }
   }
-  best_mv->row += best_tr;
-  best_mv->col += best_tc;
-  return bestsad;
+
+  return best_sad;
 }
 
 int vp9_diamond_search_sad_c(const MACROBLOCK *x,
+                             const search_site_config *cfg,
                              MV *ref_mv, MV *best_mv,
                              int search_param, int sad_per_bit, int *num00,
                              const vp9_variance_fn_ptr_t *fn_ptr,
@@ -973,8 +954,8 @@
   // of iterations
   // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
   // (MAX_FIRST_STEP/4) pel... etc.
-  const search_site *const ss = &x->ss[search_param * x->searches_per_step];
-  const int tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+  const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   const uint8_t *best_address, *in_what_ref;
   int best_sad = INT_MAX;
@@ -996,7 +977,7 @@
   i = 1;
 
   for (step = 0; step < tot_steps; step++) {
-    for (j = 0; j < x->searches_per_step; j++) {
+    for (j = 0; j < cfg->searches_per_step; j++) {
       const MV mv = {best_mv->row + ss[i].mv.row,
                      best_mv->col + ss[i].mv.col};
       if (is_mv_in(x, &mv)) {
@@ -1050,6 +1031,7 @@
 }
 
 int vp9_diamond_search_sadx4(const MACROBLOCK *x,
+                             const search_site_config *cfg,
                              MV *ref_mv, MV *best_mv, int search_param,
                              int sad_per_bit, int *num00,
                              const vp9_variance_fn_ptr_t *fn_ptr,
@@ -1075,8 +1057,8 @@
   // 0 = initial step (MAX_FIRST_STEP) pel
   // 1 = (MAX_FIRST_STEP/2) pel,
   // 2 = (MAX_FIRST_STEP/4) pel...
-  const search_site *ss = &x->ss[search_param * x->searches_per_step];
-  const int tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+  const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
 
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
@@ -1112,7 +1094,7 @@
     if (all_in) {
       unsigned int sad_array[4];
 
-      for (j = 0; j < x->searches_per_step; j += 4) {
+      for (j = 0; j < cfg->searches_per_step; j += 4) {
         unsigned char const *block_offset[4];
 
         for (t = 0; t < 4; t++)
@@ -1135,7 +1117,7 @@
         }
       }
     } else {
-      for (j = 0; j < x->searches_per_step; j++) {
+      for (j = 0; j < cfg->searches_per_step; j++) {
         // Trap illegal vectors
         const MV this_mv = {best_mv->row + ss[i].mv.row,
                             best_mv->col + ss[i].mv.col};
@@ -1202,7 +1184,7 @@
                            const MV *ref_mv, MV *dst_mv) {
   MV temp_mv;
   int thissme, n, num00 = 0;
-  int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
+  int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
                                         step_param, sadpb, &n,
                                         fn_ptr, ref_mv);
   if (bestsme < INT_MAX)
@@ -1220,7 +1202,7 @@
     if (num00) {
       num00--;
     } else {
-      thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
+      thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
                                         step_param + n, sadpb, &num00,
                                         fn_ptr, ref_mv);
       if (thissme < INT_MAX)
@@ -1355,108 +1337,89 @@
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
                           const MV *center_mv, MV *best_mv) {
+  int r;
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *const what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *const in_what = xd->plane[0].pre[0].buf;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  MV this_mv;
-  unsigned int bestsad = INT_MAX;
-  int r, c;
-  int ref_row = ref_mv->row;
-  int ref_col = ref_mv->col;
-
-  // Apply further limits to prevent us looking using vectors that stretch
-  // beyond the UMV border
-  const int row_min = MAX(ref_row - distance, x->mv_row_min);
-  const int row_max = MIN(ref_row + distance, x->mv_row_max);
-  const int col_min = MAX(ref_col - distance, x->mv_col_min);
-  const int col_max = MIN(ref_col + distance, x->mv_col_max);
-  DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
-  unsigned int sad_array[3];
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  *best_mv = *ref_mv;
 
-  // Work out the mid point for the search
-  const uint8_t *bestaddress = &in_what[ref_row * in_what_stride + ref_col];
+  for (r = row_min; r < row_max; ++r) {
+    int c = col_min;
+    const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
 
-  best_mv->row = ref_row;
-  best_mv->col = ref_col;
+    if (fn_ptr->sdx8f != NULL) {
+      while ((c + 7) < col_max) {
+        int i;
+        unsigned int sads[8];
 
-  // Baseline value at the center
-  bestsad = fn_ptr->sdf(what, what_stride,
-                        bestaddress, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+        fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
+                      sads);
 
-  for (r = row_min; r < row_max; r++) {
-    const uint8_t *check_here = &in_what[r * in_what_stride + col_min];
-    this_mv.row = r;
-    c = col_min;
-
-    while ((c + 7) < col_max) {
-      int i;
-
-      fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
-
-      for (i = 0; i < 8; i++) {
-        unsigned int thissad = (unsigned int)sad_array8[i];
-
-        if (thissad < bestsad) {
-          this_mv.col = c;
-          thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->row = r;
-            best_mv->col = c;
+        for (i = 0; i < 8; ++i) {
+          unsigned int sad = sads[i];
+          if (sad < best_sad) {
+            const MV mv = {r, c};
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
           }
+          ++check_here;
+          ++c;
         }
-
-        check_here++;
-        c++;
       }
     }
 
-    while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) {
-      int i;
+    if (fn_ptr->sdx3f != NULL) {
+      while ((c + 2) < col_max) {
+        int i;
+        unsigned int sads[3];
 
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+        fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
+                      sads);
 
-      for (i = 0; i < 3; i++) {
-        unsigned int thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.col = c;
-          thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->row = r;
-            best_mv->col = c;
+        for (i = 0; i < 3; ++i) {
+          unsigned int sad = sads[i];
+          if (sad < best_sad) {
+            const MV mv = {r, c};
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
           }
+          ++check_here;
+          ++c;
         }
-
-        check_here++;
-        c++;
       }
     }
 
     while (c < col_max) {
-      unsigned int thissad = fn_ptr->sdf(what, what_stride,
-                                         check_here, in_what_stride, bestsad);
-
-      if (thissad < bestsad) {
-        this_mv.col = c;
-        thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->row = r;
-          best_mv->col = c;
+      unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                                     check_here, in_what->stride, best_sad);
+      if (sad < best_sad) {
+        const MV mv = {r, c};
+        sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+        if (sad < best_sad) {
+          best_sad = sad;
+          *best_mv = mv;
         }
       }
-
-      check_here++;
-      c++;
+      ++check_here;
+      ++c;
     }
   }
-  return bestsad;
+
+  return best_sad;
 }
 
 int vp9_refining_search_sad_c(const MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 70d7985..827957d 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -31,6 +31,20 @@
 // for Block_16x16
 #define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND)
 
+// motion search site
+typedef struct search_site {
+  MV mv;
+  int offset;
+} search_site;
+
+typedef struct search_site_config {
+  search_site ss[8 * MAX_MVSEARCH_STEPS + 1];
+  int ss_count;
+  int searches_per_step;
+} search_site_config;
+
+void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride);
+void vp9_init3smotion_compensation(search_site_config *cfg,  int stride);
 
 void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv);
 int vp9_mv_bit_cost(const MV *mv, const MV *ref,
@@ -46,11 +60,11 @@
                           const uint8_t *second_pred,
                           const vp9_variance_fn_ptr_t *vfp,
                           int use_mvcost);
-void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
-void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);
 
 struct VP9_COMP;
-int vp9_init_search_range(struct VP9_COMP *cpi, int size);
+struct SPEED_FEATURES;
+
+int vp9_init_search_range(const struct SPEED_FEATURES *sf, int size);
 
 // Runs sequence of diamond searches in smaller steps for RD
 int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
@@ -119,6 +133,7 @@
                                         const MV *center_mv);
 
 typedef int (*vp9_diamond_search_fn_t)(const MACROBLOCK *x,
+                                       const search_site_config *cfg,
                                        MV *ref_mv, MV *best_mv,
                                        int search_param, int sad_per_bit,
                                        int *num00,
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 7c42bb8..a4b280c 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -24,8 +24,12 @@
 #include "vp9/encoder/vp9_quantize.h"
 
 static int get_max_filter_level(const VP9_COMP *cpi) {
-  return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
-                                               : MAX_LOOP_FILTER;
+  if (cpi->pass == 2) {
+    return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
+                                                 : MAX_LOOP_FILTER;
+  } else {
+    return MAX_LOOP_FILTER;
+  }
 }
 
 
@@ -77,8 +81,8 @@
     // Bias against raising loop filter in favor of lowering it.
     int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
-    if (cpi->twopass.section_intra_rating < 20)
-      bias = bias * cpi->twopass.section_intra_rating / 20;
+    if ((cpi->pass == 2) && (cpi->twopass.section_intra_rating < 20))
+      bias = (bias * cpi->twopass.section_intra_rating) / 20;
 
     // yx, bias less for large block size
     if (cm->tx_mode != ONLY_4X4)
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 56eb944..78fba73 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -27,7 +27,6 @@
 #include "vp9/encoder/vp9_rdopt.h"
 
 static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                    const TileInfo *const tile,
                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
                                     int_mv *tmp_mv, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
@@ -85,39 +84,9 @@
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  if (cpi->sf.search_method == FAST_DIAMOND) {
-    // NOTE: this returns SAD
-    vp9_fast_dia_search(x, &mvp_full, step_param, sadpb, 0,
-                        &cpi->fn_ptr[bsize], 1,
-                        &ref_mv, &tmp_mv->as_mv);
-  } else if (cpi->sf.search_method == FAST_HEX) {
-    // NOTE: this returns SAD
-    vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0,
-                        &cpi->fn_ptr[bsize], 1,
-                        &ref_mv, &tmp_mv->as_mv);
-  } else if (cpi->sf.search_method == HEX) {
-    // NOTE: this returns SAD
-    vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
-                   &cpi->fn_ptr[bsize], 1,
-                   &ref_mv, &tmp_mv->as_mv);
-  } else if (cpi->sf.search_method == SQUARE) {
-    // NOTE: this returns SAD
-    vp9_square_search(x, &mvp_full, step_param, sadpb, 1,
-                      &cpi->fn_ptr[bsize], 1,
-                      &ref_mv, &tmp_mv->as_mv);
-  } else if (cpi->sf.search_method == BIGDIA) {
-    // NOTE: this returns SAD
-    vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1,
-                      &cpi->fn_ptr[bsize], 1,
-                      &ref_mv, &tmp_mv->as_mv);
-  } else {
-    int further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-    // NOTE: this returns variance
-    vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                           sadpb, further_steps, 1,
-                           &cpi->fn_ptr[bsize],
-                           &ref_mv, &tmp_mv->as_mv);
-  }
+  full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv,
+                    &tmp_mv->as_mv, INT_MAX, 0);
+
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
   x->mv_row_min = tmp_row_min;
@@ -137,7 +106,6 @@
 }
 
 static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                    const TileInfo *const tile,
                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
                                     MV *tmp_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
@@ -191,10 +159,16 @@
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
 
-  int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
-                                  pd->dst.buf, pd->dst.stride, &sse);
+  unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
+                                           pd->dst.buf, pd->dst.stride, &sse);
 
-  vp9_model_rd_from_var_lapndz(sse + var, 1 << num_pels_log2_lookup[bsize],
+  // TODO(jingning) This is a temporary solution to account for frames with
+  // light changes. Need to customize the rate-distortion modeling for non-RD
+  // mode decision.
+  if ((sse >> 3) > var)
+    sse = var;
+
+  vp9_model_rd_from_var_lapndz(var + sse, 1 << num_pels_log2_lookup[bsize],
                                pd->dequant[1] >> 3, &rate, &dist);
   *out_rate_sum = rate;
   *out_dist_sum = dist << 3;
@@ -314,18 +288,18 @@
         if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
 
-        full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+        full_pixel_motion_search(cpi, x, bsize, mi_row, mi_col,
                                  &frame_mv[NEWMV][ref_frame], &rate_mv);
 
         if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
           continue;
 
-        rate_mode = x->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                      [INTER_OFFSET(this_mode)];
+        rate_mode = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+                                        [INTER_OFFSET(this_mode)];
         if (RDCOST(x->rdmult, x->rddiv, rate_mv + rate_mode, 0) > best_rd)
           continue;
 
-        sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+        sub_pixel_motion_search(cpi, x, bsize, mi_row, mi_col,
                                 &frame_mv[NEWMV][ref_frame].as_mv);
       }
 
@@ -355,24 +329,24 @@
         model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP],
                           &pf_dist[EIGHTTAP]);
         tmp_rdcost1 = RDCOST(x->rdmult, x->rddiv,
-                             vp9_get_switchable_rate(x) + pf_rate[EIGHTTAP],
+                             vp9_get_switchable_rate(cpi) + pf_rate[EIGHTTAP],
                              pf_dist[EIGHTTAP]);
 
         mbmi->interp_filter = EIGHTTAP_SHARP;
         vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
         model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP_SHARP],
                           &pf_dist[EIGHTTAP_SHARP]);
-        tmp_rdcost2 = RDCOST(x->rdmult, x->rddiv,
-                          vp9_get_switchable_rate(x) + pf_rate[EIGHTTAP_SHARP],
-                          pf_dist[EIGHTTAP_SHARP]);
+        tmp_rdcost2 = RDCOST(x->rdmult, x->rddiv, vp9_get_switchable_rate(cpi) +
+                                 pf_rate[EIGHTTAP_SHARP],
+                             pf_dist[EIGHTTAP_SHARP]);
 
         mbmi->interp_filter = EIGHTTAP_SMOOTH;
         vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
         model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP_SMOOTH],
                           &pf_dist[EIGHTTAP_SMOOTH]);
-        tmp_rdcost3 = RDCOST(x->rdmult, x->rddiv,
-                          vp9_get_switchable_rate(x) + pf_rate[EIGHTTAP_SMOOTH],
-                          pf_dist[EIGHTTAP_SMOOTH]);
+        tmp_rdcost3 = RDCOST(x->rdmult, x->rddiv, vp9_get_switchable_rate(cpi) +
+                                 pf_rate[EIGHTTAP_SMOOTH],
+                             pf_dist[EIGHTTAP_SMOOTH]);
 
         if (tmp_rdcost2 < tmp_rdcost1) {
           if (tmp_rdcost2 < tmp_rdcost3)
@@ -395,7 +369,7 @@
       }
 
       rate += rate_mv;
-      rate += x->inter_mode_cost[mbmi->mode_context[ref_frame]]
+      rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
                                 [INTER_OFFSET(this_mode)];
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
 
@@ -426,7 +400,7 @@
                               &pd->dst.buf[0], pd->dst.stride, 0, 0, 0);
 
       model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist);
-      rate += x->mbmode_cost[this_mode];
+      rate += cpi->mbmode_cost[this_mode];
       rate += intra_cost_penalty;
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 6ebd9f3..0f4c4da 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -40,14 +40,14 @@
 #define MIN_BPB_FACTOR 0.005
 #define MAX_BPB_FACTOR 50
 
+#define FRAME_OVERHEAD_BITS 200
+
 // Tables relating active max Q to active min Q
 static int kf_low_motion_minq[QINDEX_RANGE];
 static int kf_high_motion_minq[QINDEX_RANGE];
-static int gf_low_motion_minq[QINDEX_RANGE];
-static int gf_high_motion_minq[QINDEX_RANGE];
+static int arfgf_low_motion_minq[QINDEX_RANGE];
+static int arfgf_high_motion_minq[QINDEX_RANGE];
 static int inter_minq[QINDEX_RANGE];
-static int afq_low_motion_minq[QINDEX_RANGE];
-static int afq_high_motion_minq[QINDEX_RANGE];
 static int gf_high = 2000;
 static int gf_low = 400;
 static int kf_high = 5000;
@@ -79,13 +79,11 @@
 
   for (i = 0; i < QINDEX_RANGE; i++) {
     const double maxq = vp9_convert_qindex_to_q(i);
-    kf_low_motion_minq[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.15);
+    kf_low_motion_minq[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.125);
     kf_high_motion_minq[i] = get_minq_index(maxq, 0.000002, -0.0012, 0.50);
-    gf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.32);
-    gf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50);
-    afq_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.33);
-    afq_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55);
-    inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.75);
+    arfgf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30);
+    arfgf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50);
+    inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90);
   }
 }
 
@@ -546,7 +544,7 @@
     }
     active_best_quality = get_active_quality(
         q, rc->gfu_boost, gf_low, gf_high,
-        gf_low_motion_minq, gf_high_motion_minq);
+        arfgf_low_motion_minq, arfgf_high_motion_minq);
   } else {
     // Use the lower of active_worst_quality and recent/average Q.
     if (cm->current_video_frame > 1) {
@@ -674,17 +672,12 @@
     if (oxcf->rc_mode == RC_MODE_CONSTRAINED_QUALITY) {
       if (q < cq_level)
         q = cq_level;
-      if (rc->frames_since_key > 1) {
-        active_best_quality = get_active_quality(q, rc->gfu_boost,
-                                                 gf_low, gf_high,
-                                                 afq_low_motion_minq,
-                                                 afq_high_motion_minq);
-      } else {
-        active_best_quality = get_active_quality(q, rc->gfu_boost,
-                                                 gf_low, gf_high,
-                                                 gf_low_motion_minq,
-                                                 gf_high_motion_minq);
-      }
+
+      active_best_quality = get_active_quality(q, rc->gfu_boost,
+                                               gf_low, gf_high,
+                                               arfgf_low_motion_minq,
+                                               arfgf_high_motion_minq);
+
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
 
@@ -692,20 +685,14 @@
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cq_level;
       } else {
-        if (rc->frames_since_key > 1) {
-          active_best_quality = get_active_quality(
-              q, rc->gfu_boost, gf_low, gf_high,
-              afq_low_motion_minq, afq_high_motion_minq);
-        } else {
-          active_best_quality = get_active_quality(
-              q, rc->gfu_boost, gf_low, gf_high,
-              gf_low_motion_minq, gf_high_motion_minq);
-        }
+        active_best_quality = get_active_quality(
+            q, rc->gfu_boost, gf_low, gf_high,
+            arfgf_low_motion_minq, arfgf_high_motion_minq);
       }
     } else {
       active_best_quality = get_active_quality(
           q, rc->gfu_boost, gf_low, gf_high,
-          gf_low_motion_minq, gf_high_motion_minq);
+          arfgf_low_motion_minq, arfgf_high_motion_minq);
     }
   } else {
     if (oxcf->rc_mode == RC_MODE_CONSTANT_QUALITY) {
@@ -807,7 +794,7 @@
   int active_worst_quality = cpi->twopass.active_worst_quality;
   int q;
 
-  if (frame_is_intra_only(cm)) {
+  if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) {
 #if !CONFIG_MULTIPLE_ARF
     // Handle the special case for key frames forced when we have75 reached
     // the maximum key frame interval. Here force the Q to a range
@@ -865,17 +852,12 @@
     if (oxcf->rc_mode == RC_MODE_CONSTRAINED_QUALITY) {
       if (q < cq_level)
         q = cq_level;
-      if (rc->frames_since_key > 1) {
-        active_best_quality = get_active_quality(q, rc->gfu_boost,
-                                                 gf_low, gf_high,
-                                                 afq_low_motion_minq,
-                                                 afq_high_motion_minq);
-      } else {
-        active_best_quality = get_active_quality(q, rc->gfu_boost,
-                                                 gf_low, gf_high,
-                                                 gf_low_motion_minq,
-                                                 gf_high_motion_minq);
-      }
+
+      active_best_quality = get_active_quality(q, rc->gfu_boost,
+                                               gf_low, gf_high,
+                                               arfgf_low_motion_minq,
+                                               arfgf_high_motion_minq);
+
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
 
@@ -883,20 +865,14 @@
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cq_level;
       } else {
-        if (rc->frames_since_key > 1) {
-          active_best_quality = get_active_quality(
-              q, rc->gfu_boost, gf_low, gf_high,
-              afq_low_motion_minq, afq_high_motion_minq);
-        } else {
-          active_best_quality = get_active_quality(
-              q, rc->gfu_boost, gf_low, gf_high,
-              gf_low_motion_minq, gf_high_motion_minq);
-        }
+        active_best_quality = get_active_quality(
+            q, rc->gfu_boost, gf_low, gf_high,
+            arfgf_low_motion_minq, arfgf_high_motion_minq);
       }
     } else {
       active_best_quality = get_active_quality(
           q, rc->gfu_boost, gf_low, gf_high,
-          gf_low_motion_minq, gf_high_motion_minq);
+          arfgf_low_motion_minq, arfgf_high_motion_minq);
     }
   } else {
     if (oxcf->rc_mode == RC_MODE_CONSTANT_QUALITY) {
@@ -928,7 +904,8 @@
     vp9_clear_system_state();
 
     // Limit Q range for the adaptive loop.
-    if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) {
+    if ((cm->frame_type == KEY_FRAME || vp9_is_upper_layer_key_frame(cpi)) &&
+        !rc->this_key_frame_forced) {
       qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
                                           active_worst_quality, 2.0);
     } else if (!rc->is_src_frame_alt_ref &&
@@ -995,11 +972,7 @@
     q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
   }
 
-  // Q of 0 is disabled because we force tx size to be
-  // 16x16...
   if (cpi->sf.use_nonrd_pick_mode) {
-    if (q == 0)
-      q++;
     if (cpi->sf.force_frame_boost == 1)
       q -= cpi->sf.max_delta_qindex;
 
@@ -1305,11 +1278,26 @@
           cpi->oxcf.key_freq == 0))) {
     cm->frame_type = KEY_FRAME;
     rc->source_alt_ref_active = 0;
+
+    if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
+      cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1;
+    }
+
     if (cpi->pass == 0 && cpi->oxcf.rc_mode == RC_MODE_CBR) {
       target = calc_iframe_target_size_one_pass_cbr(cpi);
     }
   } else {
     cm->frame_type = INTER_FRAME;
+
+    if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
+      LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+      if (cpi->svc.spatial_layer_id == 0) {
+        lc->is_key_frame = 0;
+      } else {
+        lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
+      }
+    }
+
     if (cpi->pass == 0 && cpi->oxcf.rc_mode == RC_MODE_CBR) {
       target = calc_pframe_target_size_one_pass_cbr(cpi);
     }
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 8203661..b1cc676 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -20,8 +20,6 @@
 extern "C" {
 #endif
 
-#define FRAME_OVERHEAD_BITS 200
-
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS    9
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f309aac..22e19fe 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -152,24 +152,23 @@
 }
 
 static void fill_mode_costs(VP9_COMP *cpi) {
-  MACROBLOCK *const x = &cpi->mb;
   const FRAME_CONTEXT *const fc = &cpi->common.fc;
   int i, j;
 
   for (i = 0; i < INTRA_MODES; i++)
     for (j = 0; j < INTRA_MODES; j++)
-      vp9_cost_tokens((int *)x->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
+      vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
                       vp9_intra_mode_tree);
 
   // TODO(rbultje) separate tables for superblock costing?
-  vp9_cost_tokens(x->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
-  vp9_cost_tokens(x->intra_uv_mode_cost[KEY_FRAME],
+  vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
+  vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME],
                   vp9_kf_uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
-  vp9_cost_tokens(x->intra_uv_mode_cost[INTER_FRAME],
+  vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME],
                   fc->uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
 
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    vp9_cost_tokens((int *)x->switchable_interp_costs[i],
+    vp9_cost_tokens(cpi->switchable_interp_costs[i],
                     fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
 }
 
@@ -313,7 +312,7 @@
                                &cm->fc.nmvc, cm->allow_high_precision_mv);
 
       for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-        vp9_cost_tokens((int *)x->inter_mode_cost[i],
+        vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
                         cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
     }
   }
@@ -807,7 +806,7 @@
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX}};
-  int n, m;
+  TX_SIZE n, m;
   int s0, s1;
   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
   int64_t best_rd = INT64_MAX;
@@ -890,7 +889,7 @@
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX}};
-  int n, m;
+  TX_SIZE n, m;
   int s0, s1;
   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
@@ -962,7 +961,7 @@
 
   vp9_subtract_plane(x, bs, 0);
 
-  if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
                              ref_best_rd, bs);
@@ -1000,7 +999,7 @@
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 
   assert(bs == mbmi->sb_type);
-  if (cpi->sf.tx_size_search_method != USE_FULL_RD) {
+  if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) {
     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
                              ref_best_rd, bs);
@@ -1187,7 +1186,7 @@
   int tot_rate_y = 0;
   int64_t total_rd = 0;
   ENTROPY_CONTEXT t_above[4], t_left[4];
-  const int *bmode_costs = mb->mbmode_cost;
+  const int *bmode_costs = cpi->mbmode_cost;
 
   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
@@ -1203,7 +1202,7 @@
         const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
         const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);
 
-        bmode_costs  = mb->y_mode_costs[A][L];
+        bmode_costs  = cpi->y_mode_costs[A][L];
       }
 
       this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
@@ -1250,7 +1249,7 @@
   int64_t this_distortion, this_rd;
   TX_SIZE best_tx = TX_4X4;
   int i;
-  int *bmode_costs = x->mbmode_cost;
+  int *bmode_costs = cpi->mbmode_cost;
 
   if (cpi->sf.tx_size_search_method == USE_FULL_RD)
     for (i = 0; i < TX_MODES; i++)
@@ -1269,7 +1268,7 @@
       const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
       const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
 
-      bmode_costs = x->y_mode_costs[A][L];
+      bmode_costs = cpi->y_mode_costs[A][L];
     }
     mic->mbmi.mode = mode;
 
@@ -1378,7 +1377,7 @@
     if (this_rate_tokenonly == INT_MAX)
       continue;
     this_rate = this_rate_tokenonly +
-                x->intra_uv_mode_cost[cpi->common.frame_type][mode];
+                cpi->intra_uv_mode_cost[cpi->common.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
@@ -1426,7 +1425,7 @@
   x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
                    skippable, &unused, bsize, INT64_MAX);
-  *rate = *rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED];
+  *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
@@ -1460,7 +1459,7 @@
   // Don't account for mode here if segment skip is enabled.
   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
     assert(is_inter_mode(mode));
-    return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
+    return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
   } else {
     return 0;
   }
@@ -1736,7 +1735,6 @@
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
-  vp9_variance_fn_ptr_t *v_fn_ptr = &cpi->fn_ptr[bsize];
   ENTROPY_CONTEXT t_above[2], t_left[2];
   int subpelmv = 1, have_ref = 0;
   const int has_second_rf = has_second_ref(mbmi);
@@ -1806,9 +1804,8 @@
         // motion search for newmv (single predictor case only)
         if (!has_second_rf && this_mode == NEWMV &&
             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
-          int_mv *const new_mv = &mode_mv[NEWMV][0];
+          MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
           int step_param = 0;
-          int further_steps;
           int thissme, bestsme = INT_MAX;
           int sadpb = x->sadperbit4;
           MV mvp_full;
@@ -1836,8 +1833,8 @@
             // Take wtd average of the step_params based on the last frame's
             // max mv magnitude and the best ref mvs of the current block for
             // the given reference.
-            step_param = (vp9_init_search_range(cpi, max_mv) +
-                          cpi->mv_step_param) >> 1;
+            step_param = (vp9_init_search_range(&cpi->sf, max_mv) +
+                              cpi->mv_step_param) / 2;
           } else {
             step_param = cpi->mv_step_param;
           }
@@ -1851,48 +1848,14 @@
             step_param = MAX(step_param, 8);
           }
 
-          further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
           // adjust src pointer for this block
           mi_buf_shift(x, i);
 
           vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
 
-          if (cpi->sf.search_method == HEX) {
-            bestsme = vp9_hex_search(x, &mvp_full,
-                                     step_param,
-                                     sadpb, 1, v_fn_ptr, 1,
-                                     &bsi->ref_mv[0]->as_mv,
-                                     &new_mv->as_mv);
-            if (bestsme < INT_MAX)
-              bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
-                                           &bsi->ref_mv[0]->as_mv,
-                                           v_fn_ptr, 1);
-          } else if (cpi->sf.search_method == SQUARE) {
-            bestsme = vp9_square_search(x, &mvp_full,
-                                        step_param,
-                                        sadpb, 1, v_fn_ptr, 1,
-                                        &bsi->ref_mv[0]->as_mv,
-                                        &new_mv->as_mv);
-            if (bestsme < INT_MAX)
-              bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
-                                           &bsi->ref_mv[0]->as_mv,
-                                           v_fn_ptr, 1);
-          } else if (cpi->sf.search_method == BIGDIA) {
-            bestsme = vp9_bigdia_search(x, &mvp_full,
-                                        step_param,
-                                        sadpb, 1, v_fn_ptr, 1,
-                                        &bsi->ref_mv[0]->as_mv,
-                                        &new_mv->as_mv);
-            if (bestsme < INT_MAX)
-              bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
-                                           &bsi->ref_mv[0]->as_mv,
-                                           v_fn_ptr, 1);
-          } else {
-            bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                             sadpb, further_steps, 0, v_fn_ptr,
-                                             &bsi->ref_mv[0]->as_mv,
-                                             &new_mv->as_mv);
-          }
+          bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
+                                      sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
+                                      INT_MAX, 1);
 
           // Should we do a full search (best quality only)
           if (is_best_mode(cpi->oxcf.mode)) {
@@ -1901,26 +1864,26 @@
             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
                      x->mv_row_min, x->mv_row_max);
             thissme = cpi->full_search_sad(x, &mvp_full,
-                                           sadpb, 16, v_fn_ptr,
+                                           sadpb, 16, &cpi->fn_ptr[bsize],
                                            &bsi->ref_mv[0]->as_mv,
                                            &best_mv->as_mv);
             if (thissme < bestsme) {
               bestsme = thissme;
-              new_mv->as_int = best_mv->as_int;
+              *new_mv = best_mv->as_mv;
             } else {
               // The full search result is actually worse so re-instate the
               // previous best vector
-              best_mv->as_int = new_mv->as_int;
+              best_mv->as_mv = *new_mv;
             }
           }
 
           if (bestsme < INT_MAX) {
             int distortion;
             cpi->find_fractional_mv_step(x,
-                                         &new_mv->as_mv,
+                                         new_mv,
                                          &bsi->ref_mv[0]->as_mv,
                                          cm->allow_high_precision_mv,
-                                         x->errorperbit, v_fn_ptr,
+                                         x->errorperbit, &cpi->fn_ptr[bsize],
                                          cpi->sf.subpel_force_stop,
                                          cpi->sf.subpel_iters_per_step,
                                          x->nmvjointcost, x->mvcost,
@@ -1928,11 +1891,11 @@
                                          &x->pred_sse[mbmi->ref_frame[0]]);
 
             // save motion search result for use in compound prediction
-            seg_mvs[i][mbmi->ref_frame[0]].as_int = new_mv->as_int;
+            seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
           }
 
           if (cpi->sf.adaptive_motion_search)
-            x->pred_mv[mbmi->ref_frame[0]].as_int = new_mv->as_int;
+            x->pred_mv[mbmi->ref_frame[0]].as_mv = *new_mv;
 
           // restore src pointers
           mi_buf_restore(x, orig_src, orig_pre);
@@ -2334,12 +2297,12 @@
   return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
 }
 
-int vp9_get_switchable_rate(const MACROBLOCK *x) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
+int vp9_get_switchable_rate(const VP9_COMP *cpi) {
+  const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   return SWITCHABLE_INTERP_RATE_FACTOR *
-             x->switchable_interp_costs[ctx][mbmi->interp_filter];
+             cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
 }
 
 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2351,7 +2314,7 @@
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   int bestsme = INT_MAX;
-  int further_steps, step_param;
+  int step_param;
   int sadpb = x->sadperbit16;
   MV mvp_full;
   int ref = mbmi->ref_frame[0];
@@ -2389,8 +2352,8 @@
     // Take wtd average of the step_params based on the last frame's
     // max mv magnitude and that based on the best ref mvs of the current
     // block for the given reference.
-    step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
-                  cpi->mv_step_param) >> 1;
+    step_param = (vp9_init_search_range(&cpi->sf, x->max_mv_context[ref]) +
+                    cpi->mv_step_param) / 2;
   } else {
     step_param = cpi->mv_step_param;
   }
@@ -2431,50 +2394,8 @@
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  // Further step/diamond searches as necessary
-  further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-
-  if (cpi->sf.search_method == FAST_DIAMOND) {
-    bestsme = vp9_fast_dia_search(x, &mvp_full, step_param, sadpb, 0,
-                                  &cpi->fn_ptr[bsize], 1,
-                                  &ref_mv, &tmp_mv->as_mv);
-    if (bestsme < INT_MAX)
-      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
-                                   &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == FAST_HEX) {
-    bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0,
-                                  &cpi->fn_ptr[bsize], 1,
-                                  &ref_mv, &tmp_mv->as_mv);
-    if (bestsme < INT_MAX)
-      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
-                                   &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == HEX) {
-    bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
-                             &cpi->fn_ptr[bsize], 1,
-                             &ref_mv, &tmp_mv->as_mv);
-    if (bestsme < INT_MAX)
-      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
-                                   &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == SQUARE) {
-    bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1,
-                                &cpi->fn_ptr[bsize], 1,
-                                &ref_mv, &tmp_mv->as_mv);
-    if (bestsme < INT_MAX)
-      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
-                                   &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == BIGDIA) {
-    bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1,
-                                &cpi->fn_ptr[bsize], 1,
-                                &ref_mv, &tmp_mv->as_mv);
-    if (bestsme < INT_MAX)
-      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
-                                   &cpi->fn_ptr[bsize], 1);
-  } else {
-    bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                     sadpb, further_steps, 1,
-                                     &cpi->fn_ptr[bsize],
-                                     &ref_mv, &tmp_mv->as_mv);
-  }
+  bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                              &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
 
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
@@ -2788,7 +2709,7 @@
         int j;
         int64_t rs_rd;
         mbmi->interp_filter = i;
-        rs = vp9_get_switchable_rate(x);
+        rs = vp9_get_switchable_rate(cpi);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
 
         if (i > 0 && intpel_mv) {
@@ -2858,7 +2779,7 @@
   // Set the appropriate filter
   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
       cm->interp_filter : *best_filter;
-  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(x) : 0;
+  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0;
 
   if (pred_exists) {
     if (best_needs_copy) {
@@ -2888,7 +2809,7 @@
   }
 
   if (cm->interp_filter == SWITCHABLE)
-    *rate2 += vp9_get_switchable_rate(x);
+    *rate2 += vp9_get_switchable_rate(cpi);
 
   if (!is_comp_pred) {
     if (!x->in_active_map) {
@@ -3250,9 +3171,8 @@
     // an unfiltered alternative. We allow near/nearest as well
     // because they may result in zero-zero MVs but be cheaper.
     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-      const int altref_zero_mask =
+      mode_skip_mask =
           ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
-      mode_skip_mask |= altref_zero_mask;
       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
         mode_skip_mask |= (1 << THR_NEARA);
       if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
@@ -3410,12 +3330,6 @@
     for (i = 0; i < TX_MODES; ++i)
       tx_cache[i] = INT64_MAX;
 
-#ifdef MODE_TEST_HIT_STATS
-    // TEST/DEBUG CODE
-    // Keep a rcord of the number of test hits at each size
-    cpi->mode_test_hits[bsize]++;
-#endif
-
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
       intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
@@ -3436,7 +3350,7 @@
       skippable = skippable && skip_uv[uv_tx];
       mbmi->uv_mode = mode_uv[uv_tx];
 
-      rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
+      rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
       if (this_mode != DC_PRED && this_mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
@@ -3954,12 +3868,6 @@
         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
     }
 
-#ifdef MODE_TEST_HIT_STATS
-    // TEST/DEBUG CODE
-    // Keep a rcord of the number of test hits at each size
-    cpi->mode_test_hits[bsize]++;
-#endif
-
     if (ref_frame == INTRA_FRAME) {
       int rate;
       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
@@ -4035,7 +3943,7 @@
 
             if (tmp_rd == INT64_MAX)
               continue;
-            rs = vp9_get_switchable_rate(x);
+            rs = vp9_get_switchable_rate(cpi);
             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
             rd_opt->filter_cache[switchable_filter_index] = tmp_rd;
             rd_opt->filter_cache[SWITCHABLE_FILTERS] =
@@ -4113,7 +4021,7 @@
       distortion2 += distortion;
 
       if (cm->interp_filter == SWITCHABLE)
-        rate2 += vp9_get_switchable_rate(x);
+        rate2 += vp9_get_switchable_rate(cpi);
 
       if (!mode_excluded)
         mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index cd622d6..b6b51e5 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -40,7 +40,7 @@
                                   unsigned int qstep, int *rate,
                                   int64_t *dist);
 
-int vp9_get_switchable_rate(const MACROBLOCK *x);
+int vp9_get_switchable_rate(const VP9_COMP *cpi);
 
 void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                             const TileInfo *const tile,
@@ -87,6 +87,49 @@
 
 void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi);
 
+static INLINE int full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, MV *mvp_full,
+                                    int step_param, int error_per_bit,
+                                    const MV *ref_mv, MV *tmp_mv,
+                                    int var_max, int rd) {
+  int var = 0;
+
+  if (cpi->sf.search_method == FAST_DIAMOND) {
+    var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+                              &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
+    if (rd && var < var_max)
+      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
+  } else if (cpi->sf.search_method == FAST_HEX) {
+    var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+                              &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
+    if (rd && var < var_max)
+      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
+  } else if (cpi->sf.search_method == HEX) {
+    var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
+                         &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
+    if (rd && var < var_max)
+      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
+  } else if (cpi->sf.search_method == SQUARE) {
+    var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
+                            &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
+    if (rd && var < var_max)
+      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
+  } else if (cpi->sf.search_method == BIGDIA) {
+    var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
+                            &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
+    if (rd && var < var_max)
+      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
+  } else {
+    int further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+    var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+                                 further_steps, 1, &cpi->fn_ptr[bsize],
+                                 ref_mv, tmp_mv);
+  }
+
+  return var;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index cff99a6..46806c9 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -129,7 +129,7 @@
   ONE_LOOP_REDUCED = 2
 } FAST_COEFF_UPDATE;
 
-typedef struct {
+typedef struct SPEED_FEATURES {
   // Frame level coding parameter update
   int frame_parameter_update;
 
@@ -176,7 +176,7 @@
   // a log search that iterates 4 times (check around mv for last for best
   // error of combined predictor then check around mv for alt). If 0 we
   // we just use the best motion vector found for each frame by itself.
-  int comp_inter_joint_search_thresh;
+  BLOCK_SIZE comp_inter_joint_search_thresh;
 
   // This variable is used to cap the maximum number of times we skip testing a
   // mode to be evaluated. A high value means we will be faster.
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 2379f35..2e98fa7 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -220,3 +220,10 @@
       : &svc->layer_context[svc->spatial_layer_id];
   ++lc->current_video_frame_in_layer;
 }
+
+int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
+  return cpi->use_svc &&
+         cpi->svc.number_temporal_layers == 1 &&
+         cpi->svc.spatial_layer_id > 0 &&
+         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame;
+}
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 2abed30..74d9c1c 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -30,6 +30,7 @@
   struct twopass_rc twopass;
   struct vpx_fixed_buf rc_twopass_stats_in;
   unsigned int current_video_frame_in_layer;
+  int is_key_frame;
 } LAYER_CONTEXT;
 
 typedef struct {
@@ -73,6 +74,9 @@
 // Increment number of video frames in layer
 void vp9_inc_frame_in_layer(SVC *svc);
 
+// Check if current layer is key frame in spatial upper layer
+int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index ca93391..6eff200 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -27,6 +27,8 @@
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_scale/vpx_scale.h"
 
+static int fixed_divide[512];
+
 static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             uint8_t *y_mb_ptr,
                                             uint8_t *u_mb_ptr,
@@ -78,6 +80,14 @@
                             kernel, mv_precision_uv, x, y);
 }
 
+void vp9_temporal_filter_init() {
+  int i;
+
+  fixed_divide[0] = 0;
+  for (i = 1; i < 512; ++i)
+    fixed_divide[i] = 0x80000 / i;
+}
+
 void vp9_temporal_filter_apply_c(uint8_t *frame1,
                                  unsigned int stride,
                                  uint8_t *frame2,
@@ -89,6 +99,7 @@
   unsigned int i, j, k;
   int modifier;
   int byte = 0;
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
 
   for (i = 0, k = 0; i < block_size; i++) {
     for (j = 0; j < block_size; j++, k++) {
@@ -101,7 +112,7 @@
       // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
       modifier  *= modifier;
       modifier  *= 3;
-      modifier  += 1 << (strength - 1);
+      modifier  += rounding;
       modifier >>= strength;
 
       if (modifier > 16)
@@ -294,7 +305,7 @@
       for (i = 0, k = 0; i < 16; i++) {
         for (j = 0; j < 16; j++, k++) {
           unsigned int pval = accumulator[k] + (count[k] >> 1);
-          pval *= cpi->fixed_divide[count[k]];
+          pval *= fixed_divide[count[k]];
           pval >>= 19;
 
           dst1[byte] = (uint8_t)pval;
@@ -315,13 +326,13 @@
 
           // U
           unsigned int pval = accumulator[k] + (count[k] >> 1);
-          pval *= cpi->fixed_divide[count[k]];
+          pval *= fixed_divide[count[k]];
           pval >>= 19;
           dst1[byte] = (uint8_t)pval;
 
           // V
           pval = accumulator[m] + (count[m] >> 1);
-          pval *= cpi->fixed_divide[count[m]];
+          pval *= fixed_divide[count[m]];
           pval >>= 19;
           dst2[byte] = (uint8_t)pval;
 
@@ -345,33 +356,74 @@
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
   VP9_COMMON *const cm = &cpi->common;
   int frame = 0;
+  int frames_to_blur_backward = 0;
+  int frames_to_blur_forward = 0;
   int frames_to_blur = 0;
   int start_frame = 0;
   int strength = cpi->active_arnr_strength;
+  int blur_type = cpi->oxcf.arnr_type;
   int max_frames = cpi->active_arnr_frames;
-  int frames_to_blur_backward = distance;
-  int frames_to_blur_forward = vp9_lookahead_depth(cpi->lookahead)
-                                   - (distance + 1);
+  const int num_frames_backward = distance;
+  const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
+                               - (num_frames_backward + 1);
   struct scale_factors sf;
 
-  // Determine which input frames to filter.
-  if (frames_to_blur_forward > frames_to_blur_backward)
-    frames_to_blur_forward = frames_to_blur_backward;
+  switch (blur_type) {
+    case 1:
+      // Backward Blur
+      frames_to_blur_backward = num_frames_backward;
 
-  if (frames_to_blur_backward > frames_to_blur_forward)
-    frames_to_blur_backward = frames_to_blur_forward;
+      if (frames_to_blur_backward >= max_frames)
+        frames_to_blur_backward = max_frames - 1;
 
-  // When max_frames is even we have 1 more frame backward than forward
-  if (frames_to_blur_forward > (max_frames - 1) / 2)
-    frames_to_blur_forward = (max_frames - 1) / 2;
+      frames_to_blur = frames_to_blur_backward + 1;
+      break;
 
-  if (frames_to_blur_backward > (max_frames / 2))
-    frames_to_blur_backward = max_frames / 2;
+    case 2:
+      // Forward Blur
+      frames_to_blur_forward = num_frames_forward;
 
-  frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+      if (frames_to_blur_forward >= max_frames)
+        frames_to_blur_forward = max_frames - 1;
+
+      frames_to_blur = frames_to_blur_forward + 1;
+      break;
+
+    case 3:
+    default:
+      // Center Blur
+      frames_to_blur_forward = num_frames_forward;
+      frames_to_blur_backward = num_frames_backward;
+
+      if (frames_to_blur_forward > frames_to_blur_backward)
+        frames_to_blur_forward = frames_to_blur_backward;
+
+      if (frames_to_blur_backward > frames_to_blur_forward)
+        frames_to_blur_backward = frames_to_blur_forward;
+
+      // When max_frames is even we have 1 more frame backward than forward
+      if (frames_to_blur_forward > (max_frames - 1) / 2)
+        frames_to_blur_forward = ((max_frames - 1) / 2);
+
+      if (frames_to_blur_backward > (max_frames / 2))
+        frames_to_blur_backward = (max_frames / 2);
+
+      frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+      break;
+  }
 
   start_frame = distance + frames_to_blur_forward;
 
+#ifdef DEBUGFWG
+  // DEBUG FWG
+  printf(
+      "max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d "
+      "start:%d",
+      max_frames, num_frames_backward, num_frames_forward, frames_to_blur,
+      frames_to_blur_backward, frames_to_blur_forward, cpi->source_encode_index,
+      cpi->last_alt_ref_sei, start_frame);
+#endif
+
   // Setup scaling factors. Scaling on each of the arnr frames is not supported
   vp9_setup_scale_factors_for_frame(&sf,
       get_frame_new_buffer(cm)->y_crop_width,
@@ -380,7 +432,7 @@
 
   // Setup frame pointers, NULL indicates frame not included in filter
   vp9_zero(cpi->frames);
-  for (frame = 0; frame < frames_to_blur; ++frame) {
+  for (frame = 0; frame < frames_to_blur; frame++) {
     int which_buffer = start_frame - frame;
     struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
                                                      which_buffer);
@@ -394,11 +446,11 @@
 void vp9_configure_arnr_filter(VP9_COMP *cpi,
                                const unsigned int frames_to_arnr,
                                const int group_boost) {
-  int q;
   int half_gf_int;
   int frames_after_arf;
-  int frames_bwd;
-  int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
+  int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+  int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+  int q;
 
   // Define the arnr filter width for this group of frames. We only
   // filter frames that lie within a distance of half the GF interval
@@ -410,26 +462,47 @@
   frames_after_arf = vp9_lookahead_depth(cpi->lookahead)
       - frames_to_arnr - 1;
 
-  if (frames_fwd > frames_after_arf)
-    frames_fwd = frames_after_arf;
-  if (frames_fwd > half_gf_int)
-    frames_fwd = half_gf_int;
+  switch (cpi->oxcf.arnr_type) {
+    case 1:  // Backward filter
+      frames_fwd = 0;
+      if (frames_bwd > half_gf_int)
+        frames_bwd = half_gf_int;
+      break;
 
-  frames_bwd = frames_fwd;
+    case 2:  // Forward filter
+      if (frames_fwd > half_gf_int)
+        frames_fwd = half_gf_int;
+      if (frames_fwd > frames_after_arf)
+        frames_fwd = frames_after_arf;
+      frames_bwd = 0;
+      break;
 
-  // For even length filter there is one more frame backward
-  // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
-  if (frames_bwd < half_gf_int)
-    frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
+    case 3:  // Centered filter
+    default:
+      frames_fwd >>= 1;
+      if (frames_fwd > frames_after_arf)
+        frames_fwd = frames_after_arf;
+      if (frames_fwd > half_gf_int)
+        frames_fwd = half_gf_int;
+
+      frames_bwd = frames_fwd;
+
+      // For even length filter there is one more frame backward
+      // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+      if (frames_bwd < half_gf_int)
+        frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
+      break;
+  }
 
   cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
 
   // Adjust the strength based on active max q
   if (cpi->common.current_video_frame > 1)
-    q = ((int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME]));
+    q = ((int)vp9_convert_qindex_to_q(
+        cpi->rc.avg_frame_qindex[INTER_FRAME]));
   else
-    q = ((int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME]));
-
+    q = ((int)vp9_convert_qindex_to_q(
+        cpi->rc.avg_frame_qindex[KEY_FRAME]));
   if (q > 16) {
     cpi->active_arnr_strength = cpi->oxcf.arnr_strength;
   } else {
diff --git a/vp9/encoder/vp9_temporal_filter.h b/vp9/encoder/vp9_temporal_filter.h
index 3028d78..9453dc1 100644
--- a/vp9/encoder/vp9_temporal_filter.h
+++ b/vp9/encoder/vp9_temporal_filter.h
@@ -15,6 +15,7 @@
 extern "C" {
 #endif
 
+void vp9_temporal_filter_init();
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);
 void vp9_configure_arnr_filter(VP9_COMP *cpi,
                                const unsigned int frames_to_arnr,
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index 520ee44..91d8ea4 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -113,12 +113,9 @@
 unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                        const uint8_t *b, int b_stride, \
                                        unsigned int *sse) { \
-  unsigned int var; \
-  int avg; \
-\
-  variance(a, a_stride, b, b_stride, W, H, &var, &avg); \
-  *sse = var; \
-  return var - (((int64_t)avg * avg) / (W * H)); \
+  int sum; \
+  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
 }
 
 #define SUBPIX_VAR(W, H) \
@@ -159,69 +156,36 @@
   return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
 }
 
-
-void vp9_get_sse_sum_16x16_c(const uint8_t *src_ptr, int source_stride,
-                             const uint8_t *ref_ptr, int ref_stride,
-                             unsigned int *sse, int *sum) {
-  variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
-}
-
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
-                       const uint8_t *ref_ptr, int ref_stride,
-                       unsigned int *sse, int *sum) {
-  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
-}
-
-unsigned int vp9_mse16x16_c(const uint8_t *src_ptr,
-                            int  source_stride,
-                            const uint8_t *ref_ptr,
-                            int  recon_stride,
+unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,
+                            const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-  *sse = var;
-  return var;
+  int sum;
+  variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum);
+  return *sse;
 }
 
-unsigned int vp9_mse16x8_c(const uint8_t *src_ptr,
-                           int  source_stride,
-                           const uint8_t *ref_ptr,
-                           int  recon_stride,
+unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride,
                            unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
-  *sse = var;
-  return var;
+  int sum;
+  variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum);
+  return *sse;
 }
 
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr,
-                           int  source_stride,
-                           const uint8_t *ref_ptr,
-                           int  recon_stride,
+unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride,
                            unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
-  *sse = var;
-  return var;
+  int sum;
+  variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum);
+  return *sse;
 }
 
-unsigned int vp9_mse8x8_c(const uint8_t *src_ptr,
-                          int  source_stride,
-                          const uint8_t *ref_ptr,
-                          int  recon_stride,
+unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
                           unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
-  *sse = var;
-  return var;
+  int sum;
+  variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum);
+  return *sse;
 }
 
 VAR(4, 4)
@@ -276,87 +240,6 @@
 SUBPIX_VAR(64, 64)
 SUBPIX_AVG_VAR(64, 64)
 
-unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr,
-                                               int  source_stride,
-                                               const uint8_t *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr,
-                                               int  source_stride,
-                                               const uint8_t *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr,
-                                               int  source_stride,
-                                               const uint8_t *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
 void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
                        int height, const uint8_t *ref, int ref_stride) {
   int i, j;
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 4c8be71..c47fe13 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -69,22 +69,12 @@
                                                    unsigned int *sse,
                                                    const uint8_t *second_pred);
 
-typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
-                                                   int source_stride,
-                                                   const uint8_t *ref_ptr,
-                                                   int  ref_stride);
-
 typedef struct vp9_variance_vtable {
   vp9_sad_fn_t               sdf;
   vp9_sad_avg_fn_t           sdaf;
   vp9_variance_fn_t          vf;
   vp9_subpixvariance_fn_t    svf;
   vp9_subp_avg_variance_fn_t svaf;
-  vp9_variance_fn_t          svf_halfpix_h;
-  vp9_variance_fn_t          svf_halfpix_v;
-  vp9_variance_fn_t          svf_halfpix_hv;
   vp9_sad_multi_fn_t         sdx3f;
   vp9_sad_multi_fn_t         sdx8f;
   vp9_sad_multi_d_fn_t       sdx4df;
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 2d59775..42fdbbd 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -12,6 +12,9 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
 
+#define pair_set_epi32(a, b) \
+  _mm_set_epi32(b, a, b, a)
+
 #if FDCT32x32_HIGH_PRECISION
 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
   __m128i buf0, buf1;
diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm
new file mode 100644
index 0000000..f71181c
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_mmx.asm
@@ -0,0 +1,70 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1
+  movq            m4,        m0
+  psubw           m3,        m2
+  psubw           m4,        m3
+  psraw           m4,        1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m4
+  paddw           m3,        m5
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+  movq            m4,        m0
+  movq            m5,        m2
+  punpcklwd       m4,        m1
+  punpckhwd       m0,        m1
+  punpcklwd       m5,        m3
+  punpckhwd       m2,        m3
+  movq            m1,        m4
+  movq            m3,        m0
+  punpckldq       m1,        m5
+  punpckhdq       m4,        m5
+  punpckldq       m3,        m2
+  punpckhdq       m0,        m2
+  SWAP            2, 3, 0, 1, 4
+%endmacro
+
+INIT_MMX mmx
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q,       [inputq + strideq*4]
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0,        2
+  psllw           m1,        2
+  psllw           m2,        2
+  psllw           m3,        2
+
+  movq            [outputq],      m0
+  movq            [outputq + 8],  m1
+  movq            [outputq + 16], m2
+  movq            [outputq + 24], m3
+
+  RET
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.asm b/vp9/encoder/x86/vp9_dct_ssse3.asm
new file mode 100644
index 0000000..8723a71
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_ssse3.asm
@@ -0,0 +1,174 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides SSSE3 version of the forward transformation. Part
+; of the macro definitions are originally derived from the ffmpeg project.
+; The current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192:    times 4 dd 8192
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
+pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
+%endmacro
+
+TRANSFORM_COEFFS 15137,   6270
+TRANSFORM_COEFFS 16069,   3196
+TRANSFORM_COEFFS  9102,  13623
+
+SECTION .text
+
+%if ARCH_X86_64
+%macro SUM_SUB 3
+  psubw  m%3, m%1, m%2
+  paddw  m%1, m%2
+  SWAP    %2, %3
+%endmacro
+
+; butterfly operation
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+  pmaddwd            m%1, m%3, %5
+  pmaddwd            m%2, m%3, %6
+  paddd              m%1,  %4
+  paddd              m%2,  %4
+  psrad              m%1,  14
+  psrad              m%2,  14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
+  punpcklwd          m%2, m%1
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
+  packssdw           m%1, m%7
+  packssdw           m%2, m%6
+%endmacro
+
+; matrix transpose
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+; 1D forward 8x8 DCT transform
+%macro FDCT8_1D 0
+  SUM_SUB            0,  7,  9
+  SUM_SUB            1,  6,  9
+  SUM_SUB            2,  5,  9
+  SUM_SUB            3,  4,  9
+
+  SUM_SUB            0,  3,  9
+  SUM_SUB            1,  2,  9
+  SUM_SUB            6,  5,  9
+  SUM_SUB            0,  1,  9
+
+  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
+
+  pmulhrsw           m6, m12
+  pmulhrsw           m5, m12
+  pmulhrsw           m0, m12
+  pmulhrsw           m1, m12
+
+  SUM_SUB            4,  5,  9
+  SUM_SUB            7,  6,  9
+  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
+  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
+  SWAP               1,  4
+  SWAP               3,  6
+%endmacro
+
+%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
+  psraw              m%3, m%1, 15
+  psraw              m%4, m%2, 15
+  psubw              m%1, m%3
+  psubw              m%2, m%4
+  psraw              m%1, 1
+  psraw              m%2, 1
+%endmacro
+
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+  mova               m8, [pd_8192]
+  mova              m12, [pw_11585x2]
+  pxor              m11, m11
+
+  lea                r3, [2 * strideq]
+  lea                r4, [4 * strideq]
+  mova               m0, [inputq]
+  mova               m1, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m2, [inputq]
+  mova               m3, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m4, [inputq]
+  mova               m5, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m6, [inputq]
+  mova               m7, [inputq + r3]
+
+  ; left shift by 2 to increase forward transformation precision
+  psllw              m0, 2
+  psllw              m1, 2
+  psllw              m2, 2
+  psllw              m3, 2
+  psllw              m4, 2
+  psllw              m5, 2
+  psllw              m6, 2
+  psllw              m7, 2
+
+  ; column transform
+  FDCT8_1D
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  FDCT8_1D
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  DIVIDE_ROUND_2X   0, 1, 9, 10
+  DIVIDE_ROUND_2X   2, 3, 9, 10
+  DIVIDE_ROUND_2X   4, 5, 9, 10
+  DIVIDE_ROUND_2X   6, 7, 9, 10
+
+  mova              [outputq +   0], m0
+  mova              [outputq +  16], m1
+  mova              [outputq +  32], m2
+  mova              [outputq +  48], m3
+  mova              [outputq +  64], m4
+  mova              [outputq +  80], m5
+  mova              [outputq +  96], m6
+  mova              [outputq + 112], m7
+
+  RET
+%endif
diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
deleted file mode 100644
index 2ecc23e..0000000
--- a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
+++ /dev/null
@@ -1,337 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_half_horiz_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(vp9_half_horiz_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-        lea             rsi,            [rsi + rax]
-
-.half_horiz_vert_variance16x_h_1:
-        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
-
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-
-        movq            xmm3,           QWORD PTR [rdi+8]
-        punpcklbw       xmm3,           xmm0
-        psubw           xmm4,           xmm3
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_vert_variance16x_h_1    ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_half_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
-sym(vp9_half_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0)              ;ref_ptr
-
-        mov             rdi,            arg(2)              ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)    ;Height
-        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        lea             rsi,            [rsi + rax          ]
-        pxor            xmm0,           xmm0
-
-.half_vert_variance16x_h_1:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm2,           QWORD PTR [rdi]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm5,           xmm2
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm4,           xmm2
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm3
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1
-        jnz             .half_vert_variance16x_h_1
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_half_horiz_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
-sym(vp9_half_horiz_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-.half_horiz_variance16x_h_1:
-        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm1,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm1,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        psubw           xmm1,           xmm2
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm1
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm1,           xmm1
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm1
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_variance16x_h_1         ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
index 2c50881..4830412 100644
--- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
@@ -398,337 +398,4 @@
     pop         rbp
     ret
 
-;void vp9_half_horiz_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
-sym(vp9_half_horiz_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-%else
-        add             rsi, r8
-%endif
-
-.half_horiz_vert_variance8x_h_1:
-
-        movq            xmm1,           QWORD PTR [rsi]     ;
-        movq            xmm2,           QWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_vert_variance8x_h_1     ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_half_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
-sym(vp9_half_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-.half_vert_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s1,s2,s3..s9
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             .half_vert_variance8x_h_1          ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_half_horiz_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
-sym(vp9_half_horiz_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            xmm0,           xmm0                ;
-.half_horiz_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_variance8x_h_1          ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
 
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index 9e65694..41f2259 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -42,66 +42,6 @@
   unsigned int *SSE,
   int *Sum
 );
-void vp9_half_horiz_vert_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_vert_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
 
 typedef unsigned int (*get_var_sse2) (
   const unsigned char *src_ptr,
@@ -494,58 +434,3 @@
 
 #undef FNS
 #undef FN
-
-unsigned int vp9_variance_halfpixvar16x16_h_sse2(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_sse2(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-  vp9_half_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index b1ba0b1..3b4d6b9 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -120,28 +120,32 @@
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct8x8_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_save_reg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_reconintra_neon$(ASM)
+ifeq ($(ARCH_X86_64), yes)
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3.asm
+endif
+
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_1_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_copy_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_avg_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM)
 
 $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index f7c0a64..c4058bb 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -30,6 +30,7 @@
   unsigned int                tile_rows;
   unsigned int                arnr_max_frames;
   unsigned int                arnr_strength;
+  unsigned int                arnr_type;
   vp8e_tuning                 tuning;
   unsigned int                cq_level;  // constrained quality level
   unsigned int                rc_max_intra_bitrate_pct;
@@ -41,7 +42,7 @@
 };
 
 struct extraconfig_map {
-  int usage;
+  unsigned int usage;
   struct vp9_extracfg cfg;
 };
 
@@ -59,6 +60,7 @@
       0,                          // tile_rows
       7,                          // arnr_max_frames
       5,                          // arnr_strength
+      3,                          // arnr_type
       VP8_TUNE_PSNR,              // tuning
       10,                         // cq_level
       0,                          // rc_max_intra_bitrate_pct
@@ -201,6 +203,7 @@
   RANGE_CHECK_HI(extra_cfg, sharpness, 7);
   RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
   RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
+  RANGE_CHECK(extra_cfg, arnr_type, 1, 3);
   RANGE_CHECK(extra_cfg, cq_level, 0, 63);
 
   // TODO(yaowu): remove this when ssim tuning is implemented for vp9
@@ -242,7 +245,8 @@
         layer_id = (int)stats->spatial_layer_id;
 
         if (layer_id >= cfg->ss_number_layers
-            ||(int)(stats->count + 0.5) != n_packets_per_layer[layer_id] - 1)
+            ||(unsigned int)(stats->count + 0.5) !=
+               n_packets_per_layer[layer_id] - 1)
           ERROR("rc_twopass_stats_in missing EOS stats packet");
       }
     } else {
@@ -364,6 +368,7 @@
 
   oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
   oxcf->arnr_strength   = extra_cfg->arnr_strength;
+  oxcf->arnr_type       = extra_cfg->arnr_type;
 
   oxcf->tuning = extra_cfg->tuning;
 
@@ -494,6 +499,7 @@
     MAP(VP9E_SET_TILE_ROWS,               extra_cfg.tile_rows);
     MAP(VP8E_SET_ARNR_MAXFRAMES,          extra_cfg.arnr_max_frames);
     MAP(VP8E_SET_ARNR_STRENGTH,           extra_cfg.arnr_strength);
+    MAP(VP8E_SET_ARNR_TYPE,               extra_cfg.arnr_type);
     MAP(VP8E_SET_TUNING,                  extra_cfg.tuning);
     MAP(VP8E_SET_CQ_LEVEL,                extra_cfg.cq_level);
     MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT,   extra_cfg.rc_max_intra_bitrate_pct);
@@ -1107,6 +1113,7 @@
   {VP9E_SET_TILE_ROWS,                ctrl_set_param},
   {VP8E_SET_ARNR_MAXFRAMES,           ctrl_set_param},
   {VP8E_SET_ARNR_STRENGTH,            ctrl_set_param},
+  {VP8E_SET_ARNR_TYPE,                ctrl_set_param},
   {VP8E_SET_TUNING,                   ctrl_set_param},
   {VP8E_SET_CQ_LEVEL,                 ctrl_set_param},
   {VP8E_SET_MAX_INTRA_BITRATE_PCT,    ctrl_set_param},
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 06b4823..6198250 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -245,17 +245,13 @@
 }
 
 static void init_decoder(vpx_codec_alg_priv_t *ctx) {
-  VP9DecoderConfig oxcf;
-  oxcf.width = ctx->si.w;
-  oxcf.height = ctx->si.h;
-  oxcf.version = 9;
-  oxcf.max_threads = ctx->cfg.threads;
-  oxcf.inv_tile_order = ctx->invert_tile_order;
-
-  ctx->pbi = vp9_decoder_create(&oxcf);
+  ctx->pbi = vp9_decoder_create();
   if (ctx->pbi == NULL)
     return;
 
+  ctx->pbi->max_threads = ctx->cfg.threads;
+  ctx->pbi->inv_tile_order = ctx->invert_tile_order;
+
   vp9_initialize_dec();
 
   // If postprocessing was enabled by the application and a
@@ -375,80 +371,70 @@
   }
 }
 
+static vpx_codec_err_t decode_one_iter(vpx_codec_alg_priv_t *ctx,
+                                       const uint8_t **data_start_ptr,
+                                       const uint8_t *data_end,
+                                       uint32_t frame_size, void *user_priv,
+                                       long deadline) {
+  const vpx_codec_err_t res = decode_one(ctx, data_start_ptr, frame_size,
+                                         user_priv, deadline);
+  if (res != VPX_CODEC_OK)
+    return res;
+
+  // Account for suboptimal termination by the encoder.
+  while (*data_start_ptr < data_end) {
+    const uint8_t marker = read_marker(ctx->decrypt_cb, ctx->decrypt_state,
+                                       *data_start_ptr);
+    if (marker)
+      break;
+    (*data_start_ptr)++;
+  }
+
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
                                       const uint8_t *data, unsigned int data_sz,
                                       void *user_priv, long deadline) {
   const uint8_t *data_start = data;
-  const uint8_t *data_end = data + data_sz;
-  vpx_codec_err_t res = VPX_CODEC_OK;
-  uint32_t sizes[8];
-  int frames_this_pts, frame_count = 0;
+  const uint8_t *const data_end = data + data_sz;
+  vpx_codec_err_t res;
+  uint32_t frame_sizes[8];
+  int frame_count;
 
   if (data == NULL || data_sz == 0)
     return VPX_CODEC_INVALID_PARAM;
 
-  parse_superframe_index(data, data_sz, sizes, &frames_this_pts,
+  parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
                          ctx->decrypt_cb, ctx->decrypt_state);
 
-  do {
-    if (data_sz) {
-      uint8_t marker = read_marker(ctx->decrypt_cb, ctx->decrypt_state,
-                                   data_start);
-      // Skip over the superframe index, if present
-      if ((marker & 0xe0) == 0xc0) {
-        const uint32_t frames = (marker & 0x7) + 1;
-        const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-        const uint32_t index_sz = 2 + mag * frames;
+  if (frame_count > 0) {
+    int i;
 
-        if (data_sz >= index_sz) {
-          uint8_t marker2 = read_marker(ctx->decrypt_cb, ctx->decrypt_state,
-                                        data_start + index_sz - 1);
-          if (marker2 == marker) {
-            data_start += index_sz;
-            data_sz -= index_sz;
-            if (data_start < data_end)
-              continue;
-            else
-              break;
-          }
-        }
-      }
-    }
-
-    // Use the correct size for this frame, if an index is present.
-    if (frames_this_pts) {
-      uint32_t this_sz = sizes[frame_count];
-
-      if (data_sz < this_sz) {
+    for (i = 0; i < frame_count; ++i) {
+      const uint32_t frame_size = frame_sizes[i];
+      if (data_start < data ||
+          frame_size > (uint32_t)(data_end - data_start)) {
         ctx->base.err_detail = "Invalid frame size in index";
         return VPX_CODEC_CORRUPT_FRAME;
       }
 
-      data_sz = this_sz;
-      frame_count++;
+      res = decode_one_iter(ctx, &data_start, data_end, frame_size,
+                            user_priv, deadline);
+      if (res != VPX_CODEC_OK)
+        return res;
     }
-
-    res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
-    assert(data_start >= data);
-    assert(data_start <= data_end);
-
-    // Early exit if there was a decode error
-    if (res)
-      break;
-
-    // Account for suboptimal termination by the encoder.
+  } else {
     while (data_start < data_end) {
-      uint8_t marker3 = read_marker(ctx->decrypt_cb, ctx->decrypt_state,
-                                    data_start);
-      if (marker3)
-        break;
-      data_start++;
+      res = decode_one_iter(ctx, &data_start, data_end,
+                            (uint32_t)(data_end - data_start),
+                            user_priv, deadline);
+      if (res != VPX_CODEC_OK)
+        return res;
     }
+  }
 
-    data_sz = (unsigned int)(data_end - data_start);
-  } while (data_start < data_end);
-
-  return res;
+  return VPX_CODEC_OK;
 }
 
 static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index 58256b2..d60883c 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -16,9 +16,11 @@
     * the Y, U, and V planes, nor other alignment adjustments that
     * might be representable by a YV12_BUFFER_CONFIG, so we just
     * initialize all the fields.*/
-  int bps = 12;
-  if (yv12->uv_height == yv12->y_height) {
-    if (yv12->uv_width == yv12->y_width) {
+  const int ss_x = yv12->uv_crop_width < yv12->y_crop_width;
+  const int ss_y = yv12->uv_crop_height < yv12->y_crop_height;
+  int bps;
+  if (!ss_y) {
+    if (!ss_x) {
       img->fmt = VPX_IMG_FMT_I444;
       bps = 24;
     } else {
@@ -27,13 +29,14 @@
     }
   } else {
     img->fmt = VPX_IMG_FMT_I420;
+    bps = 12;
   }
   img->w = yv12->y_stride;
   img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
   img->d_w = yv12->y_crop_width;
   img->d_h = yv12->y_crop_height;
-  img->x_chroma_shift = yv12->uv_width < yv12->y_width;
-  img->y_chroma_shift = yv12->uv_height < yv12->y_height;
+  img->x_chroma_shift = ss_x;
+  img->y_chroma_shift = ss_y;
   img->planes[VPX_PLANE_Y] = yv12->y_buffer;
   img->planes[VPX_PLANE_U] = yv12->u_buffer;
   img->planes[VPX_PLANE_V] = yv12->v_buffer;
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index c444fe4..bc9a478 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -96,12 +96,12 @@
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 
 ifeq ($(CONFIG_USE_X86INC),yes)
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
@@ -112,6 +112,7 @@
 
 ifeq ($(ARCH_X86_64),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.asm
 endif
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 76aacd2..b874be7 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -234,7 +234,8 @@
   si->message_buffer[0] = '\0';
 }
 
-static int svc_log(SvcContext *svc_ctx, int level, const char *fmt, ...) {
+static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level,
+                   const char *fmt, ...) {
   char buf[512];
   int retval = 0;
   va_list ap;
@@ -1000,8 +1001,10 @@
               (int)si->frame_size, (int)pts);
     }
   }
-  ++si->frame_within_gop;
-  ++si->encode_frame_count;
+  if (rawimg != NULL) {
+    ++si->frame_within_gop;
+    ++si->encode_frame_count;
+  }
 
   return VPX_CODEC_OK;
 }
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 8944a26..67cbdb1 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -160,8 +160,12 @@
                                           scale as used by the rc_*_quantizer config
                                           parameters */
   VP8E_SET_ARNR_MAXFRAMES,         /**< control function to set the max number of frames blurred creating arf*/
-  VP8E_SET_ARNR_STRENGTH,         /**< control function to set the filter strength for the arf */
-  VP8E_SET_ARNR_TYPE,         /**< control function to set the type of filter to use for the arf*/
+  VP8E_SET_ARNR_STRENGTH,          //!< control function to set the filter
+                                   //!< strength for the arf
+
+  /*!\deprecated control function to set the filter type to use for the arf */
+  VP8E_SET_ARNR_TYPE,
+
   VP8E_SET_TUNING,                 /**< control function to set visual tuning */
   /*!\brief control function to set constrained quality level
    *
@@ -347,7 +351,7 @@
 
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES,     unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH,     unsigned int)
-VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_TYPE,     unsigned int)
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_SET_ARNR_TYPE,     unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_TUNING,             int) /* vp8e_tuning */
 VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL,      unsigned int)
 
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index 8d0f4ec..d45b003 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -34,7 +34,7 @@
 #define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format */
 #define VPX_IMG_FMT_UV_FLIP    0x200  /**< V plane precedes U plane in memory */
 #define VPX_IMG_FMT_HAS_ALPHA  0x400  /**< Image has an alpha channel component */
-
+#define VPX_IMG_FMT_HIGH       0x800  /**< Image uses 16bit framebuffer */
 
   /*!\brief List of supported image formats */
   typedef enum vpx_img_fmt {
@@ -58,7 +58,10 @@
     VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4,
     VPX_IMG_FMT_I422    = VPX_IMG_FMT_PLANAR | 5,
     VPX_IMG_FMT_I444    = VPX_IMG_FMT_PLANAR | 6,
-    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7
+    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7,
+    VPX_IMG_FMT_I42016    = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGH,
+    VPX_IMG_FMT_I42216    = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGH,
+    VPX_IMG_FMT_I44416    = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGH
   } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
 #if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c
index 542ff67..fa0e030 100644
--- a/vpx_ports/arm_cpudetect.c
+++ b/vpx_ports/arm_cpudetect.c
@@ -12,6 +12,13 @@
 #include <string.h>
 #include "arm.h"
 
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
 static int arm_cpu_env_flags(int *flags) {
   char *env;
   env = getenv("VPX_SIMD_CAPS");
diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
index ded8e0b..95e7483 100644
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk
@@ -10,10 +10,10 @@
 SCALE_SRCS-yes += vpx_scale_rtcd.pl
 
 #neon
-SCALE_SRCS-$(HAVE_NEON)  += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
-SCALE_SRCS-$(HAVE_NEON)  += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
-SCALE_SRCS-$(HAVE_NEON)  += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM)
-SCALE_SRCS-$(HAVE_NEON)  += arm/neon/yv12extend_arm.c
+SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
+SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
+SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM)
+SCALE_SRCS-$(HAVE_NEON_ASM)  += arm/neon/yv12extend_arm.c
 
 #mips(dspr2)
 SCALE_SRCS-$(HAVE_DSPR2)  += mips/dspr2/yv12extend_dspr2.c
diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl
index 8c92570..2e3f1ff 100644
--- a/vpx_scale/vpx_scale_rtcd.pl
+++ b/vpx_scale/vpx_scale_rtcd.pl
@@ -17,10 +17,12 @@
 }
 
 add_proto qw/void vp8_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf";
-specialize qw/vp8_yv12_extend_frame_borders neon/;
+specialize qw/vp8_yv12_extend_frame_borders neon_asm/;
+$vp8_yv12_extend_frame_borders_neon_asm=vp8_yv12_extend_frame_borders_neon;
 
 add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vp8_yv12_copy_frame neon/;
+specialize qw/vp8_yv12_copy_frame neon_asm/;
+$vp8_yv12_copy_frame_neon_asm=vp8_yv12_copy_frame_neon;
 
 add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
 
diff --git a/vpxdec.c b/vpxdec.c
index 6356961..ed37c70 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -33,7 +33,9 @@
 #include "./md5_utils.h"
 
 #include "./tools_common.h"
+#if CONFIG_WEBM_IO
 #include "./webmdec.h"
+#endif
 #include "./y4menc.h"
 
 static const char *exec_name;
@@ -528,9 +530,11 @@
 
   struct VpxDecInputContext input = {0};
   struct VpxInputContext vpx_input_ctx = {0};
+#if CONFIG_WEBM_IO
   struct WebmInputContext webm_ctx = {0};
-  input.vpx_input_ctx = &vpx_input_ctx;
   input.webm_ctx = &webm_ctx;
+#endif
+  input.vpx_input_ctx = &vpx_input_ctx;
 
   /* Parse command line */
   exec_name = argv_[0];
diff --git a/vpxenc.c b/vpxenc.c
index 3b8591e..96a7ab6 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -42,7 +42,9 @@
 #include "./rate_hist.h"
 #include "./vpxstats.h"
 #include "./warnings.h"
+#if CONFIG_WEBM_IO
 #include "./webmenc.h"
+#endif
 #include "./y4minput.h"
 
 /* Swallow warnings about unused results of fread/fwrite */
@@ -207,6 +209,7 @@
                                                   "Frame width");
 static const arg_def_t height           = ARG_DEF("h", "height", 1,
                                                   "Frame height");
+#if CONFIG_WEBM_IO
 static const struct arg_enum_list stereo_mode_enum[] = {
   {"mono", STEREO_FORMAT_MONO},
   {"left-right", STEREO_FORMAT_LEFT_RIGHT},
@@ -217,6 +220,7 @@
 };
 static const arg_def_t stereo_mode      = ARG_DEF_ENUM(NULL, "stereo-mode", 1,
                                                        "Stereo 3D video format", stereo_mode_enum);
+#endif
 static const arg_def_t timebase         = ARG_DEF(NULL, "timebase", 1,
                                                   "Output timestamp precision (fractional seconds)");
 static const arg_def_t error_resilient  = ARG_DEF(NULL, "error-resilient", 1,
@@ -226,7 +230,11 @@
 
 static const arg_def_t *global_args[] = {
   &use_yv12, &use_i420, &usage, &threads, &profile,
-  &width, &height, &stereo_mode, &timebase, &framerate,
+  &width, &height,
+#if CONFIG_WEBM_IO
+  &stereo_mode,
+#endif
+  &timebase, &framerate,
   &error_resilient,
   &lag_in_frames, NULL
 };
@@ -363,7 +371,7 @@
 
 static const arg_def_t *vp9_args[] = {
   &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
-  &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength,
+  &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
   &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless,
   &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
   NULL
@@ -372,7 +380,7 @@
   VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,
   VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
   VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,
-  VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH,
+  VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
   VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
   VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
   VP9E_SET_FRAME_PERIODIC_BOOST,
@@ -554,6 +562,11 @@
                              NELEMENTS(vp9_arg_ctrl_map))
 #endif
 
+#if !CONFIG_WEBM_IO
+typedef int stereo_format_t;
+struct EbmlGlobal { int debug; };
+#endif
+
 /* Per-stream configuration */
 struct stream_config {
   struct vpx_codec_enc_cfg  cfg;
@@ -792,9 +805,9 @@
     stream->config.cfg.g_h = 0;
 
     /* Initialize remaining stream parameters */
-    stream->config.stereo_fmt = STEREO_FORMAT_MONO;
     stream->config.write_webm = 1;
 #if CONFIG_WEBM_IO
+    stream->config.stereo_fmt = STEREO_FORMAT_MONO;
     stream->ebml.last_pts_ns = -1;
     stream->ebml.writer = NULL;
     stream->ebml.segment = NULL;
@@ -869,8 +882,10 @@
       config->cfg.g_w = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &height, argi)) {
       config->cfg.g_h = arg_parse_uint(&arg);
+#if CONFIG_WEBM_IO
     } else if (arg_match(&arg, &stereo_mode, argi)) {
       config->stereo_fmt = arg_parse_enum_or_int(&arg);
+#endif
     } else if (arg_match(&arg, &timebase, argi)) {
       config->cfg.g_timebase = arg_parse_rational(&arg);
       validate_positive_rational(arg.name, &config->cfg.g_timebase);
diff --git a/webmdec.c b/webmdec.c
deleted file mode 100644
index 93a8d9f..0000000
--- a/webmdec.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./webmdec.h"
-
-#include <stdarg.h>
-
-#include "third_party/nestegg/include/nestegg/nestegg.h"
-
-static int nestegg_read_cb(void *buffer, size_t length, void *userdata) {
-  FILE *f = userdata;
-
-  if (fread(buffer, 1, length, f) < length) {
-    if (ferror(f))
-      return -1;
-    if (feof(f))
-      return 0;
-  }
-  return 1;
-}
-
-static int nestegg_seek_cb(int64_t offset, int whence, void *userdata) {
-  switch (whence) {
-    case NESTEGG_SEEK_SET:
-      whence = SEEK_SET;
-      break;
-    case NESTEGG_SEEK_CUR:
-      whence = SEEK_CUR;
-      break;
-    case NESTEGG_SEEK_END:
-      whence = SEEK_END;
-      break;
-  };
-  return fseek(userdata, (int32_t)offset, whence) ? -1 : 0;
-}
-
-static int64_t nestegg_tell_cb(void *userdata) {
-  return ftell(userdata);
-}
-
-static void nestegg_log_cb(nestegg *context,
-                           unsigned int severity,
-                           char const *format, ...) {
-  va_list ap;
-  va_start(ap, format);
-  vfprintf(stderr, format, ap);
-  fprintf(stderr, "\n");
-  va_end(ap);
-}
-
-int file_is_webm(struct WebmInputContext *webm_ctx,
-                 struct VpxInputContext *vpx_ctx) {
-  uint32_t i, n;
-  int track_type = -1;
-  int codec_id;
-
-  nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb, 0};
-  nestegg_video_params params;
-
-  io.userdata = vpx_ctx->file;
-  if (nestegg_init(&webm_ctx->nestegg_ctx, io, NULL, -1))
-    goto fail;
-
-  if (nestegg_track_count(webm_ctx->nestegg_ctx, &n))
-    goto fail;
-
-  for (i = 0; i < n; i++) {
-    track_type = nestegg_track_type(webm_ctx->nestegg_ctx, i);
-
-    if (track_type == NESTEGG_TRACK_VIDEO)
-      break;
-    else if (track_type < 0)
-      goto fail;
-  }
-
-  codec_id = nestegg_track_codec_id(webm_ctx->nestegg_ctx, i);
-  if (codec_id == NESTEGG_CODEC_VP8) {
-    vpx_ctx->fourcc = VP8_FOURCC;
-  } else if (codec_id == NESTEGG_CODEC_VP9) {
-    vpx_ctx->fourcc = VP9_FOURCC;
-  } else {
-    fprintf(stderr, "Not VPx video, quitting.\n");
-    goto fail;
-  }
-
-  webm_ctx->video_track = i;
-
-  if (nestegg_track_video_params(webm_ctx->nestegg_ctx, i, &params))
-    goto fail;
-
-  vpx_ctx->framerate.denominator = 0;
-  vpx_ctx->framerate.numerator = 0;
-  vpx_ctx->width = params.width;
-  vpx_ctx->height = params.height;
-
-  return 1;
-
- fail:
-  webm_ctx->nestegg_ctx = NULL;
-  rewind(vpx_ctx->file);
-
-  return 0;
-}
-
-int webm_read_frame(struct WebmInputContext *webm_ctx,
-                    uint8_t **buffer,
-                    size_t *bytes_in_buffer,
-                    size_t *buffer_size) {
-  if (webm_ctx->chunk >= webm_ctx->chunks) {
-    uint32_t track;
-    int status;
-
-    do {
-      /* End of this packet, get another. */
-      if (webm_ctx->pkt) {
-        nestegg_free_packet(webm_ctx->pkt);
-        webm_ctx->pkt = NULL;
-      }
-
-      status = nestegg_read_packet(webm_ctx->nestegg_ctx, &webm_ctx->pkt);
-      if (status <= 0)
-        return status ? status : 1;
-
-      if (nestegg_packet_track(webm_ctx->pkt, &track))
-        return -1;
-    } while (track != webm_ctx->video_track);
-
-    if (nestegg_packet_count(webm_ctx->pkt, &webm_ctx->chunks))
-      return -1;
-
-    webm_ctx->chunk = 0;
-  }
-
-  if (nestegg_packet_data(webm_ctx->pkt, webm_ctx->chunk,
-                          buffer, bytes_in_buffer)) {
-    return -1;
-  }
-
-  webm_ctx->chunk++;
-  return 0;
-}
-
-int webm_guess_framerate(struct WebmInputContext *webm_ctx,
-                         struct VpxInputContext *vpx_ctx) {
-  uint32_t i;
-  uint64_t tstamp = 0;
-
-  /* Check to see if we can seek before we parse any data. */
-  if (nestegg_track_seek(webm_ctx->nestegg_ctx, webm_ctx->video_track, 0)) {
-    fprintf(stderr, "Failed to guess framerate (no Cues), set to 30fps.\n");
-    vpx_ctx->framerate.numerator = 30;
-    vpx_ctx->framerate.denominator  = 1;
-    return 0;
-  }
-
-  /* Guess the framerate. Read up to 1 second, or 50 video packets,
-   * whichever comes first.
-   */
-  for (i = 0; tstamp < 1000000000 && i < 50;) {
-    nestegg_packet *pkt;
-    uint32_t track;
-
-    if (nestegg_read_packet(webm_ctx->nestegg_ctx, &pkt) <= 0)
-      break;
-
-    nestegg_packet_track(pkt, &track);
-    if (track == webm_ctx->video_track) {
-      nestegg_packet_tstamp(pkt, &tstamp);
-      ++i;
-    }
-
-    nestegg_free_packet(pkt);
-  }
-
-  if (nestegg_track_seek(webm_ctx->nestegg_ctx, webm_ctx->video_track, 0))
-    goto fail;
-
-  vpx_ctx->framerate.numerator = (i - 1) * 1000000;
-  vpx_ctx->framerate.denominator = (int)(tstamp / 1000);
-  return 0;
-
- fail:
-  nestegg_destroy(webm_ctx->nestegg_ctx);
-  webm_ctx->nestegg_ctx = NULL;
-  rewind(vpx_ctx->file);
-  return 1;
-}
-
-void webm_free(struct WebmInputContext *webm_ctx) {
-  if (webm_ctx && webm_ctx->nestegg_ctx) {
-    if (webm_ctx->pkt)
-      nestegg_free_packet(webm_ctx->pkt);
-    nestegg_destroy(webm_ctx->nestegg_ctx);
-  }
-}
diff --git a/webmdec.cc b/webmdec.cc
new file mode 100644
index 0000000..4383e8e
--- /dev/null
+++ b/webmdec.cc
@@ -0,0 +1,219 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./webmdec.h"
+
+#include <cstring>
+#include <cstdio>
+
+#include "third_party/libwebm/mkvparser.hpp"
+#include "third_party/libwebm/mkvreader.hpp"
+
+namespace {
+
+void reset(struct WebmInputContext *const webm_ctx) {
+  if (webm_ctx->reader != NULL) {
+    mkvparser::MkvReader *const reader =
+        reinterpret_cast<mkvparser::MkvReader*>(webm_ctx->reader);
+    delete reader;
+  }
+  if (webm_ctx->segment != NULL) {
+    mkvparser::Segment *const segment =
+        reinterpret_cast<mkvparser::Segment*>(webm_ctx->segment);
+    delete segment;
+  }
+  if (webm_ctx->buffer != NULL) {
+    delete[] webm_ctx->buffer;
+  }
+  webm_ctx->reader = NULL;
+  webm_ctx->segment = NULL;
+  webm_ctx->buffer = NULL;
+  webm_ctx->cluster = NULL;
+  webm_ctx->block_entry = NULL;
+  webm_ctx->block = NULL;
+  webm_ctx->block_frame_index = 0;
+  webm_ctx->video_track_index = 0;
+  webm_ctx->timestamp_ns = 0;
+}
+
+void get_first_cluster(struct WebmInputContext *const webm_ctx) {
+  mkvparser::Segment *const segment =
+      reinterpret_cast<mkvparser::Segment*>(webm_ctx->segment);
+  const mkvparser::Cluster *const cluster = segment->GetFirst();
+  webm_ctx->cluster = cluster;
+}
+
+void rewind_and_reset(struct WebmInputContext *const webm_ctx,
+                      struct VpxInputContext *const vpx_ctx) {
+  rewind(vpx_ctx->file);
+  reset(webm_ctx);
+}
+
+}  // namespace
+
+int file_is_webm(struct WebmInputContext *webm_ctx,
+                 struct VpxInputContext *vpx_ctx) {
+  mkvparser::MkvReader *const reader = new mkvparser::MkvReader(vpx_ctx->file);
+  webm_ctx->reader = reader;
+
+  mkvparser::EBMLHeader header;
+  long long pos = 0;
+  if (header.Parse(reader, pos) < 0) {
+    rewind_and_reset(webm_ctx, vpx_ctx);
+    return 0;
+  }
+
+  mkvparser::Segment* segment;
+  if (mkvparser::Segment::CreateInstance(reader, pos, segment)) {
+    rewind_and_reset(webm_ctx, vpx_ctx);
+    return 0;
+  }
+  webm_ctx->segment = segment;
+  if (segment->Load() < 0) {
+    rewind_and_reset(webm_ctx, vpx_ctx);
+    return 0;
+  }
+
+  const mkvparser::Tracks *const tracks = segment->GetTracks();
+  const mkvparser::VideoTrack* video_track = NULL;
+  for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) {
+    const mkvparser::Track* const track = tracks->GetTrackByIndex(i);
+    if (track->GetType() == mkvparser::Track::kVideo) {
+      video_track = static_cast<const mkvparser::VideoTrack*>(track);
+      webm_ctx->video_track_index = track->GetNumber();
+      break;
+    }
+  }
+
+  if (video_track == NULL) {
+    rewind_and_reset(webm_ctx, vpx_ctx);
+    return 0;
+  }
+
+  if (!strncmp(video_track->GetCodecId(), "V_VP8", 5)) {
+    vpx_ctx->fourcc = VP8_FOURCC;
+  } else if (!strncmp(video_track->GetCodecId(), "V_VP9", 5)) {
+    vpx_ctx->fourcc = VP9_FOURCC;
+  } else {
+    rewind_and_reset(webm_ctx, vpx_ctx);
+    return 0;
+  }
+
+  vpx_ctx->framerate.denominator = 0;
+  vpx_ctx->framerate.numerator = 0;
+  vpx_ctx->width = static_cast<uint32_t>(video_track->GetWidth());
+  vpx_ctx->height = static_cast<uint32_t>(video_track->GetHeight());
+
+  get_first_cluster(webm_ctx);
+
+  return 1;
+}
+
+int webm_read_frame(struct WebmInputContext *webm_ctx,
+                    uint8_t **buffer,
+                    size_t *bytes_in_buffer,
+                    size_t *buffer_size) {
+  mkvparser::Segment *const segment =
+      reinterpret_cast<mkvparser::Segment*>(webm_ctx->segment);
+  const mkvparser::Cluster* cluster =
+      reinterpret_cast<const mkvparser::Cluster*>(webm_ctx->cluster);
+  const mkvparser::Block *block =
+      reinterpret_cast<const mkvparser::Block*>(webm_ctx->block);
+  const mkvparser::BlockEntry *block_entry =
+      reinterpret_cast<const mkvparser::BlockEntry*>(webm_ctx->block_entry);
+  bool block_entry_eos = false;
+  do {
+    long status = 0;
+    bool get_new_block = false;
+    if (block_entry == NULL && !block_entry_eos) {
+      status = cluster->GetFirst(block_entry);
+      get_new_block = true;
+    } else if (block_entry_eos || block_entry->EOS()) {
+      cluster = segment->GetNext(cluster);
+      if (cluster == NULL || cluster->EOS()) {
+        *bytes_in_buffer = 0;
+        return 1;
+      }
+      status = cluster->GetFirst(block_entry);
+      block_entry_eos = false;
+      get_new_block = true;
+    } else if (block == NULL ||
+               webm_ctx->block_frame_index == block->GetFrameCount() ||
+               block->GetTrackNumber() != webm_ctx->video_track_index) {
+      status = cluster->GetNext(block_entry, block_entry);
+      if (block_entry == NULL || block_entry->EOS()) {
+        block_entry_eos = true;
+        continue;
+      }
+      get_new_block = true;
+    }
+    if (status) {
+      return -1;
+    }
+    if (get_new_block) {
+      block = block_entry->GetBlock();
+      webm_ctx->block_frame_index = 0;
+    }
+  } while (block->GetTrackNumber() != webm_ctx->video_track_index ||
+           block_entry_eos);
+
+  webm_ctx->cluster = cluster;
+  webm_ctx->block_entry = block_entry;
+  webm_ctx->block = block;
+
+  const mkvparser::Block::Frame& frame =
+      block->GetFrame(webm_ctx->block_frame_index);
+  ++webm_ctx->block_frame_index;
+  if (frame.len > static_cast<long>(*buffer_size)) {
+    delete[] *buffer;
+    *buffer = new uint8_t[frame.len];
+    if (*buffer == NULL) {
+      return -1;
+    }
+    *buffer_size = frame.len;
+    webm_ctx->buffer = *buffer;
+  }
+  *bytes_in_buffer = frame.len;
+  webm_ctx->timestamp_ns = block->GetTime(cluster);
+
+  mkvparser::MkvReader *const reader =
+      reinterpret_cast<mkvparser::MkvReader*>(webm_ctx->reader);
+  return frame.Read(reader, *buffer) ? -1 : 0;
+}
+
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+                         struct VpxInputContext *vpx_ctx) {
+  uint32_t i = 0;
+  uint8_t *buffer = NULL;
+  size_t bytes_in_buffer = 0;
+  size_t buffer_size = 0;
+  while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
+    if (webm_read_frame(webm_ctx, &buffer, &bytes_in_buffer, &buffer_size)) {
+      break;
+    }
+    ++i;
+  }
+  vpx_ctx->framerate.numerator = (i - 1) * 1000000;
+  vpx_ctx->framerate.denominator =
+      static_cast<int>(webm_ctx->timestamp_ns / 1000);
+  delete[] buffer;
+
+  get_first_cluster(webm_ctx);
+  webm_ctx->block = NULL;
+  webm_ctx->block_entry = NULL;
+  webm_ctx->block_frame_index = 0;
+  webm_ctx->timestamp_ns = 0;
+
+  return 0;
+}
+
+void webm_free(struct WebmInputContext *webm_ctx) {
+  reset(webm_ctx);
+}
diff --git a/webmdec.h b/webmdec.h
index 108c6ad..29b815d 100644
--- a/webmdec.h
+++ b/webmdec.h
@@ -16,34 +16,53 @@
 extern "C" {
 #endif
 
-struct nestegg;
-struct nestegg_packet;
 struct VpxInputContext;
 
 struct WebmInputContext {
-  uint32_t chunk;
-  uint32_t chunks;
-  uint32_t video_track;
-  struct nestegg *nestegg_ctx;
-  struct nestegg_packet *pkt;
+  void *reader;
+  void *segment;
+  uint8_t *buffer;
+  const void *cluster;
+  const void *block_entry;
+  const void *block;
+  int block_frame_index;
+  int video_track_index;
+  uint64_t timestamp_ns;
 };
 
+// Checks if the input is a WebM file. If so, initializes WebMInputContext so
+// that webm_read_frame can be called to retrieve a video frame.
+// Returns 1 on success and 0 on failure or input is not WebM file.
+// TODO(vigneshv): Refactor this function into two smaller functions specific
+// to their task.
 int file_is_webm(struct WebmInputContext *webm_ctx,
                  struct VpxInputContext *vpx_ctx);
 
-/* Reads a WebM video frame. Return values:
- *   0 - Success
- *   1 - End of File
- *  -1 - Error
- */
+// Reads a WebM Video Frame. Memory for the buffer is created, owned and managed
+// by this function. For the first call, |buffer| should be NULL and
+// |*bytes_in_buffer| should be 0. Once all the frames are read and used,
+// webm_free() should be called, otherwise there will be a leak.
+// Parameters:
+//      webm_ctx - WebmInputContext object
+//      buffer - pointer where the frame data will be filled.
+//      bytes_in_buffer - pointer to buffer size.
+//      buffer_size - unused TODO(vigneshv): remove this
+// Return values:
+//      0 - Success
+//      1 - End of Stream
+//     -1 - Error
+// TODO(vigneshv): Make the return values consistent across all functions in
+// this file.
 int webm_read_frame(struct WebmInputContext *webm_ctx,
                     uint8_t **buffer,
                     size_t *bytes_in_buffer,
                     size_t *buffer_size);
 
+// Guesses the frame rate of the input file based on the container timestamps.
 int webm_guess_framerate(struct WebmInputContext *webm_ctx,
                          struct VpxInputContext *vpx_ctx);
 
+// Resets the WebMInputContext.
 void webm_free(struct WebmInputContext *webm_ctx);
 
 #ifdef __cplusplus