Merge "Cleaning up vpx_codec_get_cx_data() function."
diff --git a/.gitignore b/.gitignore
index dfeae99..aa95d57 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,15 +1,19 @@
+*.S
 *.a
 *.asm.s
 *.d
-*.gcno
 *.gcda
+*.gcno
 *.o
 *~
-/*.ivf
-/*.ivf.md5
+.cproject
+.project
+.settings
 /*-*.mk
 /*.asm
 /*.doxy
+/*.ivf
+/*.ivf.md5
 /.bins
 /.deps
 /.docs
@@ -18,16 +22,24 @@
 /Makefile
 /config.log
 /config.mk
-/decode_to_md5
-/decode_to_md5.dox
-/decode_with_drops
-/decode_with_drops.dox
 /docs/
 /doxyfile
-/error_resilient
-/error_resilient.dox
-/force_keyframe
-/force_keyframe.dox
+/examples/*.dox
+/examples/decode_to_md5
+/examples/decode_with_drops
+/examples/decode_with_partial_drops
+/examples/error_resilient
+/examples/example_xma
+/examples/force_keyframe
+/examples/postproc
+/examples/simple_decoder
+/examples/simple_encoder
+/examples/twopass_encoder
+/examples/vp8_multi_resolution_encoder
+/examples/vp8_set_maps
+/examples/vp8cx_set_ref
+/examples/vp9_spatial_scalable_encoder
+/examples/vpx_temporal_scalable_patterns
 /ivfdec
 /ivfdec.dox
 /ivfenc
@@ -35,31 +47,17 @@
 /libvpx.so*
 /libvpx.ver
 /obj_int_extract
-/postproc
-/postproc.dox
 /samples.dox
-/simple_decoder
-/simple_decoder.dox
-/simple_encoder
-/simple_encoder.dox
 /test_libvpx
-/twopass_encoder
-/twopass_encoder.dox
 /vp8_api1_migration.dox
-/vp8_scalable_patterns
-/vp8_scalable_patterns.dox
-/vp8_set_maps
-/vp8_set_maps.dox
-/vp8cx_set_ref
-/vp8cx_set_ref.dox
+/vp[89x]_rtcd.h
 /vpx.pc
 /vpx_config.c
 /vpx_config.h
-/vpx_rtcd.h
+/vpx_scale_rtcd.h
 /vpx_version.h
 /vpxdec
+/vpxdec.dox
 /vpxenc
+/vpxenc.dox
 TAGS
-.cproject
-.project
-.settings
diff --git a/build/arm-msvs/obj_int_extract.bat b/build/arm-msvs/obj_int_extract.bat
index 7fd16a3..267ed61 100644
--- a/build/arm-msvs/obj_int_extract.bat
+++ b/build/arm-msvs/obj_int_extract.bat
@@ -7,8 +7,12 @@
 REM   be found in the AUTHORS file in the root of the source tree.
 echo on
 
+REM Arguments:
+REM   %1 - Relative path to the directory containing the vp8 and vpx_scale
+REM        source directories.
+REM   %2 - Path to obj_int_extract.exe.
 cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/encoder/vp8_asm_enc_offsets.c"
-obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
+%2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
 
 cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vpx_scale/vpx_scale_asm_offsets.c"
-obj_int_extract.exe rvds "vpx_scale_asm_offsets.obj" > "vpx_scale_asm_offsets.asm"
+%2\obj_int_extract.exe rvds "vpx_scale_asm_offsets.obj" > "vpx_scale_asm_offsets.asm"
diff --git a/build/make/configure.sh b/build/make/configure.sh
index b5151da..c379c74 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1199,8 +1199,8 @@
     fi
 
     # default use_x86inc to yes if pic is no or 64bit or we are not on darwin
-    echo "  checking here for x86inc \"${tgt_isa}\" \"$pic\" "
-    if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o "${tgt_os#darwin}" = "${tgt_os}"  ]; then
+    if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o \
+         "${tgt_os#darwin}" = "${tgt_os}"  ]; then
       soft_enable use_x86inc
     fi
 
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index 7df0334..5936370 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -371,7 +371,7 @@
                     vpx)
                         tag Tool \
                             Name="VCPreBuildEventTool" \
-                            CommandLine="call obj_int_extract.bat $src_path_bare" \
+                            CommandLine="call obj_int_extract.bat $src_path_bare $plat_no_ws\\\$(ConfigurationName)" \
 
                         tag Tool \
                             Name="VCCLCompilerTool" \
@@ -412,7 +412,6 @@
                             obj_int_extract)
                                 tag Tool \
                                     Name="VCLinkerTool" \
-                                    OutputFile="${name}.exe" \
                                     GenerateDebugInformation="true" \
                             ;;
                             *)
@@ -479,7 +478,7 @@
                     vpx)
                         tag Tool \
                             Name="VCPreBuildEventTool" \
-                            CommandLine="call obj_int_extract.bat $src_path_bare" \
+                            CommandLine="call obj_int_extract.bat $src_path_bare $plat_no_ws\\\$(ConfigurationName)" \
 
                         tag Tool \
                             Name="VCCLCompilerTool" \
@@ -522,7 +521,6 @@
                             obj_int_extract)
                                 tag Tool \
                                     Name="VCLinkerTool" \
-                                    OutputFile="${name}.exe" \
                                     GenerateDebugInformation="true" \
                             ;;
                             *)
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index a6315b9..4558aa1 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -156,6 +156,10 @@
                 objf=$(echo ${f%.*}.obj | sed -e 's/^[\./]\+//g' -e 's,/,_,g')
 
                 if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then
+                    # Avoid object file name collisions, e.g. vpx_config.c and
+                    # vpx_config.asm would produce the same object file without
+                    # this additional suffix.
+                    objf=${objf%.obj}_asm.obj
                     open_tag CustomBuild \
                         Include=".\\$f"
                     for plat in "${platforms[@]}"; do
@@ -430,6 +434,14 @@
                 Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'"
             tag_content OutDir "\$(SolutionDir)$plat_no_ws\\\$(Configuration)\\"
             tag_content IntDir "$plat_no_ws\\\$(Configuration)\\${name}\\"
+            if [ "$proj_kind" == "lib" ]; then
+              if [ "$config" == "Debug" ]; then
+                config_suffix=d
+              else
+                config_suffix=""
+              fi
+              tag_content TargetName "${name}${lib_sfx}${config_suffix}"
+            fi
             close_tag PropertyGroup
         done
     done
@@ -438,9 +450,13 @@
         for config in Debug Release; do
             open_tag ItemDefinitionGroup \
                 Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'"
-            if [ "$name" = "vpx" ]; then
+            if [ "$name" == "vpx" ]; then
+                hostplat=$plat
+                if [ "$hostplat" == "ARM" ]; then
+                    hostplat=Win32
+                fi
                 open_tag PreBuildEvent
-                tag_content Command "call obj_int_extract.bat $src_path_bare"
+                tag_content Command "call obj_int_extract.bat $src_path_bare $hostplat\\\$(Configuration)"
                 close_tag PreBuildEvent
             fi
             open_tag ClCompile
@@ -448,7 +464,6 @@
                 opt=Disabled
                 runtime=$debug_runtime
                 curlibs=$debug_libs
-                confsuffix=d
                 case "$name" in
                 obj_int_extract)
                     debug=DEBUG
@@ -461,7 +476,6 @@
                 opt=MaxSpeed
                 runtime=$release_runtime
                 curlibs=$libs
-                confsuffix=""
                 tag_content FavorSizeOrSpeed Speed
                 debug=NDEBUG
             fi
@@ -483,9 +497,7 @@
             case "$proj_kind" in
             exe)
                 open_tag Link
-                if [ "$name" = "obj_int_extract" ]; then
-                    tag_content OutputFile "${name}.exe"
-                else
+                if [ "$name" != "obj_int_extract" ]; then
                     tag_content AdditionalDependencies "$curlibs"
                     tag_content AdditionalLibraryDirectories "$libdirs;%(AdditionalLibraryDirectories)"
                 fi
@@ -499,9 +511,6 @@
                 close_tag Link
                 ;;
             lib)
-                open_tag Lib
-                tag_content OutputFile "\$(OutDir)${name}${lib_sfx}${confsuffix}.lib"
-                close_tag Lib
                 ;;
             esac
             close_tag ItemDefinitionGroup
diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
index ed03713..93c9adc 100755
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -333,6 +333,7 @@
 #
 # Main Driver
 #
+ALL_FUNCS=$(export LC_ALL=C; echo $ALL_FUNCS | tr ' ' '\n' | sort |tr '\n' ' ')
 require c
 case $arch in
   x86)
diff --git a/build/x86-msvs/obj_int_extract.bat b/build/x86-msvs/obj_int_extract.bat
index 4e9b0ec..44d095d 100644
--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -7,6 +7,9 @@
 REM   be found in the AUTHORS file in the root of the source tree.
 echo on
 
+REM Arguments:
+REM   %1 - Relative path to the directory containing the vp8 source directory.
+REM   %2 - Path to obj_int_extract.exe.
 cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c"
-obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
+%2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
 
diff --git a/docs.mk b/docs.mk
index 9426f76..797b466 100644
--- a/docs.mk
+++ b/docs.mk
@@ -30,7 +30,9 @@
 
 
 EXAMPLE_PATH += $(SRC_PATH_BARE) #for CHANGELOG, README, etc
+EXAMPLE_PATH += $(SRC_PATH_BARE)/examples
 
+doxyfile: $(if $(findstring examples, $(ALL_TARGETS)),examples.doxy)
 doxyfile: libs.doxy_template libs.doxy
 	@echo "    [CREATE] $@"
 	@cat $^ > $@
diff --git a/examples.mk b/examples.mk
index 5f12b6b..85b8457 100644
--- a/examples.mk
+++ b/examples.mk
@@ -19,6 +19,8 @@
 # while EXAMPLES demonstrate specific portions of the API.
 UTILS-$(CONFIG_DECODERS)    += vpxdec.c
 vpxdec.SRCS                 += md5_utils.c md5_utils.h
+vpxdec.SRCS                 += vpx_ports/mem_ops.h
+vpxdec.SRCS                 += vpx_ports/mem_ops_aligned.h
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h
@@ -26,13 +28,13 @@
 vpxdec.SRCS                 += tools_common.c tools_common.h
 vpxdec.SRCS                 += webmdec.c webmdec.h
 vpxdec.SRCS                 += y4menc.c y4menc.h
-vpxdec.SRCS                 += nestegg/halloc/halloc.h
-vpxdec.SRCS                 += nestegg/halloc/src/align.h
-vpxdec.SRCS                 += nestegg/halloc/src/halloc.c
-vpxdec.SRCS                 += nestegg/halloc/src/hlist.h
-vpxdec.SRCS                 += nestegg/halloc/src/macros.h
-vpxdec.SRCS                 += nestegg/include/nestegg/nestegg.h
-vpxdec.SRCS                 += nestegg/src/nestegg.c
+vpxdec.SRCS                 += third_party/nestegg/halloc/halloc.h
+vpxdec.SRCS                 += third_party/nestegg/halloc/src/align.h
+vpxdec.SRCS                 += third_party/nestegg/halloc/src/halloc.c
+vpxdec.SRCS                 += third_party/nestegg/halloc/src/hlist.h
+vpxdec.SRCS                 += third_party/nestegg/halloc/src/macros.h
+vpxdec.SRCS                 += third_party/nestegg/include/nestegg/nestegg.h
+vpxdec.SRCS                 += third_party/nestegg/src/nestegg.c
 vpxdec.SRCS                 += $(LIBYUV_SRCS)
 vpxdec.GUID                  = BA5FE66F-38DD-E034-F542-B1578C5FB950
 vpxdec.DESCRIPTION           = Full featured decoder
@@ -54,7 +56,7 @@
 vpxenc.SRCS                 += $(LIBYUV_SRCS)
 vpxenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
 vpxenc.DESCRIPTION           = Full featured encoder
-UTILS-$(CONFIG_VP9_ENCODER)    += vp9_spatial_scalable_encoder.c
+EXAMPLES-$(CONFIG_VP9_ENCODER)    += vp9_spatial_scalable_encoder.c
 vp9_spatial_scalable_encoder.SRCS += args.c args.h
 vp9_spatial_scalable_encoder.SRCS += ivfenc.c ivfenc.h
 vp9_spatial_scalable_encoder.SRCS += tools_common.c tools_common.h
@@ -85,12 +87,16 @@
 simple_decoder.SRCS                += tools_common.h tools_common.c
 simple_decoder.SRCS                += video_common.h
 simple_decoder.SRCS                += video_reader.h video_reader.c
+simple_decoder.SRCS                += vpx_ports/mem_ops.h
+simple_decoder.SRCS                += vpx_ports/mem_ops_aligned.h
 simple_decoder.DESCRIPTION          = Simplified decoder loop
 EXAMPLES-$(CONFIG_VP8_DECODER)     += postproc.c
 postproc.SRCS                      += ivfdec.h ivfdec.c
 postproc.SRCS                      += tools_common.h tools_common.c
 postproc.SRCS                      += video_common.h
 postproc.SRCS                      += video_reader.h video_reader.c
+postproc.SRCS                      += vpx_ports/mem_ops.h
+postproc.SRCS                      += vpx_ports/mem_ops_aligned.h
 postproc.GUID                       = 65E33355-F35E-4088-884D-3FD4905881D7
 postproc.DESCRIPTION                = Decoder postprocessor control
 EXAMPLES-$(CONFIG_VP8_DECODER)     += decode_to_md5.c
@@ -99,6 +105,8 @@
 decode_to_md5.SRCS                 += tools_common.h tools_common.c
 decode_to_md5.SRCS                 += video_common.h
 decode_to_md5.SRCS                 += video_reader.h video_reader.c
+decode_to_md5.SRCS                 += vpx_ports/mem_ops.h
+decode_to_md5.SRCS                 += vpx_ports/mem_ops_aligned.h
 decode_to_md5.GUID                  = 59120B9B-2735-4BFE-B022-146CA340FE42
 decode_to_md5.DESCRIPTION           = Frame by frame MD5 checksum
 EXAMPLES-$(CONFIG_VP8_ENCODER)  += simple_encoder.c
@@ -124,6 +132,8 @@
 decode_with_drops.SRCS          += tools_common.h tools_common.c
 decode_with_drops.SRCS          += video_common.h
 decode_with_drops.SRCS          += video_reader.h video_reader.c
+decode_with_drops.SRCS          += vpx_ports/mem_ops.h
+decode_with_drops.SRCS          += vpx_ports/mem_ops_aligned.h
 endif
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
@@ -137,6 +147,10 @@
 error_resilient.DESCRIPTION      = Error Resiliency Feature
 
 EXAMPLES-$(CONFIG_VP8_ENCODER)     += vp8_set_maps.c
+vp8_set_maps.SRCS                  += ivfenc.h ivfenc.c
+vp8_set_maps.SRCS                  += tools_common.h tools_common.c
+vp8_set_maps.SRCS                  += video_common.h
+vp8_set_maps.SRCS                  += video_writer.h video_writer.c
 vp8_set_maps.GUID                   = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
 vp8_set_maps.DESCRIPTION            = VP8 set active and ROI maps
 EXAMPLES-$(CONFIG_VP8_ENCODER)     += vp8cx_set_ref.c
@@ -281,3 +295,36 @@
                                $(addprefix bin/$(p)/,$(ALL_EXAMPLES_BASENAME:.c=.exe)))
 $(foreach proj,$(call enabled,PROJECTS),\
     $(eval $(call vcproj_template,$(proj))))
+
+#
+# Documentation Rules
+#
+%.dox: %.c
+	@echo "    [DOXY] $@"
+	@echo "/*!\page example_$(@F:.dox=) $(@F:.dox=)" > $@
+	@echo "   \includelineno $(<F)" >> $@
+	@echo "*/" >> $@
+
+samples.dox: examples.mk
+	@echo "    [DOXY] $@"
+	@echo "/*!\page samples Sample Code" > $@
+	@echo "    This SDK includes a number of sample applications."\
+	      "Each sample documents a feature of the SDK in both prose"\
+	      "and the associated C code."\
+	      "The following samples are included: ">>$@
+	@$(foreach ex,$(sort $(notdir $(EXAMPLES:.c=))),\
+	   echo "     - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+	@echo >> $@
+	@echo "    In addition, the SDK contains a number of utilities."\
+              "Since these utilities are built upon the concepts described"\
+              "in the sample code listed above, they are not documented in"\
+              "pieces like the samples are. Their source is included here"\
+              "for reference. The following utilities are included:" >> $@
+	@$(foreach ex,$(sort $(UTILS:.c=)),\
+	   echo "     - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+	@echo "*/" >> $@
+
+CLEAN-OBJS += examples.doxy samples.dox $(ALL_EXAMPLES:.c=.dox)
+DOCS-yes += examples.doxy samples.dox
+examples.doxy: samples.dox $(ALL_EXAMPLES:.c=.dox)
+	@echo "INPUT += $^" > $@
diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c
index aabac60..28d1ad5 100644
--- a/examples/decode_to_md5.c
+++ b/examples/decode_to_md5.c
@@ -115,7 +115,7 @@
     size_t frame_size = 0;
     const unsigned char *frame = vpx_video_reader_get_frame(reader,
                                                             &frame_size);
-    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 0))
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
       die_codec(&codec, "Failed to decode frame");
 
     while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {
diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c
index c6f7d43..af1aa63 100644
--- a/examples/decode_with_drops.c
+++ b/examples/decode_with_drops.c
@@ -120,7 +120,7 @@
     int skip;
     const unsigned char *frame = vpx_video_reader_get_frame(reader,
                                                             &frame_size);
-    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 0))
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
       die_codec(&codec, "Failed to decode frame.");
 
     ++frame_cnt;
diff --git a/examples/error_resilient.c b/examples/error_resilient.c
index ef0a6c38..19235c8 100644
--- a/examples/error_resilient.c
+++ b/examples/error_resilient.c
@@ -118,7 +118,7 @@
         return;
 
     pts = pkt->data.frame.pts;
-    mem_put_le32(header, pkt->data.frame.sz);
+    mem_put_le32(header, (unsigned int)pkt->data.frame.sz);
     mem_put_le32(header+4, pts&0xFFFFFFFF);
     mem_put_le32(header+8, pts >> 32);
 
diff --git a/examples/force_keyframe.c b/examples/force_keyframe.c
index f03b3d0..6531e47 100644
--- a/examples/force_keyframe.c
+++ b/examples/force_keyframe.c
@@ -119,7 +119,7 @@
         return;
 
     pts = pkt->data.frame.pts;
-    mem_put_le32(header, pkt->data.frame.sz);
+    mem_put_le32(header, (unsigned int)pkt->data.frame.sz);
     mem_put_le32(header+4, pts&0xFFFFFFFF);
     mem_put_le32(header+8, pts >> 32);
 
diff --git a/examples/postproc.c b/examples/postproc.c
index 2912fe6..be08e92 100644
--- a/examples/postproc.c
+++ b/examples/postproc.c
@@ -118,7 +118,7 @@
     };
 
     // Decode the frame with 15ms deadline
-    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 15000))
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 15000))
       die_codec(&codec, "Failed to decode frame");
 
     while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {
diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c
index b0ca77d..8c15051 100644
--- a/examples/simple_decoder.c
+++ b/examples/simple_decoder.c
@@ -134,7 +134,7 @@
     size_t frame_size = 0;
     const unsigned char *frame = vpx_video_reader_get_frame(reader,
                                                             &frame_size);
-    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 0))
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
       die_codec(&codec, "Failed to decode frame.");
 
     while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index f16db66..8bca18c 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -69,9 +69,9 @@
 static void get_frame_stats(vpx_codec_ctx_t *ctx,
                             const vpx_image_t *img,
                             vpx_codec_pts_t pts,
-                            uint64_t duration,
+                            unsigned int duration,
                             vpx_enc_frame_flags_t flags,
-                            uint64_t deadline,
+                            unsigned int deadline,
                             vpx_fixed_buf_t *stats) {
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
@@ -94,9 +94,9 @@
 static void encode_frame(vpx_codec_ctx_t *ctx,
                          const vpx_image_t *img,
                          vpx_codec_pts_t pts,
-                         uint64_t duration,
+                         unsigned int duration,
                          vpx_enc_frame_flags_t flags,
-                         uint64_t deadline,
+                         unsigned int deadline,
                          VpxVideoWriter *writer) {
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index 4c29056..1fef7db 100644
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -18,11 +18,12 @@
 #include <stdlib.h>
 #include <stdarg.h>
 #include <string.h>
-#include "math.h"
+#include <math.h>
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
 #include "vpx_ports/mem_ops.h"
+#include "./tools_common.h"
 #define interface (vpx_codec_vp8_cx())
 #define fourcc    0x30385056
 
@@ -44,21 +45,6 @@
 #include "third_party/libyuv/include/libyuv/scale.h"
 #include "third_party/libyuv/include/libyuv/cpu_id.h"
 
-static double vp8_mse2psnr(double Samples, double Peak, double Mse)
-{
-    double psnr;
-
-    if ((double)Mse > 0.0)
-        psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
-    else
-        psnr = 60;      // Limit to prevent / 0
-
-    if (psnr > 60)
-        psnr = 60;
-
-    return psnr;
-}
-
 static void die(const char *fmt, ...) {
     va_list ap;
 
@@ -454,8 +440,8 @@
         if ( (show_psnr) && (psnr_count[i]>0) )
         {
             int j;
-            double ovpsnr = vp8_mse2psnr(psnr_samples_total[i], 255.0,
-                                         psnr_sse_total[i]);
+            double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
+                                        psnr_sse_total[i]);
 
             fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
 
diff --git a/examples/vp8_set_maps.c b/examples/vp8_set_maps.c
index 4c0e8a0..f3cc9a7 100644
--- a/examples/vp8_set_maps.c
+++ b/examples/vp8_set_maps.c
@@ -44,253 +44,197 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdarg.h>
 #include <string.h>
+
 #define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
-#define interface (vpx_codec_vp8_cx())
-#define fourcc    0x30385056
+#include "vpx/vpx_encoder.h"
 
-#define IVF_FILE_HDR_SZ  (32)
-#define IVF_FRAME_HDR_SZ (12)
+#include "./tools_common.h"
+#include "./video_writer.h"
 
-static void mem_put_le16(char *mem, unsigned int val) {
-    mem[0] = val;
-    mem[1] = val>>8;
+static const char *exec_name;
+
+void usage_exit(void) {  // Prints the usage line to stderr, then exits.
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
 }
 
-static void mem_put_le32(char *mem, unsigned int val) {
-    mem[0] = val;
-    mem[1] = val>>8;
-    mem[2] = val>>16;
-    mem[3] = val>>24;
+// Applies a sample ROI map: one id per g_w/16 x g_h/16 cell, cycling 0..3.
+static void set_roi_map(const vpx_codec_enc_cfg_t *cfg,
+                        vpx_codec_ctx_t *codec) {
+  unsigned int i;
+  vpx_roi_map_t roi = {0};
+
+  roi.rows = cfg->g_h / 16;
+  roi.cols = cfg->g_w / 16;
+
+  roi.delta_q[0] = 0;
+  roi.delta_q[1] = -2;
+  roi.delta_q[2] = -4;
+  roi.delta_q[3] = -6;
+
+  roi.delta_lf[0] = 0;
+  roi.delta_lf[1] = 1;
+  roi.delta_lf[2] = 2;
+  roi.delta_lf[3] = 3;
+
+  roi.static_threshold[0] = 1500;
+  roi.static_threshold[1] = 1000;
+  roi.static_threshold[2] = 500;
+  roi.static_threshold[3] = 0;
+
+  roi.roi_map = (uint8_t *)malloc(roi.rows * roi.cols);
+  if (!roi.roi_map && roi.rows * roi.cols) die("Failed to allocate ROI map");
+  for (i = 0; i < roi.rows * roi.cols; ++i)
+    roi.roi_map[i] = i % 4;
+  if (vpx_codec_control(codec, VP8E_SET_ROI_MAP, &roi))
+    die_codec(codec, "Failed to set ROI map");
+  free(roi.roi_map);
 }
 
-static void die(const char *fmt, ...) {
-    va_list ap;
+// Applies a sample active map whose g_w/16 x g_h/16 entries alternate 0, 1.
+static void set_active_map(const vpx_codec_enc_cfg_t *cfg,
+                           vpx_codec_ctx_t *codec) {
+  unsigned int i;
+  vpx_active_map_t map = {0};
 
-    va_start(ap, fmt);
-    vprintf(fmt, ap);
-    if(fmt[strlen(fmt)-1] != '\n')
-        printf("\n");
-    exit(EXIT_FAILURE);
+  map.rows = cfg->g_h / 16;
+  map.cols = cfg->g_w / 16;
+
+  map.active_map = (uint8_t *)malloc(map.rows * map.cols);
+  if (!map.active_map && map.rows * map.cols) die("Failed to allocate map");
+  for (i = 0; i < map.rows * map.cols; ++i)
+    map.active_map[i] = i % 2;
+  if (vpx_codec_control(codec, VP8E_SET_ACTIVEMAP, &map))
+    die_codec(codec, "Failed to set active map");
+  free(map.active_map);
 }
 
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
-    const char *detail = vpx_codec_error_detail(ctx);
+static void unset_active_map(const vpx_codec_enc_cfg_t *cfg,
+                             vpx_codec_ctx_t *codec) {
+  vpx_active_map_t map = {0};  // request carrying dimensions but no map data
 
-    printf("%s: %s\n", s, vpx_codec_error(ctx));
-    if(detail)
-        printf("    %s\n",detail);
-    exit(EXIT_FAILURE);
+  map.rows = cfg->g_h / 16;
+  map.cols = cfg->g_w / 16;
+  map.active_map = NULL;  // presumably disables the map; confirm
+
+  if (vpx_codec_control(codec, VP8E_SET_ACTIVEMAP, &map))
+    die_codec(codec, "Failed to set active map");
 }
 
-static int read_frame(FILE *f, vpx_image_t *img) {
-    size_t nbytes, to_read;
-    int    res = 1;
+static void encode_frame(vpx_codec_ctx_t *codec,  // Encode img, write packets.
+                         vpx_image_t *img,
+                         int frame_index,
+                         VpxVideoWriter *writer) {  // img may be NULL
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0,
+                                               VPX_DL_GOOD_QUALITY);
+  if (res != VPX_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
 
-    to_read = img->w*img->h*3/2;
-    nbytes = fread(img->planes[0], 1, to_read, f);
-    if(nbytes != to_read) {
-        res = 0;
-        if(nbytes > 0)
-            printf("Warning: Read partial frame. Check your width & height!\n");
+  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {  // skip non-frame packets
+      const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+      if (!vpx_video_writer_write_frame(writer,
+                                        pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(codec, "Failed to write compressed frame");
+      }
+
+      printf(keyframe ? "K" : ".");  // progress: K = keyframe
+      fflush(stdout);  // show progress immediately
     }
-    return res;
-}
-
-static void write_ivf_file_header(FILE *outfile,
-                                  const vpx_codec_enc_cfg_t *cfg,
-                                  int frame_cnt) {
-    char header[32];
-
-    if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
-        return;
-    header[0] = 'D';
-    header[1] = 'K';
-    header[2] = 'I';
-    header[3] = 'F';
-    mem_put_le16(header+4,  0);                   /* version */
-    mem_put_le16(header+6,  32);                  /* headersize */
-    mem_put_le32(header+8,  fourcc);              /* headersize */
-    mem_put_le16(header+12, cfg->g_w);            /* width */
-    mem_put_le16(header+14, cfg->g_h);            /* height */
-    mem_put_le32(header+16, cfg->g_timebase.den); /* rate */
-    mem_put_le32(header+20, cfg->g_timebase.num); /* scale */
-    mem_put_le32(header+24, frame_cnt);           /* length */
-    mem_put_le32(header+28, 0);                   /* unused */
-
-    (void) fwrite(header, 1, 32, outfile);
-}
-
-
-static void write_ivf_frame_header(FILE *outfile,
-                                   const vpx_codec_cx_pkt_t *pkt)
-{
-    char             header[12];
-    vpx_codec_pts_t  pts;
-
-    if(pkt->kind != VPX_CODEC_CX_FRAME_PKT)
-        return;
-
-    pts = pkt->data.frame.pts;
-    mem_put_le32(header, pkt->data.frame.sz);
-    mem_put_le32(header+4, pts&0xFFFFFFFF);
-    mem_put_le32(header+8, pts >> 32);
-
-    (void) fwrite(header, 1, 12, outfile);
+  }
 }
 
 int main(int argc, char **argv) {
-    FILE                *infile, *outfile;
-    vpx_codec_ctx_t      codec;
-    vpx_codec_enc_cfg_t  cfg;
-    int                  frame_cnt = 0;
-    vpx_image_t          raw;
-    vpx_codec_err_t      res;
-    long                 width;
-    long                 height;
-    int                  frame_avail;
-    int                  got_data;
-    int                  flags = 0;
+  FILE *infile = NULL;
+  vpx_codec_ctx_t codec = {0};
+  vpx_codec_enc_cfg_t cfg = {0};
+  int frame_count = 0;
+  vpx_image_t raw = {0};
+  vpx_codec_err_t res;
+  VpxVideoInfo info = {0};
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+  const int fps = 2;        // TODO(dkovalev) add command line argument
+  const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
 
-    /* Open files */
-    if(argc!=5)
-        die("Usage: %s <width> <height> <infile> <outfile>\n", argv[0]);
-    width = strtol(argv[1], NULL, 0);
-    height = strtol(argv[2], NULL, 0);
-    if(width < 16 || width%2 || height <16 || height%2)
-        die("Invalid resolution: %ldx%ld", width, height);
-    if(!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 1))
-        die("Faile to allocate image", width, height);
-    if(!(outfile = fopen(argv[4], "wb")))
-        die("Failed to open %s for writing", argv[4]);
+  exec_name = argv[0];
 
-    printf("Using %s\n",vpx_codec_iface_name(interface));
+  if (argc != 5)
+    die("Invalid number of arguments");
 
-    /* Populate encoder configuration */
-    res = vpx_codec_enc_config_default(interface, &cfg, 0);
-    if(res) {
-        printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
-        return EXIT_FAILURE;
+  encoder = get_vpx_encoder_by_name("vp8");  // only vp8 for now
+  if (!encoder)
+    die("Unsupported codec.");
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(argv[1], NULL, 0);
+  info.frame_height = strtol(argv[2], NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                                             info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->interface()));
+
+  res = vpx_codec_enc_config_default(encoder->interface(), &cfg, 0);
+  if (res)
+    die_codec(&codec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+
+  writer = vpx_video_writer_open(argv[4], kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", argv[4]);
+
+  if (!(infile = fopen(argv[3], "rb")))
+    die("Failed to open %s for reading.", argv[3]);
+
+  if (vpx_codec_enc_init(&codec, encoder->interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  while (vpx_img_read(&raw, infile)) {
+    ++frame_count;
+
+    if (frame_count == 22) {
+      set_roi_map(&cfg, &codec);
+    } else if (frame_count == 33) {
+      set_active_map(&cfg, &codec);
+    } else if (frame_count == 44) {
+      unset_active_map(&cfg, &codec);
     }
 
-    /* Update the default configuration with our settings */
-    cfg.rc_target_bitrate = width * height * cfg.rc_target_bitrate
-                            / cfg.g_w / cfg.g_h;
-    cfg.g_w = width;
-    cfg.g_h = height;
+    encode_frame(&codec, &raw, frame_count, writer);
+  }
+  encode_frame(&codec, NULL, -1, writer);
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
 
-    write_ivf_file_header(outfile, &cfg, 0);
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
 
+  vpx_video_writer_close(writer);
 
-        /* Open input file for this encoding pass */
-        if(!(infile = fopen(argv[3], "rb")))
-            die("Failed to open %s for reading", argv[3]);
-
-        /* Initialize codec */
-        if(vpx_codec_enc_init(&codec, interface, &cfg, 0))
-            die_codec(&codec, "Failed to initialize encoder");
-
-        frame_avail = 1;
-        got_data = 0;
-        while(frame_avail || got_data) {
-            vpx_codec_iter_t iter = NULL;
-            const vpx_codec_cx_pkt_t *pkt;
-
-            if(frame_cnt + 1 == 22) {
-                vpx_roi_map_t  roi;
-                unsigned int   i;
-
-                roi.rows = cfg.g_h/16;
-                roi.cols = cfg.g_w/16;
-
-                roi.delta_q[0] = 0;
-                roi.delta_q[1] = -2;
-                roi.delta_q[2] = -4;
-                roi.delta_q[3] = -6;
-
-                roi.delta_lf[0] = 0;
-                roi.delta_lf[1] = 1;
-                roi.delta_lf[2] = 2;
-                roi.delta_lf[3] = 3;
-
-                roi.static_threshold[0] = 1500;
-                roi.static_threshold[1] = 1000;
-                roi.static_threshold[2] =  500;
-                roi.static_threshold[3] =    0;
-
-                /* generate an ROI map for example */
-                roi.roi_map = malloc(roi.rows * roi.cols);
-                for(i=0;i<roi.rows*roi.cols;i++)
-                    roi.roi_map[i] = i & 3;
-
-                if(vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
-                    die_codec(&codec, "Failed to set ROI map");
-
-                free(roi.roi_map);
-            } else if(frame_cnt + 1 == 33) {
-                vpx_active_map_t  active;
-                unsigned int      i;
-
-                active.rows = cfg.g_h/16;
-                active.cols = cfg.g_w/16;
-
-                /* generate active map for example */
-                active.active_map = malloc(active.rows * active.cols);
-                for(i=0;i<active.rows*active.cols;i++)
-                    active.active_map[i] = i & 1;
-
-                if(vpx_codec_control(&codec, VP8E_SET_ACTIVEMAP, &active))
-                    die_codec(&codec, "Failed to set active map");
-
-                free(active.active_map);
-            } else if(frame_cnt + 1 == 44) {
-                vpx_active_map_t  active;
-
-                active.rows = cfg.g_h/16;
-                active.cols = cfg.g_w/16;
-
-                /* pass in null map to disable active_map*/
-                active.active_map = NULL;
-
-                if(vpx_codec_control(&codec, VP8E_SET_ACTIVEMAP, &active))
-                    die_codec(&codec, "Failed to set active map");
-            }
-            frame_avail = read_frame(infile, &raw);
-            if(vpx_codec_encode(&codec, frame_avail? &raw : NULL, frame_cnt,
-                                1, flags, VPX_DL_REALTIME))
-                die_codec(&codec, "Failed to encode frame");
-            got_data = 0;
-            while( (pkt = vpx_codec_get_cx_data(&codec, &iter)) ) {
-                got_data = 1;
-                switch(pkt->kind) {
-                case VPX_CODEC_CX_FRAME_PKT:
-                    write_ivf_frame_header(outfile, pkt);
-                    (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
-                                  outfile);
-                    break;
-                default:
-                    break;
-                }
-                printf(pkt->kind == VPX_CODEC_CX_FRAME_PKT
-                       && (pkt->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
-                fflush(stdout);
-            }
-            frame_cnt++;
-        }
-        printf("\n");
-        fclose(infile);
-
-    printf("Processed %d frames.\n",frame_cnt-1);
-    vpx_img_free(&raw);
-    if(vpx_codec_destroy(&codec))
-        die_codec(&codec, "Failed to destroy codec");
-
-    /* Try to rewrite the file header with the actual frame count */
-    if(!fseek(outfile, 0, SEEK_SET))
-        write_ivf_file_header(outfile, &cfg, frame_cnt-1);
-    fclose(outfile);
-    return EXIT_SUCCESS;
+  return EXIT_SUCCESS;
 }
diff --git a/examples/vp8cx_set_ref.c b/examples/vp8cx_set_ref.c
index 5a67578..f87dd35 100644
--- a/examples/vp8cx_set_ref.c
+++ b/examples/vp8cx_set_ref.c
@@ -139,7 +139,7 @@
         return;
 
     pts = pkt->data.frame.pts;
-    mem_put_le32(header, pkt->data.frame.sz);
+    mem_put_le32(header, (unsigned int)pkt->data.frame.sz);
     mem_put_le32(header+4, pts&0xFFFFFFFF);
     mem_put_le32(header+8, pts >> 32);
 
diff --git a/vp9_spatial_scalable_encoder.c b/examples/vp9_spatial_scalable_encoder.c
similarity index 88%
rename from vp9_spatial_scalable_encoder.c
rename to examples/vp9_spatial_scalable_encoder.c
index bbbe7ed..98dc3f5 100644
--- a/vp9_spatial_scalable_encoder.c
+++ b/examples/vp9_spatial_scalable_encoder.c
@@ -54,12 +54,18 @@
 static const arg_def_t scale_factors_arg =
     ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)");
 static const arg_def_t quantizers_arg =
-    ARG_DEF("q", "quantizers", 1, "quantizers (lowest to highest layer)");
+    ARG_DEF("q", "quantizers", 1, "quantizers for non key frames, also will "
+            "be applied to key frames if -qn is not specified (lowest to "
+            "highest layer)");
+static const arg_def_t quantizers_keyframe_arg =
+    ARG_DEF("qn", "quantizers-keyframe", 1, "quantizers for key frames (lowest "
+        "to highest layer)");
 
 static const arg_def_t *svc_args[] = {
   &encoding_mode_arg, &frames_arg,        &width_arg,       &height_arg,
   &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &layers_arg,
-  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,  NULL
+  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,
+  &quantizers_keyframe_arg, NULL
 };
 
 static const SVC_ENCODING_MODE default_encoding_mode =
@@ -75,10 +81,10 @@
 static const uint32_t default_kf_dist = 100;
 
 typedef struct {
-  char *output_filename;
+  const char *input_filename;
+  const char *output_filename;
   uint32_t frames_to_code;
   uint32_t frames_to_skip;
-  struct VpxInputContext input_ctx;
 } AppInput;
 
 static const char *exec_name;
@@ -94,8 +100,10 @@
 static void parse_command_line(int argc, const char **argv_,
                                AppInput *app_input, SvcContext *svc_ctx,
                                vpx_codec_enc_cfg_t *enc_cfg) {
-  struct arg arg;
-  char **argv, **argi, **argj;
+  struct arg arg = {0};
+  char **argv = NULL;
+  char **argi = NULL;
+  char **argj = NULL;
   vpx_codec_err_t res;
 
   // initialize SvcContext with parameters that will be passed to vpx_svc_init
@@ -148,7 +156,9 @@
     } else if (arg_match(&arg, &scale_factors_arg, argi)) {
       vpx_svc_set_scale_factors(svc_ctx, arg.val);
     } else if (arg_match(&arg, &quantizers_arg, argi)) {
-      vpx_svc_set_quantizers(svc_ctx, arg.val);
+      vpx_svc_set_quantizers(svc_ctx, arg.val, 0);
+    } else if (arg_match(&arg, &quantizers_keyframe_arg, argi)) {
+      vpx_svc_set_quantizers(svc_ctx, arg.val, 1);
     } else {
       ++argj;
     }
@@ -162,7 +172,7 @@
   if (argv[0] == NULL || argv[1] == 0) {
     usage_exit();
   }
-  app_input->input_ctx.filename = argv[0];
+  app_input->input_filename = argv[0];
   app_input->output_filename = argv[1];
   free(argv);
 
@@ -196,6 +206,7 @@
   vpx_codec_err_t res;
   int pts = 0;            /* PTS starts at 0 */
   int frame_duration = 1; /* 1 timebase tick per frame */
+  FILE *infile = NULL;
 
   memset(&svc_ctx, 0, sizeof(svc_ctx));
   svc_ctx.log_print = 1;
@@ -206,8 +217,8 @@
   if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32))
     die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
 
-  if (!(app_input.input_ctx.file = fopen(app_input.input_ctx.filename, "rb")))
-    die("Failed to open %s for reading\n", app_input.input_ctx.filename);
+  if (!(infile = fopen(app_input.input_filename, "rb")))
+    die("Failed to open %s for reading\n", app_input.input_filename);
 
   // Initialize codec
   if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) !=
@@ -229,13 +240,13 @@
     die("Failed to open %s for writing\n", app_input.output_filename);
 
   // skip initial frames
-  for (i = 0; i < app_input.frames_to_skip; ++i) {
-    read_yuv_frame(&app_input.input_ctx, &raw);
-  }
+  for (i = 0; i < app_input.frames_to_skip; ++i)
+    vpx_img_read(&raw, infile);
 
   // Encode frames
   while (frame_cnt < app_input.frames_to_code) {
-    if (read_yuv_frame(&app_input.input_ctx, &raw)) break;
+    if (!vpx_img_read(&raw, infile))
+      break;
 
     res = vpx_svc_encode(&svc_ctx, &codec, &raw, pts, frame_duration,
                          VPX_DL_REALTIME);
@@ -255,7 +266,7 @@
 
   printf("Processed %d frames\n", frame_cnt);
 
-  fclose(app_input.input_ctx.file);
+  fclose(infile);
   if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
 
   vpx_video_writer_close(writer);
diff --git a/examples/vpx_temporal_scalable_patterns.c b/examples/vpx_temporal_scalable_patterns.c
index 73278a8..6ec1b62 100644
--- a/examples/vpx_temporal_scalable_patterns.c
+++ b/examples/vpx_temporal_scalable_patterns.c
@@ -41,23 +41,29 @@
   // Number of encoded non-key frames per layer.
   int layer_enc_frames[VPX_TS_MAX_LAYERS];
   // Framerate per layer layer (cumulative).
-  float layer_framerate[VPX_TS_MAX_LAYERS];
+  double layer_framerate[VPX_TS_MAX_LAYERS];
   // Target average frame size per layer (per-frame-bandwidth per layer).
-  float layer_pfb[VPX_TS_MAX_LAYERS];
+  double layer_pfb[VPX_TS_MAX_LAYERS];
   // Actual average frame size per layer.
-  float layer_avg_frame_size[VPX_TS_MAX_LAYERS];
+  double layer_avg_frame_size[VPX_TS_MAX_LAYERS];
   // Average rate mismatch per layer (|target - actual| / target).
-  float layer_avg_rate_mismatch[VPX_TS_MAX_LAYERS];
+  double layer_avg_rate_mismatch[VPX_TS_MAX_LAYERS];
   // Actual encoding bitrate per layer (cumulative).
-  float layer_encoding_bitrate[VPX_TS_MAX_LAYERS];
+  double layer_encoding_bitrate[VPX_TS_MAX_LAYERS];
 };
 
+// Note: these rate control metrics assume only one key frame in the
+// sequence (i.e., the first frame only). So for temporal pattern #7
+// (which has a key frame for every frame on the base layer), the
+// computed metrics will be wrong.
+// TODO(marpan): Update these metrics to account for multiple key frames
+// in the stream.
 static void set_rate_control_metrics(struct RateControlMetrics *rc,
                                      vpx_codec_enc_cfg_t *cfg) {
-  int i = 0;
+  unsigned int i = 0;
   // Set the layer (cumulative) framerate and the target layer (non-cumulative)
   // per-frame-bandwidth, for the rate control encoding stats below.
-  float framerate = cfg->g_timebase.den / cfg->g_timebase.num;
+  const double framerate = cfg->g_timebase.den / cfg->g_timebase.num;
   rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0];
   rc->layer_pfb[0] = 1000.0 * cfg->ts_target_bitrate[0] /
       rc->layer_framerate[0];
@@ -80,8 +86,8 @@
 static void printout_rate_control_summary(struct RateControlMetrics *rc,
                                           vpx_codec_enc_cfg_t *cfg,
                                           int frame_cnt) {
-  int i = 0;
-  int check_num_frames = 0;
+  unsigned int i = 0;
+  int tot_num_frames = 0;
   printf("Total number of processed frames: %d\n\n", frame_cnt -1);
   printf("Rate control layer stats for %d layer(s):\n\n",
       cfg->ts_number_layers);
@@ -89,8 +95,9 @@
     const int num_dropped = (i > 0) ?
         (rc->layer_input_frames[i] - rc->layer_enc_frames[i]) :
         (rc->layer_input_frames[i] - rc->layer_enc_frames[i] - 1);
+    tot_num_frames += rc->layer_input_frames[i];
     rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[i] *
-        rc->layer_encoding_bitrate[i] / rc->layer_tot_enc_frames[i];
+        rc->layer_encoding_bitrate[i] / tot_num_frames;
     rc->layer_avg_frame_size[i] = rc->layer_avg_frame_size[i] /
         rc->layer_enc_frames[i];
     rc->layer_avg_rate_mismatch[i] = 100.0 * rc->layer_avg_rate_mismatch[i] /
@@ -105,10 +112,9 @@
         "and perc dropped frames: %d %d %f \n", rc->layer_input_frames[i],
         rc->layer_enc_frames[i],
         100.0 * num_dropped / rc->layer_input_frames[i]);
-    check_num_frames += rc->layer_input_frames[i];
     printf("\n");
   }
-  if ((frame_cnt - 1) != check_num_frames)
+  if ((frame_cnt - 1) != tot_num_frames)
     die("Error: Number of input frames not equal to output! \n");
 }
 
@@ -432,7 +438,7 @@
   int frame_avail;
   int got_data;
   int flags = 0;
-  int i;
+  unsigned int i;
   int pts = 0;  // PTS starts at 0.
   int frame_duration = 1;  // 1 timebase tick per frame.
   int layering_mode = 0;
@@ -441,7 +447,7 @@
   int max_intra_size_pct;
   vpx_svc_layer_id_t layer_id = {0, 0};
   const VpxInterface *encoder = NULL;
-  struct VpxInputContext input_ctx = {0};
+  FILE *infile = NULL;
   struct RateControlMetrics rc;
 
   exec_name = argv[0];
@@ -492,7 +498,7 @@
   cfg.g_timebase.num = strtol(argv[6], NULL, 0);
   cfg.g_timebase.den = strtol(argv[7], NULL, 0);
 
-  for (i = 10; i < 10 + mode_to_num_layers[layering_mode]; ++i) {
+  for (i = 10; (int)i < 10 + mode_to_num_layers[layering_mode]; ++i) {
     cfg.ts_target_bitrate[i - 10] = strtol(argv[i], NULL, 0);
   }
 
@@ -502,8 +508,8 @@
   cfg.rc_resize_allowed = 0;
   cfg.rc_min_quantizer = 2;
   cfg.rc_max_quantizer = 56;
-  cfg.rc_undershoot_pct = 100;
-  cfg.rc_overshoot_pct = 15;
+  cfg.rc_undershoot_pct = 50;
+  cfg.rc_overshoot_pct = 50;
   cfg.rc_buf_initial_sz = 500;
   cfg.rc_buf_optimal_sz = 600;
   cfg.rc_buf_sz = 1000;
@@ -516,9 +522,6 @@
   // Disable automatic keyframe placement.
   cfg.kf_min_dist = cfg.kf_max_dist = 3000;
 
-  // Default setting for bitrate: used in special case of 1 layer (case 0).
-  cfg.rc_target_bitrate = cfg.ts_target_bitrate[0];
-
   set_temporal_layer_pattern(layering_mode,
                              &cfg,
                              layer_flags,
@@ -526,9 +529,12 @@
 
   set_rate_control_metrics(&rc, &cfg);
 
+  // Target bandwidth for the whole stream.
+  // Set to ts_target_bitrate for highest layer (total bitrate).
+  cfg.rc_target_bitrate = cfg.ts_target_bitrate[cfg.ts_number_layers - 1];
+
   // Open input file.
-  input_ctx.filename = argv[1];
-  if (!(input_ctx.file = fopen(input_ctx.filename, "rb"))) {
+  if (!(infile = fopen(argv[1], "rb"))) {
     die("Failed to open %s for reading", argv[1]);
   }
 
@@ -565,6 +571,9 @@
   }
   vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
   vpx_codec_control(&codec, VP8E_SET_TOKEN_PARTITIONS, 1);
+  // This controls the maximum target size of the key frame.
+  // For generating smaller key frames, use a smaller max_intra_size_pct
+  // value, like 100 or 200.
   max_intra_size_pct = (int) (((double)cfg.rc_buf_optimal_sz * 0.5)
       * ((double) cfg.g_timebase.den / cfg.g_timebase.num) / 10.0);
   vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct);
@@ -581,7 +590,7 @@
       vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
     }
     flags = layer_flags[frame_cnt % flag_periodicity];
-    frame_avail = !read_yuv_frame(&input_ctx, &raw);
+    frame_avail = vpx_img_read(&raw, infile);
     if (frame_avail)
       ++rc.layer_input_frames[layer_id.temporal_layer_id];
     if (vpx_codec_encode(&codec, frame_avail? &raw : NULL, pts, 1, flags,
@@ -621,7 +630,7 @@
     ++frame_cnt;
     pts += frame_duration;
   }
-  fclose(input_ctx.file);
+  fclose(infile);
   printout_rate_control_summary(&rc, &cfg, frame_cnt);
 
   if (vpx_codec_destroy(&codec))
diff --git a/ivfdec.c b/ivfdec.c
index 40394a8..6dcd66f 100644
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -12,6 +12,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "vpx_ports/mem_ops.h"
+
 #include "./ivfdec.h"
 
 static const char *IVF_SIGNATURE = "DKIF";
diff --git a/libs.mk b/libs.mk
index eac61f2..302d2af 100644
--- a/libs.mk
+++ b/libs.mk
@@ -164,6 +164,8 @@
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.sh
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emmintrin_compat.h
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops.h
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops_aligned.h
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h
 CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
diff --git a/nestegg/.gitignore b/nestegg/.gitignore
deleted file mode 100644
index b2ba99c..0000000
--- a/nestegg/.gitignore
+++ /dev/null
@@ -1,40 +0,0 @@
-*.lo
-*.o
-*.swp
-*~
-.deps
-.dirstamp
-.libs
-Makefile
-Makefile.in
-_stdint.h
-aclocal.m4
-autom4te.cache
-compile
-config.guess
-config.h
-config.h.in
-config.log
-config.status
-config.sub
-configure
-depcomp
-docs/Doxyfile
-docs/doxygen-build.stamp
-docs/html
-install-sh
-libtool
-ltmain.sh
-m4/libtool.m4
-m4/ltoptions.m4
-m4/ltsugar.m4
-m4/ltversion.m4
-m4/lt~obsolete.m4
-missing
-nestegg-uninstalled.pc
-nestegg.pc
-src/.dirstamp
-src/libnestegg.la
-stamp-h1
-test/test
-include/nestegg/nestegg-stdint.h
diff --git a/nestegg/AUTHORS b/nestegg/AUTHORS
deleted file mode 100644
index 8204f40..0000000
--- a/nestegg/AUTHORS
+++ /dev/null
@@ -1 +0,0 @@
-Matthew Gregan <kinetik@flim.org>
diff --git a/nestegg/Makefile.am b/nestegg/Makefile.am
deleted file mode 100644
index 5006991..0000000
--- a/nestegg/Makefile.am
+++ /dev/null
@@ -1,51 +0,0 @@
-AUTOMAKE_OPTIONS = foreign 1.11 no-dist-gzip dist-bzip2 subdir-objects
-ACLOCAL_AMFLAGS = -I m4
-
-INCLUDES = -I$(top_srcdir)/include -I. -I$(top_srcdir)/halloc
-AM_CFLAGS = -ansi -pedantic -Wall -Wextra -Wno-long-long -O0 -g
-
-SUBDIRS = docs
-
-EXTRA_DIST = \
-	AUTHORS README LICENSE \
-	nestegg-uninstalled.pc.in \
-	m4/as-ac-expand.m4 \
-	m4/pkg.m4 \
-	m4/ax_create_stdint_h.m4 \
-	halloc/src/halloc.c \
-	halloc/halloc.h \
-	halloc/src/align.h \
-	halloc/src/hlist.h \
-	halloc/src/macros.h
-
-pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = nestegg.pc
-
-nesteggincludedir = $(includedir)/nestegg
-nestegginclude_HEADERS = include/nestegg/nestegg.h include/nestegg/nestegg-stdint.h
-
-lib_LTLIBRARIES = src/libnestegg.la
-
-src_libnestegg_la_SOURCES = \
-	src/nestegg.c \
-	halloc/src/halloc.c \
-	halloc/halloc.h \
-	halloc/src/align.h \
-	halloc/src/hlist.h \
-	halloc/src/macros.h
-
-check_PROGRAMS = test/test
-
-test_test_SOURCES = test/test.c
-test_test_LDADD = src/libnestegg.la
-
-DISTCLEANFILES = include/nestegg/nestegg-stdint.h
-
-dist-hook:
-	find $(distdir) -type d -name '.git' | xargs rm -rf
-
-debug:
-	$(MAKE) all CFLAGS="@DEBUG@"
-
-profile:
-	$(MAKE) all CFLAGS="@PROFILE@"
diff --git a/nestegg/configure.ac b/nestegg/configure.ac
deleted file mode 100644
index 70f6e0d..0000000
--- a/nestegg/configure.ac
+++ /dev/null
@@ -1,124 +0,0 @@
-dnl ------------------------------------------------
-dnl Initialization and Versioning
-dnl ------------------------------------------------
-
-AC_INIT(libnestegg,[0.1git])
-
-AC_CANONICAL_HOST
-AC_CANONICAL_TARGET
-
-AC_CONFIG_MACRO_DIR([m4])
-
-AM_CONFIG_HEADER([config.h])
-AC_CONFIG_SRCDIR([src/nestegg.c])
-AM_INIT_AUTOMAKE
-
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-
-dnl Library versioning
-dnl CURRENT, REVISION, AGE
-dnl - library source changed -> increment REVISION
-dnl - interfaces added/removed/changed -> increment CURRENT, REVISION = 0
-dnl - interfaces added -> increment AGE
-dnl - interfaces removed -> AGE = 0
-
-NESTEGG_CURRENT=0
-NESTEGG_REVISION=0
-NESTEGG_AGE=1
-AC_SUBST(NESTEGG_CURRENT)
-AC_SUBST(NESTEGG_REVISION)
-AC_SUBST(NESTEGG_AGE)
-
-
-dnl --------------------------------------------------  
-dnl Check for programs
-dnl --------------------------------------------------  
-
-dnl save $CFLAGS since AC_PROG_CC likes to insert "-g -O2"
-dnl if $CFLAGS is blank
-cflags_save="$CFLAGS"
-AC_PROG_CC
-AC_PROG_CPP
-CFLAGS="$cflags_save"
-
-AM_PROG_CC_C_O
-AC_LIBTOOL_WIN32_DLL
-AM_PROG_LIBTOOL
-
-dnl Check for doxygen
-AC_ARG_ENABLE([doc],
-	AS_HELP_STRING([--enable-doc], [Build API documentation]),
-	[ac_enable_doc=$enableval], [ac_enable_doc=auto])
-
-if test "x$ac_enable_doc" != "xno"; then
-	AC_CHECK_PROG(HAVE_DOXYGEN, doxygen, true, false)
-
-	if test "x$HAVE_DOXYGEN" = "xfalse" -a "x$ac_enable_doc" = "xyes"; then
-		AC_MSG_ERROR([*** API documentation explicitly requested but Doxygen not found])
-	fi
-else
-	HAVE_DOXYGEN=false
-fi
-AM_CONDITIONAL(HAVE_DOXYGEN,$HAVE_DOXYGEN)
-if test $HAVE_DOXYGEN = "false"; then
-        AC_MSG_WARN([*** doxygen not found, API documentation will not be built])
-fi
-
-# Generate portable stdint.h replacement
-AX_CREATE_STDINT_H(include/nestegg/nestegg-stdint.h)
-
-# Test whenever ld supports -version-script
-AC_PROG_LD
-AC_PROG_LD_GNU
-AC_MSG_CHECKING([how to control symbol export])
-
-dnl --------------------------------------------------
-dnl Do substitutions
-dnl --------------------------------------------------
-
-AC_SUBST(DEBUG)
-AC_SUBST(PROFILE)
-
-AC_OUTPUT([
-  Makefile 
-  docs/Makefile
-  docs/Doxyfile
-  nestegg.pc
-  nestegg-uninstalled.pc
-])
-
-AS_AC_EXPAND(LIBDIR, ${libdir})
-AS_AC_EXPAND(INCLUDEDIR, ${includedir})
-AS_AC_EXPAND(BINDIR, ${bindir})
-AS_AC_EXPAND(DOCDIR, ${docdir})
-
-if test $HAVE_DOXYGEN = "false"; then
-  doc_build="no"
-else
-  doc_build="yes"
-fi
-
-AC_MSG_RESULT([
-------------------------------------------------------------------------
-  $PACKAGE $VERSION:  Automatic configuration OK.
-
-  General configuration:
-
-    API Documentation: .......... ${doc_build}
-
-  Installation paths:
-
-    libnestegg: .................. ${LIBDIR}
-    C header files: .............. ${INCLUDEDIR}/nestegg
-    Documentation: ............... ${DOCDIR}
-
-  Building:
-
-    Type 'make' to compile $PACKAGE.
-
-    Type 'make install' to install $PACKAGE.
-
-  Example programs will be built but not installed.
-------------------------------------------------------------------------
-])
-
diff --git a/nestegg/docs/Doxyfile.in b/nestegg/docs/Doxyfile.in
deleted file mode 100644
index e0e9249..0000000
--- a/nestegg/docs/Doxyfile.in
+++ /dev/null
@@ -1,1551 +0,0 @@
-# Doxyfile 1.6.2
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project
-#
-# All text after a hash (#) is considered a comment and will be ignored
-# The format is:
-#       TAG = value [value, ...]
-# For lists items can also be appended using:
-#       TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (" ")
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all
-# text before the first occurrence of this tag. Doxygen uses libiconv (or the
-# iconv built into libc) for the transcoding. See
-# http://www.gnu.org/software/libiconv for the list of possible encodings.
-
-DOXYFILE_ENCODING      = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
-# by quotes) that should identify the project.
-
-PROJECT_NAME           = @PACKAGE@
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number.
-# This could be handy for archiving the generated documentation or
-# if some version control system is used.
-
-PROJECT_NUMBER         = @VERSION@
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
-# base path where the generated documentation will be put.
-# If a relative path is entered, it will be relative to the location
-# where doxygen was started. If left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       = .
-
-# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
-# 4096 sub-directories (in 2 levels) under the output directory of each output
-# format and will distribute the generated files over these directories.
-# Enabling this option can be useful when feeding doxygen a huge amount of
-# source files, where putting all generated files in the same directory would
-# otherwise cause performance problems for the file system.
-
-CREATE_SUBDIRS         = NO
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# The default language is English, other supported languages are:
-# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
-# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
-# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
-# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
-# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak,
-# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
-
-OUTPUT_LANGUAGE        = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
-# include brief member descriptions after the members that are listed in
-# the file and class documentation (similar to JavaDoc).
-# Set to NO to disable this.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
-# the brief description of a member or function before the detailed description.
-# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-
-REPEAT_BRIEF           = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator
-# that is used to form the text in various listings. Each string
-# in this list, if found as the leading text of the brief description, will be
-# stripped from the text and the result after processing the whole list, is
-# used as the annotated text. Otherwise, the brief description is used as-is.
-# If left blank, the following values are used ("$name" is automatically
-# replaced with the name of the entity): "The $name class" "The $name widget"
-# "The $name file" "is" "provides" "specifies" "contains"
-# "represents" "a" "an" "the"
-
-ABBREVIATE_BRIEF       =
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# Doxygen will generate a detailed section even if there is only a brief
-# description.
-
-ALWAYS_DETAILED_SEC    = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
-# path before files name in the file list and in the header files. If set
-# to NO the shortest path that makes the file name unique will be used.
-
-FULL_PATH_NAMES        = YES
-
-# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
-# can be used to strip a user-defined part of the path. Stripping is
-# only done if one of the specified strings matches the left-hand part of
-# the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the
-# path to strip.
-
-STRIP_FROM_PATH        =
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
-# the path mentioned in the documentation of a class, which tells
-# the reader which header file to include in order to use a class.
-# If left blank only the name of the header file containing the class
-# definition is used. Otherwise one should specify the include paths that
-# are normally passed to the compiler using the -I flag.
-
-STRIP_FROM_INC_PATH    =
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
-# (but less readable) file names. This can be useful is your file systems
-# doesn't support long names like on DOS, Mac, or CD-ROM.
-
-SHORT_NAMES            = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
-# will interpret the first line (until the first dot) of a JavaDoc-style
-# comment as the brief description. If set to NO, the JavaDoc
-# comments will behave just like regular Qt-style comments
-# (thus requiring an explicit @brief command for a brief description.)
-
-JAVADOC_AUTOBRIEF      = YES
-
-# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
-# interpret the first line (until the first dot) of a Qt-style
-# comment as the brief description. If set to NO, the comments
-# will behave just like regular Qt-style comments (thus requiring
-# an explicit \brief command for a brief description.)
-
-QT_AUTOBRIEF           = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
-# treat a multi-line C++ special comment block (i.e. a block of //! or ///
-# comments) as a brief description. This used to be the default behaviour.
-# The new default is to treat a multi-line C++ comment block as a detailed
-# description. Set this tag to YES if you prefer the old behaviour instead.
-
-MULTILINE_CPP_IS_BRIEF = NO
-
-# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
-# member inherits the documentation from any documented member that it
-# re-implements.
-
-INHERIT_DOCS           = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
-# a new page for each member. If set to NO, the documentation of a member will
-# be part of the file/class/namespace that contains it.
-
-SEPARATE_MEMBER_PAGES  = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab.
-# Doxygen uses this value to replace tabs by spaces in code fragments.
-
-TAB_SIZE               = 8
-
-# This tag can be used to specify a number of aliases that acts
-# as commands in the documentation. An alias has the form "name=value".
-# For example adding "sideeffect=\par Side Effects:\n" will allow you to
-# put the command \sideeffect (or @sideeffect) in the documentation, which
-# will result in a user-defined paragraph with heading "Side Effects:".
-# You can put \n's in the value part of an alias to insert newlines.
-
-ALIASES                =
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
-# sources only. Doxygen will then generate output that is more tailored for C.
-# For instance, some of the names that are used will be different. The list
-# of all members will be omitted, etc.
-
-OPTIMIZE_OUTPUT_FOR_C  = YES
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
-# sources only. Doxygen will then generate output that is more tailored for
-# Java. For instance, namespaces will be presented as packages, qualified
-# scopes will look different, etc.
-
-OPTIMIZE_OUTPUT_JAVA   = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources only. Doxygen will then generate output that is more tailored for
-# Fortran.
-
-OPTIMIZE_FOR_FORTRAN   = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for
-# VHDL.
-
-OPTIMIZE_OUTPUT_VHDL   = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it parses.
-# With this tag you can assign which parser to use for a given extension.
-# Doxygen has a built-in mapping, but you can override or extend it using this tag.
-# The format is ext=language, where ext is a file extension, and language is one of
-# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP,
-# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat
-# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran),
-# use: inc=Fortran f=C. Note that for custom extensions you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
-
-EXTENSION_MAPPING      =
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should
-# set this tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
-# func(std::string) {}). This also makes the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-
-BUILTIN_STL_SUPPORT    = NO
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-
-CPP_CLI_SUPPORT        = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
-# Doxygen will parse them like normal C++ but will assume all classes use public
-# instead of private inheritance when no explicit protection keyword is present.
-
-SIP_SUPPORT            = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate getter
-# and setter methods for a property. Setting this option to YES (the default)
-# will make doxygen replace the get and set methods by a property in the
-# documentation. This will only work if the methods are indeed getting or
-# setting a simple type. If this is not the case, or you want to show the
-# methods anyway, you should set this option to NO.
-
-IDL_PROPERTY_SUPPORT   = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES, then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-
-DISTRIBUTE_GROUP_DOC   = NO
-
-# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
-# the same type (for instance a group of public functions) to be put as a
-# subgroup of that type (e.g. under the Public Functions section). Set it to
-# NO to prevent subgrouping. Alternatively, this can be done per class using
-# the \nosubgrouping command.
-
-SUBGROUPING            = YES
-
-# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
-# is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically
-# be useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-
-TYPEDEF_HIDES_STRUCT   = NO
-
-# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
-# determine which symbols to keep in memory and which to flush to disk.
-# When the cache is full, less often used symbols will be written to disk.
-# For small to medium size projects (<1000 input files) the default value is
-# probably good enough. For larger projects a too small cache size can cause
-# doxygen to be busy swapping symbols to and from disk most of the time
-# causing a significant performance penalty.
-# If the system has enough physical memory increasing the cache will improve the
-# performance by keeping more symbols in memory. Note that the value works on
-# a logarithmic scale so increasing the size by one will roughly double the
-# memory usage. The cache size is given by this formula:
-# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
-# corresponding to a cache size of 2^16 = 65536 symbols.
-
-SYMBOL_CACHE_SIZE      = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
-# documentation are documented, even if no documentation was available.
-# Private class members and static file members will be hidden unless
-# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
-
-EXTRACT_ALL            = NO
-
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
-# will be included in the documentation.
-
-EXTRACT_PRIVATE        = NO
-
-# If the EXTRACT_STATIC tag is set to YES all static members of a file
-# will be included in the documentation.
-
-EXTRACT_STATIC         = NO
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
-# defined locally in source files will be included in the documentation.
-# If set to NO only classes defined in header files are included.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# This flag is only useful for Objective-C code. When set to YES local
-# methods, which are defined in the implementation section but not in
-# the interface are included in the documentation.
-# If set to NO (the default) only methods in the interface are included.
-
-EXTRACT_LOCAL_METHODS  = NO
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base
-# name of the file that contains the anonymous namespace. By default
-# anonymous namespaces are hidden.
-
-EXTRACT_ANON_NSPACES   = NO
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
-# undocumented members of documented classes, files or namespaces.
-# If set to NO (the default) these members will be included in the
-# various overviews, but no documentation section is generated.
-# This option has no effect if EXTRACT_ALL is enabled.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy.
-# If set to NO (the default) these classes will be included in the various
-# overviews. This option has no effect if EXTRACT_ALL is enabled.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
-# friend (class|struct|union) declarations.
-# If set to NO (the default) these declarations will be included in the
-# documentation.
-
-HIDE_FRIEND_COMPOUNDS  = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
-# documentation blocks found inside the body of a function.
-# If set to NO (the default) these blocks will be appended to the
-# function's detailed documentation block.
-
-HIDE_IN_BODY_DOCS      = NO
-
-# The INTERNAL_DOCS tag determines if documentation
-# that is typed after a \internal command is included. If the tag is set
-# to NO (the default) then the documentation will be excluded.
-# Set it to YES to include the internal documentation.
-
-INTERNAL_DOCS          = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
-# file names in lower-case letters. If set to YES upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-
-CASE_SENSE_NAMES       = NO
-
-# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
-# will show members with their full class and namespace scopes in the
-# documentation. If set to YES the scope will be hidden.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
-# will put a list of the files that are included by a file in the documentation
-# of that file.
-
-SHOW_INCLUDE_FILES     = YES
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
-# will list include files with double quotes in the documentation
-# rather than with sharp brackets.
-
-FORCE_LOCAL_INCLUDES   = NO
-
-# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
-# is inserted in the documentation for inline members.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
-# will sort the (detailed) documentation of file and class members
-# alphabetically by member name. If set to NO the members will appear in
-# declaration order.
-
-SORT_MEMBER_DOCS       = NO
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
-# brief documentation of file, namespace and class members alphabetically
-# by member name. If set to NO (the default) the members will appear in
-# declaration order.
-
-SORT_BRIEF_DOCS        = NO
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the (brief and detailed) documentation of class members so that constructors and destructors are listed first. If set to NO (the default) the constructors will appear in the respective orders defined by SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
-
-SORT_MEMBERS_CTORS_1ST = NO
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
-# hierarchy of group names into alphabetical order. If set to NO (the default)
-# the group names will appear in their defined order.
-
-SORT_GROUP_NAMES       = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
-# sorted by fully-qualified names, including namespaces. If set to
-# NO (the default), the class list will be sorted only by class name,
-# not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the
-# alphabetical list.
-
-SORT_BY_SCOPE_NAME     = NO
-
-# The GENERATE_TODOLIST tag can be used to enable (YES) or
-# disable (NO) the todo list. This list is created by putting \todo
-# commands in the documentation.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or
-# disable (NO) the test list. This list is created by putting \test
-# commands in the documentation.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or
-# disable (NO) the bug list. This list is created by putting \bug
-# commands in the documentation.
-
-GENERATE_BUGLIST       = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
-# disable (NO) the deprecated list. This list is created by putting
-# \deprecated commands in the documentation.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional
-# documentation sections, marked by \if sectionname ... \endif.
-
-ENABLED_SECTIONS       =
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
-# the initial value of a variable or define consists of for it to appear in
-# the documentation. If the initializer consists of more lines than specified
-# here it will be hidden. Use a value of 0 to hide initializers completely.
-# The appearance of the initializer of individual variables and defines in the
-# documentation can be controlled using \showinitializer or \hideinitializer
-# command in the documentation regardless of this setting.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
-# at the bottom of the documentation of classes and structs. If set to YES the
-# list will mention the files that were used to generate the documentation.
-
-SHOW_USED_FILES        = YES
-
-# If the sources in your project are distributed over multiple directories
-# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
-# in the documentation. The default is NO.
-
-SHOW_DIRECTORIES       = NO
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
-# This will remove the Files entry from the Quick Index and from the
-# Folder Tree View (if specified). The default is YES.
-
-SHOW_FILES             = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
-# Namespaces page.
-# This will remove the Namespaces entry from the Quick Index
-# and from the Folder Tree View (if specified). The default is YES.
-
-SHOW_NAMESPACES        = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command <command> <input-file>, where <command> is the value of
-# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
-# provided by doxygen. Whatever the program writes to standard output
-# is used as the file version. See the manual for examples.
-
-FILE_VERSION_FILTER    =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by
-# doxygen. The layout file controls the global structure of the generated output files
-# in an output format independent way. To create the layout file that represents
-# doxygen's defaults, run doxygen with the -l option. You can optionally specify a
-# file name after the option, if omitted DoxygenLayout.xml will be used as the name
-# of the layout file.
-
-LAYOUT_FILE            =
-
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated
-# by doxygen. Possible values are YES and NO. If left blank NO is used.
-
-QUIET                  = YES
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated by doxygen. Possible values are YES and NO. If left blank
-# NO is used.
-
-WARNINGS               = YES
-
-# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
-# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
-# automatically be disabled.
-
-WARN_IF_UNDOCUMENTED   = YES
-
-# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some
-# parameters in a documented function, or documenting parameters that
-# don't exist or using markup commands wrongly.
-
-WARN_IF_DOC_ERROR      = YES
-
-# The WARN_NO_PARAMDOC option can be enabled to get warnings for
-# functions that are documented, but have no documentation for their parameters
-# or return value. If set to NO (the default) doxygen will only warn about
-# wrong or incomplete parameter documentation, but not about the absence of
-# documentation.
-
-WARN_NO_PARAMDOC       = YES
-
-# The WARN_FORMAT tag determines the format of the warning messages that
-# doxygen can produce. The string should contain the $file, $line, and $text
-# tags, which will be replaced by the file and line number from which the
-# warning originated and the warning text. Optionally the format may contain
-# $version, which will be replaced by the version of the file (if it could
-# be obtained via FILE_VERSION_FILTER)
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning
-# and error messages should be written. If left blank the output is written
-# to stderr.
-
-WARN_LOGFILE           =
-
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag can be used to specify the files and/or directories that contain
-# documented source files. You may enter file names like "myfile.cpp" or
-# directories like "/usr/src/myproject". Separate the files or directories
-# with spaces.
-
-INPUT                  = @top_srcdir@/include/nestegg
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
-# also the default input encoding. Doxygen uses libiconv (or the iconv built
-# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
-# the list of possible encodings.
-
-INPUT_ENCODING         = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank the following patterns are tested:
-# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
-# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
-
-FILE_PATTERNS          =
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories
-# should be searched for input files as well. Possible values are YES and NO.
-# If left blank NO is used.
-
-RECURSIVE              = NO
-
-# The EXCLUDE tag can be used to specify files and/or directories that should
-# be excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-
-EXCLUDE                =
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix filesystem feature) are excluded
-# from the input.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories. Note that the wildcards are matched
-# against the file with absolute path, so to exclude all test directories
-# for example use the pattern */test/*
-
-EXCLUDE_PATTERNS       =
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-
-EXCLUDE_SYMBOLS        =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or
-# directories that contain example code fragments that are included (see
-# the \include command).
-
-EXAMPLE_PATH           =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank all files are included.
-
-EXAMPLE_PATTERNS       =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude
-# commands irrespective of the value of the RECURSIVE tag.
-# Possible values are YES and NO. If left blank NO is used.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or
-# directories that contain images that are included in the documentation (see
-# the \image command).
-
-IMAGE_PATH             =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command <filter> <input-file>, where <filter>
-# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
-# input file. Doxygen will then use the output that the filter program writes
-# to standard output.
-# If FILTER_PATTERNS is specified, this tag will be
-# ignored.
-
-INPUT_FILTER           =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis.
-# Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match.
-# The filters are a list of the form:
-# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
-# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
-# is applied to all files.
-
-FILTER_PATTERNS        =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will be used to filter the input files when producing source
-# files to browse (i.e. when SOURCE_BROWSER is set to YES).
-
-FILTER_SOURCE_FILES    = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will
-# be generated. Documented entities will be cross-referenced with these sources.
-# Note: To get rid of all source code in the generated output, make sure also
-# VERBATIM_HEADERS is set to NO.
-
-SOURCE_BROWSER         = NO
-
-# Setting the INLINE_SOURCES tag to YES will include the body
-# of functions and classes directly in the documentation.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
-# doxygen to hide any special comment blocks from generated source code
-# fragments. Normal C and C++ comments will always remain visible.
-
-STRIP_CODE_COMMENTS    = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES
-# then for each documented function all documented
-# functions referencing it will be listed.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES
-# then for each documented function all documented entities
-# called/used by that function will be listed.
-
-REFERENCES_RELATION    = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
-# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
-# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
-# link to the source code.
-# Otherwise they will link to the documentation.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code
-# will point to the HTML generated by the htags(1) tool instead of doxygen
-# built-in source browser. The htags tool is part of GNU's global source
-# tagging system (see http://www.gnu.org/software/global/global.html). You
-# will need version 4.8.6 or higher.
-
-USE_HTAGS              = NO
-
-# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
-# will generate a verbatim copy of the header file for each class for
-# which an include is specified. Set to NO to disable this.
-
-VERBATIM_HEADERS       = YES
-
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
-# of all compounds will be generated. Enable this if the project
-# contains a lot of classes, structs, unions or interfaces.
-
-ALPHABETICAL_INDEX     = NO
-
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
-# in which this list will be split (can be a number in the range [1..20])
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all
-# classes will be put under the same header in the alphabetical index.
-# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
-# should be ignored while generating the index headers.
-
-IGNORE_PREFIX          =
-
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
-# generate HTML output.
-
-GENERATE_HTML          = YES
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `html' will be used as the default path.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
-# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
-# doxygen will generate files with .html extension.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a personal HTML header for
-# each generated HTML page. If it is left blank doxygen will generate a
-# standard header.
-
-HTML_HEADER            =
-
-# The HTML_FOOTER tag can be used to specify a personal HTML footer for
-# each generated HTML page. If it is left blank doxygen will generate a
-# standard footer.
-
-HTML_FOOTER            =
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
-# style sheet that is used by each HTML page. It can be used to
-# fine-tune the look of the HTML output. If the tag is left blank doxygen
-# will generate a default style sheet. Note that doxygen will try to copy
-# the style sheet file to the HTML output directory, so don't put your own
-# stylesheet in the HTML output directory as well, or it will be erased!
-
-HTML_STYLESHEET        =
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting
-# this to NO can help when comparing the output of multiple runs.
-
-HTML_TIMESTAMP         = NO
-
-# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
-# files or namespaces will be aligned in HTML using tables. If set to
-# NO a bullet list will be used.
-
-HTML_ALIGN_MEMBERS     = YES
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded. For this to work a browser that supports
-# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
-# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
-
-HTML_DYNAMIC_SECTIONS  = NO
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files
-# will be generated that can be used as input for Apple's Xcode 3
-# integrated development environment, introduced with OSX 10.5 (Leopard).
-# To create a documentation set, doxygen will generate a Makefile in the
-# HTML output directory. Running make will produce the docset in that
-# directory and running "make install" will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
-# it at startup.
-# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information.
-
-GENERATE_DOCSET        = NO
-
-# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
-# feed. A documentation feed provides an umbrella under which multiple
-# documentation sets from a single provider (such as a company or product suite)
-# can be grouped.
-
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-
-# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
-# should uniquely identify the documentation set bundle. This should be a
-# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
-# will append .docset to the name.
-
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-
-# If the GENERATE_HTMLHELP tag is set to YES, additional index files
-# will be generated that can be used as input for tools like the
-# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
-# of the generated HTML documentation.
-
-GENERATE_HTMLHELP      = NO
-
-# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
-# be used to specify the file name of the resulting .chm file. You
-# can add a path in front of the file if the result should not be
-# written to the html output directory.
-
-CHM_FILE               =
-
-# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
-# be used to specify the location (absolute path including file name) of
-# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
-# the HTML help compiler on the generated index.hhp.
-
-HHC_LOCATION           =
-
-# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
-# controls if a separate .chi index file is generated (YES) or that
-# it should be included in the master .chm file (NO).
-
-GENERATE_CHI           = NO
-
-# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
-# is used to encode HtmlHelp index (hhk), content (hhc) and project file
-# content.
-
-CHM_INDEX_ENCODING     =
-
-# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
-# controls whether a binary table of contents is generated (YES) or a
-# normal table of contents (NO) in the .chm file.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members
-# to the contents of the HTML help documentation and to the tree view.
-
-TOC_EXPAND             = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER
-# are set, an additional index file will be generated that can be used as input for
-# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated
-# HTML documentation.
-
-GENERATE_QHP           = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
-# be used to specify the file name of the resulting .qch file.
-# The path specified is relative to the HTML output folder.
-
-QCH_FILE               =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating
-# Qt Help Project output. For more information please see
-# http://doc.trolltech.com/qthelpproject.html#namespace
-
-QHP_NAMESPACE          = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
-# Qt Help Project output. For more information please see
-# http://doc.trolltech.com/qthelpproject.html#virtual-folders
-
-QHP_VIRTUAL_FOLDER     = doc
-
-# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add.
-# For more information please see
-# http://doc.trolltech.com/qthelpproject.html#custom-filters
-
-QHP_CUST_FILTER_NAME   =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see
-# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">Qt Help Project / Custom Filters</a>.
-
-QHP_CUST_FILTER_ATTRS  =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's
-# filter section matches.
-# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">Qt Help Project / Filter Attributes</a>.
-
-QHP_SECT_FILTER_ATTRS  =
-
-# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
-# be used to specify the location of Qt's qhelpgenerator.
-# If non-empty doxygen will try to run qhelpgenerator on the generated
-# .qhp file.
-
-QHG_LOCATION           =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
-#  will be generated, which together with the HTML files, form an Eclipse help
-#  plugin. To install this plugin and make it available under the help contents
-# menu in Eclipse, the contents of the directory containing the HTML and XML
-# files needs to be copied into the plugins directory of eclipse. The name of
-# the directory within the plugins directory should be the same as
-# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before the help appears.
-
-GENERATE_ECLIPSEHELP   = NO
-
-# A unique identifier for the eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have
-# this name.
-
-ECLIPSE_DOC_ID         = org.doxygen.Project
-
-# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
-# top of each HTML page. The value NO (the default) enables the index and
-# the value YES disables it.
-
-DISABLE_INDEX          = NO
-
-# This tag can be used to set the number of enum values (range [1..20])
-# that doxygen will group on one line in the generated HTML documentation.
-
-ENUM_VALUES_PER_LINE   = 4
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information.
-# If the tag value is set to YES, a side panel will be generated
-# containing a tree-like index structure (just like the one that
-# is generated for HTML Help). For this to work a browser that supports
-# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
-# Windows users are probably better off using the HTML help feature.
-
-GENERATE_TREEVIEW      = NO
-
-# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
-# and Class Hierarchy pages using a tree view instead of an ordered list.
-
-USE_INLINE_TREES       = NO
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
-# used to set the initial width (in pixels) of the frame in which the tree
-# is shown.
-
-TREEVIEW_WIDTH         = 250
-
-# Use this tag to change the font size of Latex formulas included
-# as images in the HTML documentation. The default is 10. Note that
-# when you change the font size after a successful doxygen run you need
-# to manually remove any form_*.png images from the HTML output directory
-# to force them to be regenerated.
-
-FORMULA_FONTSIZE       = 10
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for the HTML output. The underlying search engine uses javascript
-# and DHTML and should work on any modern browser. Note that when using HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) there is already a search function so this one should
-# typically be disabled. For large projects the javascript based search engine
-# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
-
-SEARCHENGINE           = YES
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be implemented using a PHP enabled web server instead of at the web client using Javascript. Doxygen will generate the search PHP script and index
-# file to put on the web server. The advantage of the server based approach is that it scales better to large projects and allows full text search. The disadvances is that it is more difficult to setup
-# and does not have live searching capabilities.
-
-SERVER_BASED_SEARCH    = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
-# generate Latex output.
-
-GENERATE_LATEX         = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `latex' will be used as the default path.
-
-LATEX_OUTPUT           = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked. If left blank `latex' will be used as the default command name.
-# Note that when enabling USE_PDFLATEX this option is only used for
-# generating bitmaps for formulas in the HTML output, but not in the
-# Makefile that is written to the output directory.
-
-LATEX_CMD_NAME         = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
-# generate index for LaTeX. If left blank `makeindex' will be used as the
-# default command name.
-
-MAKEINDEX_CMD_NAME     = makeindex
-
-# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
-# LaTeX documents. This may be useful for small projects and may help to
-# save some trees in general.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used
-# by the printer. Possible values are: a4, a4wide, letter, legal and
-# executive. If left blank a4wide will be used.
-
-PAPER_TYPE             = a4wide
-
-# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX
-# packages that should be included in the LaTeX output.
-
-EXTRA_PACKAGES         =
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
-# the generated latex document. The header should contain everything until
-# the first chapter. If it is left blank doxygen will generate a
-# standard header. Notice: only use this tag if you know what you are doing!
-
-LATEX_HEADER           =
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
-# is prepared for conversion to pdf (using ps2pdf). The pdf file will
-# contain links (just like the HTML output) instead of page references
-# This makes the output suitable for online browsing using a pdf viewer.
-
-PDF_HYPERLINKS         = YES
-
-# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
-# plain latex in the generated Makefile. Set this option to YES to get a
-# higher quality PDF documentation.
-
-USE_PDFLATEX           = YES
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode.
-# command to the generated LaTeX files. This will instruct LaTeX to keep
-# running if errors occur, instead of asking the user for help.
-# This option is also used when generating formulas in HTML.
-
-LATEX_BATCHMODE        = NO
-
-# If LATEX_HIDE_INDICES is set to YES then doxygen will not
-# include the index chapters (such as File Index, Compound Index, etc.)
-# in the output.
-
-LATEX_HIDE_INDICES     = NO
-
-# If LATEX_SOURCE_CODE is set to YES then doxygen will include source code with syntax highlighting in the LaTeX output. Note that which sources are shown also depends on other settings such as SOURCE_BROWSER.
-
-LATEX_SOURCE_CODE      = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
-# The RTF output is optimized for Word 97 and may not look very pretty with
-# other RTF readers or editors.
-
-GENERATE_RTF           = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `rtf' will be used as the default path.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
-# RTF documents. This may be useful for small projects and may help to
-# save some trees in general.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
-# will contain hyperlink fields. The RTF file will
-# contain links (just like the HTML output) instead of page references.
-# This makes the output suitable for online browsing using WORD or other
-# programs which support those fields.
-# Note: wordpad (write) and others do not support links.
-
-RTF_HYPERLINKS         = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's
-# config file, i.e. a series of assignments. You only have to provide
-# replacements, missing definitions are set to their default value.
-
-RTF_STYLESHEET_FILE    =
-
-# Set optional variables used in the generation of an rtf document.
-# Syntax is similar to doxygen's config file.
-
-RTF_EXTENSIONS_FILE    =
-
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
-# generate man pages
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `man' will be used as the default path.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to
-# the generated man pages (default is the subroutine's section .3)
-
-MAN_EXTENSION          = .3
-
-# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
-# then it will generate one additional man file for each entity
-# documented in the real man page(s). These additional files
-# only source the real man page, but without them the man command
-# would be unable to find the correct page. The default is NO.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES Doxygen will
-# generate an XML file that captures the structure of
-# the code including all documentation.
-
-GENERATE_XML           = NO
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `xml' will be used as the default path.
-
-XML_OUTPUT             = xml
-
-# The XML_SCHEMA tag can be used to specify an XML schema,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_SCHEMA             =
-
-# The XML_DTD tag can be used to specify an XML DTD,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_DTD                =
-
-# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
-# dump the program listings (including syntax highlighting
-# and cross-referencing information) to the XML output. Note that
-# enabling this will significantly increase the size of the XML output.
-
-XML_PROGRAMLISTING     = YES
-
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
-# generate an AutoGen Definitions (see autogen.sf.net) file
-# that captures the structure of the code including all
-# documentation. Note that this feature is still experimental
-# and incomplete at the moment.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES Doxygen will
-# generate a Perl module file that captures the structure of
-# the code including all documentation. Note that this
-# feature is still experimental and incomplete at the
-# moment.
-
-GENERATE_PERLMOD       = NO
-
-# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
-# the necessary Makefile rules, Perl scripts and LaTeX code to be able
-# to generate PDF and DVI output from the Perl module output.
-
-PERLMOD_LATEX          = NO
-
-# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
-# nicely formatted so it can be parsed by a human reader.
-# This is useful
-# if you want to understand what is going on.
-# On the other hand, if this
-# tag is set to NO the size of the Perl module output will be much smaller
-# and Perl will parse it just the same.
-
-PERLMOD_PRETTY         = YES
-
-# The names of the make variables in the generated doxyrules.make file
-# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
-# This is useful so different doxyrules.make files included by the same
-# Makefile don't overwrite each other's variables.
-
-PERLMOD_MAKEVAR_PREFIX =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
-# evaluate all C-preprocessor directives found in the sources and include
-# files.
-
-ENABLE_PREPROCESSING   = YES
-
-# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
-# names in the source code. If set to NO (the default) only conditional
-# compilation will be performed. Macro expansion can be done in a controlled
-# way by setting EXPAND_ONLY_PREDEF to YES.
-
-MACRO_EXPANSION        = NO
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
-# then the macro expansion is limited to the macros specified with the
-# PREDEFINED and EXPAND_AS_DEFINED tags.
-
-EXPAND_ONLY_PREDEF     = NO
-
-# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
-# in the INCLUDE_PATH (see below) will be search if a #include is found.
-
-SEARCH_INCLUDES        = YES
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by
-# the preprocessor.
-
-INCLUDE_PATH           =
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
-# patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will
-# be used.
-
-INCLUDE_FILE_PATTERNS  =
-
-# The PREDEFINED tag can be used to specify one or more macro names that
-# are defined before the preprocessor is started (similar to the -D option of
-# gcc). The argument of the tag is a list of macros of the form: name
-# or name=definition (no spaces). If the definition and the = are
-# omitted =1 is assumed. To prevent a macro definition from being
-# undefined via #undef or recursively expanded use the := operator
-# instead of the = operator.
-
-PREDEFINED             =
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
-# this tag can be used to specify a list of macro names that should be expanded.
-# The macro definition that is found in the sources will be used.
-# Use the PREDEFINED tag if you want to use a different macro definition.
-
-EXPAND_AS_DEFINED      =
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
-# doxygen's preprocessor will remove all function-like macros that are alone
-# on a line, have an all uppercase name, and do not end with a semicolon. Such
-# function macros are typically used for boiler-plate code, and will confuse
-# the parser if not removed.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration::additions related to external references
-#---------------------------------------------------------------------------
-
-# The TAGFILES option can be used to specify one or more tagfiles.
-# Optionally an initial location of the external documentation
-# can be added for each tagfile. The format of a tag file without
-# this location is as follows:
-#
-# TAGFILES = file1 file2 ...
-# Adding location for the tag files is done as follows:
-#
-# TAGFILES = file1=loc1 "file2 = loc2" ...
-# where "loc1" and "loc2" can be relative or absolute paths or
-# URLs. If a location is present for each tag, the installdox tool
-# does not have to be run to correct the links.
-# Note that each tag file must have a unique name
-# (where the name does NOT include the path)
-# If a tag file is not located in the directory in which doxygen
-# is run, you must also specify the path to the tagfile here.
-
-TAGFILES               =
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create
-# a tag file that is based on the input files it reads.
-
-GENERATE_TAGFILE       =
-
-# If the ALLEXTERNALS tag is set to YES all external classes will be listed
-# in the class index. If set to NO only the inherited external classes
-# will be listed.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will
-# be listed.
-
-EXTERNAL_GROUPS        = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of `which perl').
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
-# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base
-# or super classes. Setting the tag to NO turns the diagrams off. Note that
-# this option is superseded by the HAVE_DOT option below. This is only a
-# fallback. It is recommended to install and use dot, since it yields more
-# powerful graphs.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see
-# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
-# If set to YES, the inheritance and collaboration graphs will hide
-# inheritance and usage relations if the target is undocumented
-# or is not a class.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz, a graph visualization
-# toolkit from AT&T and Lucent Bell Labs. The other options in this section
-# have no effect if this option is set to NO (the default)
-
-HAVE_DOT               = NO
-
-# By default doxygen will write a font called FreeSans.ttf to the output
-# directory and reference it in all dot files that doxygen generates. This
-# font does not include all possible unicode characters however, so when you need
-# these (or just want a differently looking font) you can specify the font name
-# using DOT_FONTNAME. You need need to make sure dot is able to find the font,
-# which can be done by putting it in a standard location or by setting the
-# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
-# containing the font.
-
-DOT_FONTNAME           = FreeSans
-
-# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
-# The default size is 10pt.
-
-DOT_FONTSIZE           = 10
-
-# By default doxygen will tell dot to use the output directory to look for the
-# FreeSans.ttf font (which doxygen will put there itself). If you specify a
-# different font using DOT_FONTNAME you can set the path where dot
-# can find it using this tag.
-
-DOT_FONTPATH           =
-
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect inheritance relations. Setting this tag to YES will force the
-# the CLASS_DIAGRAMS tag to NO.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect implementation dependencies (inheritance, containment, and
-# class references variables) of the class with other documented classes.
-
-COLLABORATION_GRAPH    = YES
-
-# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for groups, showing the direct groups dependencies
-
-GROUP_GRAPHS           = YES
-
-# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
-# collaboration diagrams in a style similar to the OMG's Unified Modeling
-# Language.
-
-UML_LOOK               = NO
-
-# If set to YES, the inheritance and collaboration graphs will show the
-# relations between templates and their instances.
-
-TEMPLATE_RELATIONS     = NO
-
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
-# tags are set to YES then doxygen will generate a graph for each documented
-# file showing the direct and indirect include dependencies of the file with
-# other documented files.
-
-INCLUDE_GRAPH          = YES
-
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
-# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
-# documented header file showing the documented files that directly or
-# indirectly include this file.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the CALL_GRAPH and HAVE_DOT options are set to YES then
-# doxygen will generate a call dependency graph for every global function
-# or class method. Note that enabling this option will significantly increase
-# the time of a run. So in most cases it will be better to enable call graphs
-# for selected functions only using the \callgraph command.
-
-CALL_GRAPH             = NO
-
-# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
-# doxygen will generate a caller dependency graph for every global function
-# or class method. Note that enabling this option will significantly increase
-# the time of a run. So in most cases it will be better to enable caller
-# graphs for selected functions only using the \callergraph command.
-
-CALLER_GRAPH           = NO
-
-# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
-# will graphical hierarchy of all classes instead of a textual one.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
-# then doxygen will show the dependencies a directory has on other directories
-# in a graphical way. The dependency relations are determined by the #include
-# relations between the files in the directories.
-
-DIRECTORY_GRAPH        = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot. Possible values are png, jpg, or gif
-# If left blank png will be used.
-
-DOT_IMAGE_FORMAT       = png
-
-# The tag DOT_PATH can be used to specify the path where the dot tool can be
-# found. If left blank, it is assumed the dot tool can be found in the path.
-
-DOT_PATH               =
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the
-# \dotfile command).
-
-DOTFILE_DIRS           =
-
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
-# nodes that will be shown in the graph. If the number of nodes in a graph
-# becomes larger than this value, doxygen will truncate the graph, which is
-# visualized by representing a node as a red box. Note that doxygen if the
-# number of direct children of the root node in a graph is already larger than
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
-# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
-
-DOT_GRAPH_MAX_NODES    = 50
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
-# graphs generated by dot. A depth value of 3 means that only nodes reachable
-# from the root by following a path via at most 3 edges will be shown. Nodes
-# that lay further from the root node will be omitted. Note that setting this
-# option to 1 or 2 may greatly reduce the computation time needed for large
-# code bases. Also note that the size of a graph can be further restricted by
-# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
-
-MAX_DOT_GRAPH_DEPTH    = 0
-
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not
-# seem to support this out of the box. Warning: Depending on the platform used,
-# enabling this option may lead to badly anti-aliased labels on the edges of
-# a graph (i.e. they become hard to read).
-
-DOT_TRANSPARENT        = NO
-
-# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
-# files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10)
-# support this, this feature is disabled by default.
-
-DOT_MULTI_TARGETS      = NO
-
-# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
-# generate a legend page explaining the meaning of the various boxes and
-# arrows in the dot generated graphs.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
-# remove the intermediate dot files that are used to generate
-# the various graphs.
-
-DOT_CLEANUP            = YES
diff --git a/nestegg/docs/Makefile.am b/nestegg/docs/Makefile.am
deleted file mode 100644
index 42cf8ee..0000000
--- a/nestegg/docs/Makefile.am
+++ /dev/null
@@ -1,38 +0,0 @@
-doc_DATA = doxygen-build.stamp
-
-EXTRA_DIST = Doxyfile.in
-
-if HAVE_DOXYGEN
-doxygen-build.stamp: Doxyfile
-	doxygen
-	touch doxygen-build.stamp
-else
-doxygen-build.stamp:
-	echo "*** Warning: Doxygen not found; documentation will not be built."
-	touch doxygen-build.stamp
-endif
-
-dist_docdir = $(distdir)/libnestegg
-
-dist-hook:
-	if test -d html; then \
-	  mkdir $(dist_docdir); \
-	  echo -n "copying built documenation..."; \
-	  cp -rp html $(dist_docdir)/html; \
-	  echo "OK"; \
-	fi
-
-
-install-data-local: doxygen-build.stamp
-	$(mkinstalldirs) $(DESTDIR)$(docdir)
-	if test -d html; then \
-	  cp -rp html $(DESTDIR)$(docdir)/html; \
-	fi
-
-uninstall-local:
-	rm -rf $(DESTDIR)$(docdir)
-
-clean-local:
-	if test -d html; then rm -rf html; fi
-	if test -f doxygen-build.stamp; then rm -f doxygen-build.stamp; fi
-
diff --git a/nestegg/m4/as-ac-expand.m4 b/nestegg/m4/as-ac-expand.m4
deleted file mode 100644
index d6c9e33..0000000
--- a/nestegg/m4/as-ac-expand.m4
+++ /dev/null
@@ -1,43 +0,0 @@
-dnl as-ac-expand.m4 0.2.0
-dnl autostars m4 macro for expanding directories using configure's prefix
-dnl thomas@apestaart.org
-
-dnl AS_AC_EXPAND(VAR, CONFIGURE_VAR)
-dnl example
-dnl AS_AC_EXPAND(SYSCONFDIR, $sysconfdir)
-dnl will set SYSCONFDIR to /usr/local/etc if prefix=/usr/local
-
-AC_DEFUN([AS_AC_EXPAND],
-[
-  EXP_VAR=[$1]
-  FROM_VAR=[$2]
-
-  dnl first expand prefix and exec_prefix if necessary
-  prefix_save=$prefix
-  exec_prefix_save=$exec_prefix
-
-  dnl if no prefix given, then use /usr/local, the default prefix
-  if test "x$prefix" = "xNONE"; then
-    prefix="$ac_default_prefix"
-  fi
-  dnl if no exec_prefix given, then use prefix
-  if test "x$exec_prefix" = "xNONE"; then
-    exec_prefix=$prefix
-  fi
-
-  full_var="$FROM_VAR"
-  dnl loop until it doesn't change anymore
-  while true; do
-    new_full_var="`eval echo $full_var`"
-    if test "x$new_full_var" = "x$full_var"; then break; fi
-    full_var=$new_full_var
-  done
-
-  dnl clean up
-  full_var=$new_full_var
-  AC_SUBST([$1], "$full_var")
-
-  dnl restore prefix and exec_prefix
-  prefix=$prefix_save
-  exec_prefix=$exec_prefix_save
-])
diff --git a/nestegg/m4/ax_create_stdint_h.m4 b/nestegg/m4/ax_create_stdint_h.m4
deleted file mode 100644
index 228105b..0000000
--- a/nestegg/m4/ax_create_stdint_h.m4
+++ /dev/null
@@ -1,695 +0,0 @@
-dnl @synopsis AX_CREATE_STDINT_H [( HEADER-TO-GENERATE [, HEDERS-TO-CHECK])]
-dnl
-dnl the "ISO C9X: 7.18 Integer types <stdint.h>" section requires the
-dnl existence of an include file <stdint.h> that defines a set of
-dnl typedefs, especially uint8_t,int32_t,uintptr_t. Many older
-dnl installations will not provide this file, but some will have the
-dnl very same definitions in <inttypes.h>. In other enviroments we can
-dnl use the inet-types in <sys/types.h> which would define the typedefs
-dnl int8_t and u_int8_t respectivly.
-dnl
-dnl This macros will create a local "_stdint.h" or the headerfile given
-dnl as an argument. In many cases that file will just "#include
-dnl <stdint.h>" or "#include <inttypes.h>", while in other environments
-dnl it will provide the set of basic 'stdint's definitions/typedefs:
-dnl
-dnl   int8_t,uint8_t,int16_t,uint16_t,int32_t,uint32_t,intptr_t,uintptr_t
-dnl   int_least32_t.. int_fast32_t.. intmax_t
-dnl
-dnl which may or may not rely on the definitions of other files, or
-dnl using the AC_CHECK_SIZEOF macro to determine the actual sizeof each
-dnl type.
-dnl
-dnl if your header files require the stdint-types you will want to
-dnl create an installable file mylib-int.h that all your other
-dnl installable header may include. So if you have a library package
-dnl named "mylib", just use
-dnl
-dnl      AX_CREATE_STDINT_H(mylib-int.h)
-dnl
-dnl in configure.ac and go to install that very header file in
-dnl Makefile.am along with the other headers (mylib.h) - and the
-dnl mylib-specific headers can simply use "#include <mylib-int.h>" to
-dnl obtain the stdint-types.
-dnl
-dnl Remember, if the system already had a valid <stdint.h>, the
-dnl generated file will include it directly. No need for fuzzy
-dnl HAVE_STDINT_H things... (oops, GCC 4.2.x has deliberatly disabled
-dnl its stdint.h for non-c99 compilation and the c99-mode is not the
-dnl default. Therefore this macro will not use the compiler's stdint.h
-dnl - please complain to the GCC developers).
-dnl
-dnl @category C
-dnl @author Guido U. Draheim <guidod@gmx.de>
-dnl @version 2006-10-13
-dnl @license GPLWithACException
-
-AC_DEFUN([AX_CHECK_DATA_MODEL],[
-   AC_CHECK_SIZEOF(char)
-   AC_CHECK_SIZEOF(short)
-   AC_CHECK_SIZEOF(int)
-   AC_CHECK_SIZEOF(long)
-   AC_CHECK_SIZEOF(void*)
-   ac_cv_char_data_model=""
-   ac_cv_char_data_model="$ac_cv_char_data_model$ac_cv_sizeof_char"
-   ac_cv_char_data_model="$ac_cv_char_data_model$ac_cv_sizeof_short"
-   ac_cv_char_data_model="$ac_cv_char_data_model$ac_cv_sizeof_int"
-   ac_cv_long_data_model=""
-   ac_cv_long_data_model="$ac_cv_long_data_model$ac_cv_sizeof_int"
-   ac_cv_long_data_model="$ac_cv_long_data_model$ac_cv_sizeof_long"
-   ac_cv_long_data_model="$ac_cv_long_data_model$ac_cv_sizeof_voidp"
-   AC_MSG_CHECKING([data model])
-   case "$ac_cv_char_data_model/$ac_cv_long_data_model" in
-    122/242)     ac_cv_data_model="IP16"  ; n="standard 16bit machine" ;;
-    122/244)     ac_cv_data_model="LP32"  ; n="standard 32bit machine" ;;
-    122/*)       ac_cv_data_model="i16"   ; n="unusual int16 model" ;;
-    124/444)     ac_cv_data_model="ILP32" ; n="standard 32bit unixish" ;;
-    124/488)     ac_cv_data_model="LP64"  ; n="standard 64bit unixish" ;;
-    124/448)     ac_cv_data_model="LLP64" ; n="unusual 64bit unixish" ;;
-    124/*)       ac_cv_data_model="i32"   ; n="unusual int32 model" ;;
-    128/888)     ac_cv_data_model="ILP64" ; n="unusual 64bit numeric" ;;
-    128/*)       ac_cv_data_model="i64"   ; n="unusual int64 model" ;;
-    222/*2)      ac_cv_data_model="DSP16" ; n="strict 16bit dsptype" ;;
-    333/*3)      ac_cv_data_model="DSP24" ; n="strict 24bit dsptype" ;;
-    444/*4)      ac_cv_data_model="DSP32" ; n="strict 32bit dsptype" ;;
-    666/*6)      ac_cv_data_model="DSP48" ; n="strict 48bit dsptype" ;;
-    888/*8)      ac_cv_data_model="DSP64" ; n="strict 64bit dsptype" ;;
-    222/*|333/*|444/*|666/*|888/*) :
-                 ac_cv_data_model="iDSP"  ; n="unusual dsptype" ;;
-     *)          ac_cv_data_model="none"  ; n="very unusual model" ;;
-   esac
-   AC_MSG_RESULT([$ac_cv_data_model ($ac_cv_long_data_model, $n)])
-])
-
-dnl AX_CHECK_HEADER_STDINT_X([HEADERLIST][,ACTION-IF])
-AC_DEFUN([AX_CHECK_HEADER_STDINT_X],[
-AC_CACHE_CHECK([for stdint uintptr_t], [ac_cv_header_stdint_x],[
- ac_cv_header_stdint_x="" # the 1997 typedefs (inttypes.h)
-  AC_MSG_RESULT([(..)])
-  for i in m4_ifval([$1],[$1],[stdint.h inttypes.h sys/inttypes.h sys/types.h])
-  do
-   unset ac_cv_type_uintptr_t
-   unset ac_cv_type_uint64_t
-   AC_CHECK_TYPE(uintptr_t,[ac_cv_header_stdint_x=$i],continue,[#include <$i>])
-   AC_CHECK_TYPE(uint64_t,[and64="/uint64_t"],[and64=""],[#include<$i>])
-   m4_ifvaln([$1],[$1]) break
-  done
-  AC_MSG_CHECKING([for stdint uintptr_t])
- ])
-])
-
-AC_DEFUN([AX_CHECK_HEADER_STDINT_O],[
-AC_CACHE_CHECK([for stdint uint32_t], [ac_cv_header_stdint_o],[
- ac_cv_header_stdint_o="" # the 1995 typedefs (sys/inttypes.h)
-  AC_MSG_RESULT([(..)])
-  for i in m4_ifval([$1],[$1],[inttypes.h sys/inttypes.h sys/types.h stdint.h])
-  do
-   unset ac_cv_type_uint32_t
-   unset ac_cv_type_uint64_t
-   AC_CHECK_TYPE(uint32_t,[ac_cv_header_stdint_o=$i],continue,[#include <$i>])
-   AC_CHECK_TYPE(uint64_t,[and64="/uint64_t"],[and64=""],[#include<$i>])
-   m4_ifvaln([$1],[$1]) break
-   break;
-  done
-  AC_MSG_CHECKING([for stdint uint32_t])
- ])
-])
-
-AC_DEFUN([AX_CHECK_HEADER_STDINT_U],[
-AC_CACHE_CHECK([for stdint u_int32_t], [ac_cv_header_stdint_u],[
- ac_cv_header_stdint_u="" # the BSD typedefs (sys/types.h)
-  AC_MSG_RESULT([(..)])
-  for i in m4_ifval([$1],[$1],[sys/types.h inttypes.h sys/inttypes.h]) ; do
-   unset ac_cv_type_u_int32_t
-   unset ac_cv_type_u_int64_t
-   AC_CHECK_TYPE(u_int32_t,[ac_cv_header_stdint_u=$i],continue,[#include <$i>])
-   AC_CHECK_TYPE(u_int64_t,[and64="/u_int64_t"],[and64=""],[#include<$i>])
-   m4_ifvaln([$1],[$1]) break
-   break;
-  done
-  AC_MSG_CHECKING([for stdint u_int32_t])
- ])
-])
-
-AC_DEFUN([AX_CREATE_STDINT_H],
-[# ------ AX CREATE STDINT H -------------------------------------
-AC_MSG_CHECKING([for stdint types])
-ac_stdint_h=`echo ifelse($1, , _stdint.h, $1)`
-# try to shortcircuit - if the default include path of the compiler
-# can find a "stdint.h" header then we assume that all compilers can.
-AC_CACHE_VAL([ac_cv_header_stdint_t],[
-old_CXXFLAGS="$CXXFLAGS" ; CXXFLAGS=""
-old_CPPFLAGS="$CPPFLAGS" ; CPPFLAGS=""
-old_CFLAGS="$CFLAGS"     ; CFLAGS=""
-AC_TRY_COMPILE([#include <stdint.h>],[int_least32_t v = 0;],
-[ac_cv_stdint_result="(assuming C99 compatible system)"
- ac_cv_header_stdint_t="stdint.h"; ],
-[ac_cv_header_stdint_t=""])
-if test "$GCC" = "yes" && test ".$ac_cv_header_stdint_t" = "."; then
-CFLAGS="-std=c99"
-AC_TRY_COMPILE([#include <stdint.h>],[int_least32_t v = 0;],
-[AC_MSG_WARN(your GCC compiler has a defunct stdint.h for its default-mode)])
-fi
-CXXFLAGS="$old_CXXFLAGS"
-CPPFLAGS="$old_CPPFLAGS"
-CFLAGS="$old_CFLAGS" ])
-
-v="... $ac_cv_header_stdint_h"
-if test "$ac_stdint_h" = "stdint.h" ; then
- AC_MSG_RESULT([(are you sure you want them in ./stdint.h?)])
-elif test "$ac_stdint_h" = "inttypes.h" ; then
- AC_MSG_RESULT([(are you sure you want them in ./inttypes.h?)])
-elif test "_$ac_cv_header_stdint_t" = "_" ; then
- AC_MSG_RESULT([(putting them into $ac_stdint_h)$v])
-else
- ac_cv_header_stdint="$ac_cv_header_stdint_t"
- AC_MSG_RESULT([$ac_cv_header_stdint (shortcircuit)])
-fi
-
-if test "_$ac_cv_header_stdint_t" = "_" ; then # can not shortcircuit..
-
-dnl .....intro message done, now do a few system checks.....
-dnl btw, all old CHECK_TYPE macros do automatically "DEFINE" a type,
-dnl therefore we use the autoconf implementation detail CHECK_TYPE_NEW
-dnl instead that is triggered with 3 or more arguments (see types.m4)
-
-inttype_headers=`echo $2 | sed -e 's/,/ /g'`
-
-ac_cv_stdint_result="(no helpful system typedefs seen)"
-AX_CHECK_HEADER_STDINT_X(dnl
-   stdint.h inttypes.h sys/inttypes.h $inttype_headers,
-   ac_cv_stdint_result="(seen uintptr_t$and64 in $i)")
-
-if test "_$ac_cv_header_stdint_x" = "_" ; then
-AX_CHECK_HEADER_STDINT_O(dnl,
-   inttypes.h sys/inttypes.h stdint.h $inttype_headers,
-   ac_cv_stdint_result="(seen uint32_t$and64 in $i)")
-fi
-
-if test "_$ac_cv_header_stdint_x" = "_" ; then
-if test "_$ac_cv_header_stdint_o" = "_" ; then
-AX_CHECK_HEADER_STDINT_U(dnl,
-   sys/types.h inttypes.h sys/inttypes.h $inttype_headers,
-   ac_cv_stdint_result="(seen u_int32_t$and64 in $i)")
-fi fi
-
-dnl if there was no good C99 header file, do some typedef checks...
-if test "_$ac_cv_header_stdint_x" = "_" ; then
-   AC_MSG_CHECKING([for stdint datatype model])
-   AC_MSG_RESULT([(..)])
-   AX_CHECK_DATA_MODEL
-fi
-
-if test "_$ac_cv_header_stdint_x" != "_" ; then
-   ac_cv_header_stdint="$ac_cv_header_stdint_x"
-elif  test "_$ac_cv_header_stdint_o" != "_" ; then
-   ac_cv_header_stdint="$ac_cv_header_stdint_o"
-elif  test "_$ac_cv_header_stdint_u" != "_" ; then
-   ac_cv_header_stdint="$ac_cv_header_stdint_u"
-else
-   ac_cv_header_stdint="stddef.h"
-fi
-
-AC_MSG_CHECKING([for extra inttypes in chosen header])
-AC_MSG_RESULT([($ac_cv_header_stdint)])
-dnl see if int_least and int_fast types are present in _this_ header.
-unset ac_cv_type_int_least32_t
-unset ac_cv_type_int_fast32_t
-AC_CHECK_TYPE(int_least32_t,,,[#include <$ac_cv_header_stdint>])
-AC_CHECK_TYPE(int_fast32_t,,,[#include<$ac_cv_header_stdint>])
-AC_CHECK_TYPE(intmax_t,,,[#include <$ac_cv_header_stdint>])
-
-fi # shortcircut to system "stdint.h"
-# ------------------ PREPARE VARIABLES ------------------------------
-if test "$GCC" = "yes" ; then
-ac_cv_stdint_message="using gnu compiler "`$CC --version | head -1`
-else
-ac_cv_stdint_message="using $CC"
-fi
-
-AC_MSG_RESULT([make use of $ac_cv_header_stdint in $ac_stdint_h dnl
-$ac_cv_stdint_result])
-
-dnl -----------------------------------------------------------------
-# ----------------- DONE inttypes.h checks START header -------------
-AC_CONFIG_COMMANDS([$ac_stdint_h],[
-AC_MSG_NOTICE(creating $ac_stdint_h : $_ac_stdint_h)
-ac_stdint=$tmp/_stdint.h
-
-echo "#ifndef" $_ac_stdint_h >$ac_stdint
-echo "#define" $_ac_stdint_h "1" >>$ac_stdint
-echo "#ifndef" _GENERATED_STDINT_H >>$ac_stdint
-echo "#define" _GENERATED_STDINT_H '"'$PACKAGE $VERSION'"' >>$ac_stdint
-echo "/* generated $ac_cv_stdint_message */" >>$ac_stdint
-if test "_$ac_cv_header_stdint_t" != "_" ; then
-echo "#define _STDINT_HAVE_STDINT_H" "1" >>$ac_stdint
-echo "#include <stdint.h>" >>$ac_stdint
-echo "#endif" >>$ac_stdint
-echo "#endif" >>$ac_stdint
-else
-
-cat >>$ac_stdint <<STDINT_EOF
-
-/* ................... shortcircuit part ........................... */
-
-#if defined HAVE_STDINT_H || defined _STDINT_HAVE_STDINT_H
-#include <stdint.h>
-#else
-#include <stddef.h>
-
-/* .................... configured part ............................ */
-
-STDINT_EOF
-
-echo "/* whether we have a C99 compatible stdint header file */" >>$ac_stdint
-if test "_$ac_cv_header_stdint_x" != "_" ; then
-  ac_header="$ac_cv_header_stdint_x"
-  echo "#define _STDINT_HEADER_INTPTR" '"'"$ac_header"'"' >>$ac_stdint
-else
-  echo "/* #undef _STDINT_HEADER_INTPTR */" >>$ac_stdint
-fi
-
-echo "/* whether we have a C96 compatible inttypes header file */" >>$ac_stdint
-if  test "_$ac_cv_header_stdint_o" != "_" ; then
-  ac_header="$ac_cv_header_stdint_o"
-  echo "#define _STDINT_HEADER_UINT32" '"'"$ac_header"'"' >>$ac_stdint
-else
-  echo "/* #undef _STDINT_HEADER_UINT32 */" >>$ac_stdint
-fi
-
-echo "/* whether we have a BSD compatible inet types header */" >>$ac_stdint
-if  test "_$ac_cv_header_stdint_u" != "_" ; then
-  ac_header="$ac_cv_header_stdint_u"
-  echo "#define _STDINT_HEADER_U_INT32" '"'"$ac_header"'"' >>$ac_stdint
-else
-  echo "/* #undef _STDINT_HEADER_U_INT32 */" >>$ac_stdint
-fi
-
-echo "" >>$ac_stdint
-
-if test "_$ac_header" != "_" ; then if test "$ac_header" != "stddef.h" ; then
-  echo "#include <$ac_header>" >>$ac_stdint
-  echo "" >>$ac_stdint
-fi fi
-
-echo "/* which 64bit typedef has been found */" >>$ac_stdint
-if test "$ac_cv_type_uint64_t" = "yes" ; then
-echo "#define   _STDINT_HAVE_UINT64_T" "1"  >>$ac_stdint
-else
-echo "/* #undef _STDINT_HAVE_UINT64_T */" >>$ac_stdint
-fi
-if test "$ac_cv_type_u_int64_t" = "yes" ; then
-echo "#define   _STDINT_HAVE_U_INT64_T" "1"  >>$ac_stdint
-else
-echo "/* #undef _STDINT_HAVE_U_INT64_T */" >>$ac_stdint
-fi
-echo "" >>$ac_stdint
-
-echo "/* which type model has been detected */" >>$ac_stdint
-if test "_$ac_cv_char_data_model" != "_" ; then
-echo "#define   _STDINT_CHAR_MODEL" "$ac_cv_char_data_model" >>$ac_stdint
-echo "#define   _STDINT_LONG_MODEL" "$ac_cv_long_data_model" >>$ac_stdint
-else
-echo "/* #undef _STDINT_CHAR_MODEL // skipped */" >>$ac_stdint
-echo "/* #undef _STDINT_LONG_MODEL // skipped */" >>$ac_stdint
-fi
-echo "" >>$ac_stdint
-
-echo "/* whether int_least types were detected */" >>$ac_stdint
-if test "$ac_cv_type_int_least32_t" = "yes"; then
-echo "#define   _STDINT_HAVE_INT_LEAST32_T" "1"  >>$ac_stdint
-else
-echo "/* #undef _STDINT_HAVE_INT_LEAST32_T */" >>$ac_stdint
-fi
-echo "/* whether int_fast types were detected */" >>$ac_stdint
-if test "$ac_cv_type_int_fast32_t" = "yes"; then
-echo "#define   _STDINT_HAVE_INT_FAST32_T" "1" >>$ac_stdint
-else
-echo "/* #undef _STDINT_HAVE_INT_FAST32_T */" >>$ac_stdint
-fi
-echo "/* whether intmax_t type was detected */" >>$ac_stdint
-if test "$ac_cv_type_intmax_t" = "yes"; then
-echo "#define   _STDINT_HAVE_INTMAX_T" "1" >>$ac_stdint
-else
-echo "/* #undef _STDINT_HAVE_INTMAX_T */" >>$ac_stdint
-fi
-echo "" >>$ac_stdint
-
-  cat >>$ac_stdint <<STDINT_EOF
-/* .................... detections part ............................ */
-
-/* whether we need to define bitspecific types from compiler base types */
-#ifndef _STDINT_HEADER_INTPTR
-#ifndef _STDINT_HEADER_UINT32
-#ifndef _STDINT_HEADER_U_INT32
-#define _STDINT_NEED_INT_MODEL_T
-#else
-#define _STDINT_HAVE_U_INT_TYPES
-#endif
-#endif
-#endif
-
-#ifdef _STDINT_HAVE_U_INT_TYPES
-#undef _STDINT_NEED_INT_MODEL_T
-#endif
-
-#ifdef  _STDINT_CHAR_MODEL
-#if     _STDINT_CHAR_MODEL+0 == 122 || _STDINT_CHAR_MODEL+0 == 124
-#ifndef _STDINT_BYTE_MODEL
-#define _STDINT_BYTE_MODEL 12
-#endif
-#endif
-#endif
-
-#ifndef _STDINT_HAVE_INT_LEAST32_T
-#define _STDINT_NEED_INT_LEAST_T
-#endif
-
-#ifndef _STDINT_HAVE_INT_FAST32_T
-#define _STDINT_NEED_INT_FAST_T
-#endif
-
-#ifndef _STDINT_HEADER_INTPTR
-#define _STDINT_NEED_INTPTR_T
-#ifndef _STDINT_HAVE_INTMAX_T
-#define _STDINT_NEED_INTMAX_T
-#endif
-#endif
-
-
-/* .................... definition part ............................ */
-
-/* some system headers have good uint64_t */
-#ifndef _HAVE_UINT64_T
-#if     defined _STDINT_HAVE_UINT64_T  || defined HAVE_UINT64_T
-#define _HAVE_UINT64_T
-#elif   defined _STDINT_HAVE_U_INT64_T || defined HAVE_U_INT64_T
-#define _HAVE_UINT64_T
-typedef u_int64_t uint64_t;
-#endif
-#endif
-
-#ifndef _HAVE_UINT64_T
-/* .. here are some common heuristics using compiler runtime specifics */
-#if defined __STDC_VERSION__ && defined __STDC_VERSION__ >= 199901L
-#define _HAVE_UINT64_T
-#define _HAVE_LONGLONG_UINT64_T
-typedef long long int64_t;
-typedef unsigned long long uint64_t;
-
-#elif !defined __STRICT_ANSI__
-#if defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__
-#define _HAVE_UINT64_T
-typedef __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-
-#elif defined __GNUC__ || defined __MWERKS__ || defined __ELF__
-/* note: all ELF-systems seem to have loff-support which needs 64-bit */
-#if !defined _NO_LONGLONG
-#define _HAVE_UINT64_T
-#define _HAVE_LONGLONG_UINT64_T
-typedef long long int64_t;
-typedef unsigned long long uint64_t;
-#endif
-
-#elif defined __alpha || (defined __mips && defined _ABIN32)
-#if !defined _NO_LONGLONG
-typedef long int64_t;
-typedef unsigned long uint64_t;
-#endif
-  /* compiler/cpu type to define int64_t */
-#endif
-#endif
-#endif
-
-#if defined _STDINT_HAVE_U_INT_TYPES
-/* int8_t int16_t int32_t defined by inet code, redeclare the u_intXX types */
-typedef u_int8_t uint8_t;
-typedef u_int16_t uint16_t;
-typedef u_int32_t uint32_t;
-
-/* glibc compatibility */
-#ifndef __int8_t_defined
-#define __int8_t_defined
-#endif
-#endif
-
-#ifdef _STDINT_NEED_INT_MODEL_T
-/* we must guess all the basic types. Apart from byte-adressable system, */
-/* there a few 32-bit-only dsp-systems that we guard with BYTE_MODEL 8-} */
-/* (btw, those nibble-addressable systems are way off, or so we assume) */
-
-dnl   /* have a look at "64bit and data size neutrality" at */
-dnl   /* http://unix.org/version2/whatsnew/login_64bit.html */
-dnl   /* (the shorthand "ILP" types always have a "P" part) */
-
-#if defined _STDINT_BYTE_MODEL
-#if _STDINT_LONG_MODEL+0 == 242
-/* 2:4:2 =  IP16 = a normal 16-bit system                */
-typedef unsigned char   uint8_t;
-typedef unsigned short  uint16_t;
-typedef unsigned long   uint32_t;
-#ifndef __int8_t_defined
-#define __int8_t_defined
-typedef          char    int8_t;
-typedef          short   int16_t;
-typedef          long    int32_t;
-#endif
-#elif _STDINT_LONG_MODEL+0 == 244 || _STDINT_LONG_MODEL == 444
-/* 2:4:4 =  LP32 = a 32-bit system derived from a 16-bit */
-/* 4:4:4 = ILP32 = a normal 32-bit system                */
-typedef unsigned char   uint8_t;
-typedef unsigned short  uint16_t;
-typedef unsigned int    uint32_t;
-#ifndef __int8_t_defined
-#define __int8_t_defined
-typedef          char    int8_t;
-typedef          short   int16_t;
-typedef          int     int32_t;
-#endif
-#elif _STDINT_LONG_MODEL+0 == 484 || _STDINT_LONG_MODEL+0 == 488
-/* 4:8:4 =  IP32 = a 32-bit system prepared for 64-bit    */
-/* 4:8:8 =  LP64 = a normal 64-bit system                 */
-typedef unsigned char   uint8_t;
-typedef unsigned short  uint16_t;
-typedef unsigned int    uint32_t;
-#ifndef __int8_t_defined
-#define __int8_t_defined
-typedef          char    int8_t;
-typedef          short   int16_t;
-typedef          int     int32_t;
-#endif
-/* this system has a "long" of 64bit */
-#ifndef _HAVE_UINT64_T
-#define _HAVE_UINT64_T
-typedef unsigned long   uint64_t;
-typedef          long    int64_t;
-#endif
-#elif _STDINT_LONG_MODEL+0 == 448
-/*      LLP64   a 64-bit system derived from a 32-bit system */
-typedef unsigned char   uint8_t;
-typedef unsigned short  uint16_t;
-typedef unsigned int    uint32_t;
-#ifndef __int8_t_defined
-#define __int8_t_defined
-typedef          char    int8_t;
-typedef          short   int16_t;
-typedef          int     int32_t;
-#endif
-/* assuming the system has a "long long" */
-#ifndef _HAVE_UINT64_T
-#define _HAVE_UINT64_T
-#define _HAVE_LONGLONG_UINT64_T
-typedef unsigned long long uint64_t;
-typedef          long long  int64_t;
-#endif
-#else
-#define _STDINT_NO_INT32_T
-#endif
-#else
-#define _STDINT_NO_INT8_T
-#define _STDINT_NO_INT32_T
-#endif
-#endif
-
-/*
- * quote from SunOS-5.8 sys/inttypes.h:
- * Use at your own risk.  As of February 1996, the committee is squarely
- * behind the fixed sized types; the "least" and "fast" types are still being
- * discussed.  The probability that the "fast" types may be removed before
- * the standard is finalized is high enough that they are not currently
- * implemented.
- */
-
-#if defined _STDINT_NEED_INT_LEAST_T
-typedef  int8_t    int_least8_t;
-typedef  int16_t   int_least16_t;
-typedef  int32_t   int_least32_t;
-#ifdef _HAVE_UINT64_T
-typedef  int64_t   int_least64_t;
-#endif
-
-typedef uint8_t   uint_least8_t;
-typedef uint16_t  uint_least16_t;
-typedef uint32_t  uint_least32_t;
-#ifdef _HAVE_UINT64_T
-typedef uint64_t  uint_least64_t;
-#endif
-  /* least types */
-#endif
-
-#if defined _STDINT_NEED_INT_FAST_T
-typedef  int8_t    int_fast8_t;
-typedef  int       int_fast16_t;
-typedef  int32_t   int_fast32_t;
-#ifdef _HAVE_UINT64_T
-typedef  int64_t   int_fast64_t;
-#endif
-
-typedef uint8_t   uint_fast8_t;
-typedef unsigned  uint_fast16_t;
-typedef uint32_t  uint_fast32_t;
-#ifdef _HAVE_UINT64_T
-typedef uint64_t  uint_fast64_t;
-#endif
-  /* fast types */
-#endif
-
-#ifdef _STDINT_NEED_INTMAX_T
-#ifdef _HAVE_UINT64_T
-typedef  int64_t       intmax_t;
-typedef uint64_t      uintmax_t;
-#else
-typedef          long  intmax_t;
-typedef unsigned long uintmax_t;
-#endif
-#endif
-
-#ifdef _STDINT_NEED_INTPTR_T
-#ifndef __intptr_t_defined
-#define __intptr_t_defined
-/* we encourage using "long" to store pointer values, never use "int" ! */
-#if   _STDINT_LONG_MODEL+0 == 242 || _STDINT_LONG_MODEL+0 == 484
-typedef  unsigned int   uintptr_t;
-typedef           int    intptr_t;
-#elif _STDINT_LONG_MODEL+0 == 244 || _STDINT_LONG_MODEL+0 == 444
-typedef  unsigned long  uintptr_t;
-typedef           long   intptr_t;
-#elif _STDINT_LONG_MODEL+0 == 448 && defined _HAVE_UINT64_T
-typedef        uint64_t uintptr_t;
-typedef         int64_t  intptr_t;
-#else /* matches typical system types ILP32 and LP64 - but not IP16 or LLP64 */
-typedef  unsigned long  uintptr_t;
-typedef           long   intptr_t;
-#endif
-#endif
-#endif
-
-/* The ISO C99 standard specifies that in C++ implementations these
-   should only be defined if explicitly requested.  */
-#if !defined __cplusplus || defined __STDC_CONSTANT_MACROS
-#ifndef UINT32_C
-
-/* Signed.  */
-# define INT8_C(c)      c
-# define INT16_C(c)     c
-# define INT32_C(c)     c
-# ifdef _HAVE_LONGLONG_UINT64_T
-#  define INT64_C(c)    c ## L
-# else
-#  define INT64_C(c)    c ## LL
-# endif
-
-/* Unsigned.  */
-# define UINT8_C(c)     c ## U
-# define UINT16_C(c)    c ## U
-# define UINT32_C(c)    c ## U
-# ifdef _HAVE_LONGLONG_UINT64_T
-#  define UINT64_C(c)   c ## UL
-# else
-#  define UINT64_C(c)   c ## ULL
-# endif
-
-/* Maximal type.  */
-# ifdef _HAVE_LONGLONG_UINT64_T
-#  define INTMAX_C(c)   c ## L
-#  define UINTMAX_C(c)  c ## UL
-# else
-#  define INTMAX_C(c)   c ## LL
-#  define UINTMAX_C(c)  c ## ULL
-# endif
-
-  /* literalnumbers */
-#endif
-#endif
-
-/* These limits are merily those of a two complement byte-oriented system */
-
-/* Minimum of signed integral types.  */
-# define INT8_MIN               (-128)
-# define INT16_MIN              (-32767-1)
-# define INT32_MIN              (-2147483647-1)
-# define INT64_MIN              (-__INT64_C(9223372036854775807)-1)
-/* Maximum of signed integral types.  */
-# define INT8_MAX               (127)
-# define INT16_MAX              (32767)
-# define INT32_MAX              (2147483647)
-# define INT64_MAX              (__INT64_C(9223372036854775807))
-
-/* Maximum of unsigned integral types.  */
-# define UINT8_MAX              (255)
-# define UINT16_MAX             (65535)
-# define UINT32_MAX             (4294967295U)
-# define UINT64_MAX             (__UINT64_C(18446744073709551615))
-
-/* Minimum of signed integral types having a minimum size.  */
-# define INT_LEAST8_MIN         INT8_MIN
-# define INT_LEAST16_MIN        INT16_MIN
-# define INT_LEAST32_MIN        INT32_MIN
-# define INT_LEAST64_MIN        INT64_MIN
-/* Maximum of signed integral types having a minimum size.  */
-# define INT_LEAST8_MAX         INT8_MAX
-# define INT_LEAST16_MAX        INT16_MAX
-# define INT_LEAST32_MAX        INT32_MAX
-# define INT_LEAST64_MAX        INT64_MAX
-
-/* Maximum of unsigned integral types having a minimum size.  */
-# define UINT_LEAST8_MAX        UINT8_MAX
-# define UINT_LEAST16_MAX       UINT16_MAX
-# define UINT_LEAST32_MAX       UINT32_MAX
-# define UINT_LEAST64_MAX       UINT64_MAX
-
-  /* shortcircuit*/
-#endif
-  /* once */
-#endif
-#endif
-STDINT_EOF
-fi
-    if cmp -s $ac_stdint_h $ac_stdint 2>/dev/null; then
-      AC_MSG_NOTICE([$ac_stdint_h is unchanged])
-    else
-      ac_dir=`AS_DIRNAME(["$ac_stdint_h"])`
-      AS_MKDIR_P(["$ac_dir"])
-      rm -f $ac_stdint_h
-      mv $ac_stdint $ac_stdint_h
-    fi
-],[# variables for create stdint.h replacement
-PACKAGE="$PACKAGE"
-VERSION="$VERSION"
-ac_stdint_h="$ac_stdint_h"
-_ac_stdint_h=AS_TR_CPP(_$PACKAGE-$ac_stdint_h)
-ac_cv_stdint_message="$ac_cv_stdint_message"
-ac_cv_header_stdint_t="$ac_cv_header_stdint_t"
-ac_cv_header_stdint_x="$ac_cv_header_stdint_x"
-ac_cv_header_stdint_o="$ac_cv_header_stdint_o"
-ac_cv_header_stdint_u="$ac_cv_header_stdint_u"
-ac_cv_type_uint64_t="$ac_cv_type_uint64_t"
-ac_cv_type_u_int64_t="$ac_cv_type_u_int64_t"
-ac_cv_char_data_model="$ac_cv_char_data_model"
-ac_cv_long_data_model="$ac_cv_long_data_model"
-ac_cv_type_int_least32_t="$ac_cv_type_int_least32_t"
-ac_cv_type_int_fast32_t="$ac_cv_type_int_fast32_t"
-ac_cv_type_intmax_t="$ac_cv_type_intmax_t"
-])
-])
diff --git a/nestegg/m4/pkg.m4 b/nestegg/m4/pkg.m4
deleted file mode 100644
index 996e294..0000000
--- a/nestegg/m4/pkg.m4
+++ /dev/null
@@ -1,157 +0,0 @@
-# pkg.m4 - Macros to locate and utilise pkg-config.            -*- Autoconf -*-
-#
-# Copyright © 2004 Scott James Remnant <scott@netsplit.com>.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# PKG_PROG_PKG_CONFIG([MIN-VERSION])
-# ----------------------------------
-AC_DEFUN([PKG_PROG_PKG_CONFIG],
-[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
-m4_pattern_allow([^PKG_CONFIG(_PATH)?$])
-AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])dnl
-if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
-	AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
-fi
-if test -n "$PKG_CONFIG"; then
-	_pkg_min_version=m4_default([$1], [0.9.0])
-	AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
-	if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
-		AC_MSG_RESULT([yes])
-	else
-		AC_MSG_RESULT([no])
-		PKG_CONFIG=""
-	fi
-
-fi[]dnl
-])# PKG_PROG_PKG_CONFIG
-
-# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
-#
-# Check to see whether a particular set of modules exists.  Similar
-# to PKG_CHECK_MODULES(), but does not set variables or print errors.
-#
-#
-# Similar to PKG_CHECK_MODULES, make sure that the first instance of
-# this or PKG_CHECK_MODULES is called, or make sure to call
-# PKG_CHECK_EXISTS manually
-# --------------------------------------------------------------
-AC_DEFUN([PKG_CHECK_EXISTS],
-[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
-if test -n "$PKG_CONFIG" && \
-    AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
-  m4_ifval([$2], [$2], [:])
-m4_ifvaln([$3], [else
-  $3])dnl
-fi])
-
-
-# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
-# ---------------------------------------------
-m4_define([_PKG_CONFIG],
-[if test -n "$PKG_CONFIG"; then
-    if test -n "$$1"; then
-        pkg_cv_[]$1="$$1"
-    else
-        PKG_CHECK_EXISTS([$3],
-                         [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`],
-			 [pkg_failed=yes])
-    fi
-else
-	pkg_failed=untried
-fi[]dnl
-])# _PKG_CONFIG
-
-# _PKG_SHORT_ERRORS_SUPPORTED
-# -----------------------------
-AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
-[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
-if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
-        _pkg_short_errors_supported=yes
-else
-        _pkg_short_errors_supported=no
-fi[]dnl
-])# _PKG_SHORT_ERRORS_SUPPORTED
-
-
-# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
-# [ACTION-IF-NOT-FOUND])
-#
-#
-# Note that if there is a possibility the first call to
-# PKG_CHECK_MODULES might not happen, you should be sure to include an
-# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
-#
-#
-# --------------------------------------------------------------
-AC_DEFUN([PKG_CHECK_MODULES],
-[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
-AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
-AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
-
-pkg_failed=no
-AC_MSG_CHECKING([for $1])
-
-_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
-_PKG_CONFIG([$1][_LIBS], [libs], [$2])
-
-m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
-and $1[]_LIBS to avoid the need to call pkg-config.
-See the pkg-config man page for more details.])
-
-if test $pkg_failed = yes; then
-        _PKG_SHORT_ERRORS_SUPPORTED
-        if test $_pkg_short_errors_supported = yes; then
-	        $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --errors-to-stdout --print-errors "$2"`
-        else
-	        $1[]_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "$2"`
-        fi
-	# Put the nasty error message in config.log where it belongs
-	echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
-
-	ifelse([$4], , [AC_MSG_ERROR(dnl
-[Package requirements ($2) were not met:
-
-$$1_PKG_ERRORS
-
-Consider adjusting the PKG_CONFIG_PATH environment variable if you
-installed software in a non-standard prefix.
-
-_PKG_TEXT
-])],
-		[AC_MSG_RESULT([no])
-                $4])
-elif test $pkg_failed = untried; then
-	ifelse([$4], , [AC_MSG_FAILURE(dnl
-[The pkg-config script could not be found or is too old.  Make sure it
-is in your PATH or set the PKG_CONFIG environment variable to the full
-path to pkg-config.
-
-_PKG_TEXT
-
-To get pkg-config, see <http://pkg-config.freedesktop.org/>.])],
-		[$4])
-else
-	$1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
-	$1[]_LIBS=$pkg_cv_[]$1[]_LIBS
-        AC_MSG_RESULT([yes])
-	ifelse([$3], , :, [$3])
-fi[]dnl
-])# PKG_CHECK_MODULES
diff --git a/nestegg/nestegg-uninstalled.pc.in b/nestegg/nestegg-uninstalled.pc.in
deleted file mode 100644
index 19bb680..0000000
--- a/nestegg/nestegg-uninstalled.pc.in
+++ /dev/null
@@ -1,13 +0,0 @@
-# nestegg uninstalled pkg-config file
-
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: nestegg
-Description: WebM/Matroska demuxer
-Version: @VERSION@
-Conflicts:
-Libs: -L${libdir} -lnestegg
-Cflags: -I${includedir}
diff --git a/nestegg/nestegg.pc.in b/nestegg/nestegg.pc.in
deleted file mode 100644
index 32c09d7..0000000
--- a/nestegg/nestegg.pc.in
+++ /dev/null
@@ -1,13 +0,0 @@
-# nestegg installed pkg-config file
-
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: nestegg
-Description: WebM/Matroska demuxer
-Version: @VERSION@
-Conflicts:
-Libs: -L${libdir} -lnestegg
-Cflags: -I${includedir}
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 80e87c8..7f9398c 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -24,6 +24,8 @@
 #include "test/encode_test_driver.h"
 namespace libvpx_test {
 
+const int kCodecFactoryParam = 0;
+
 class CodecFactory {
  public:
   CodecFactory() {}
diff --git a/test/cq_test.cc b/test/cq_test.cc
index a2c8291..7da7b80 100644
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -20,7 +20,7 @@
 const int kCQLevelMin = 4;
 const int kCQLevelMax = 63;
 const int kCQLevelStep = 8;
-const int kCQTargetBitrate = 2000;
+const unsigned int kCQTargetBitrate = 2000;
 
 class CQTest : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWithParam<int> {
@@ -66,17 +66,17 @@
     return pow(10.0, avg_psnr / 10.0) / file_size_;
   }
 
-  int file_size() const { return file_size_; }
+  size_t file_size() const { return file_size_; }
   int n_frames() const { return n_frames_; }
 
  private:
   int cq_level_;
-  int file_size_;
+  size_t file_size_;
   double psnr_;
   int n_frames_;
 };
 
-int prev_actual_bitrate = kCQTargetBitrate;
+unsigned int prev_actual_bitrate = kCQTargetBitrate;
 TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
@@ -88,7 +88,8 @@
                                      timebase.den, timebase.num, 0, 30);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   const double cq_psnr_lin = GetLinearPSNROverBitrate();
-  const int cq_actual_bitrate = file_size() * 8 * 30 / (n_frames() * 1000);
+  const unsigned int cq_actual_bitrate =
+      static_cast<unsigned int>(file_size()) * 8 * 30 / (n_frames() * 1000);
   EXPECT_LE(cq_actual_bitrate, kCQTargetBitrate);
   EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate);
   prev_actual_bitrate = cq_actual_bitrate;
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 5b0a548..39c9a5a 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -198,6 +198,7 @@
     last_pts_ = 0;
     bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
     frame_number_ = 0;
+    tot_frame_number_ = 0;
     first_drop_ = 0;
     num_drops_ = 0;
     // For testing up to 3 layers.
@@ -294,11 +295,22 @@
 
 
   virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    int layer = SetLayerId(frame_number_, cfg_.ts_number_layers);
-
     // Time since last timestamp = duration.
     vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
 
+    if (duration > 1) {
+      // If the first drop is not set and we have a drop, set it to this time.
+      if (!first_drop_)
+        first_drop_ = last_pts_ + 1;
+      // Update the number of frame drops.
+      num_drops_ += static_cast<int>(duration - 1);
+      // Update counter for total number of frames (#frames input to encoder).
+      // Needed for setting the proper layer_id below.
+      tot_frame_number_ += static_cast<int>(duration - 1);
+    }
+
+    int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
+
     // Add to the buffer the bits we'd expect from a constant bitrate server.
     bits_in_buffer_model_ += static_cast<int64_t>(
         duration * timebase_ * cfg_.rc_target_bitrate * 1000);
@@ -315,18 +327,10 @@
       bits_total_[i] += frame_size_in_bits;
     }
 
-    // If first drop not set and we have a drop set it to this time.
-    if (!first_drop_ && duration > 1)
-      first_drop_ = last_pts_ + 1;
-
-    // Update the number of frame drops.
-    if (duration > 1) {
-      num_drops_ += static_cast<int>(duration - 1);
-    }
-
     // Update the most recent pts.
     last_pts_ = pkt->data.frame.pts;
     ++frame_number_;
+    ++tot_frame_number_;
   }
 
   virtual void EndPassHook(void) {
@@ -342,7 +346,8 @@
 
   vpx_codec_pts_t last_pts_;
   double timebase_;
-  int frame_number_;
+  int frame_number_;      // Counter for number of non-dropped/encoded frames.
+  int tot_frame_number_;  // Counter for total number of input frames.
   int64_t bits_total_[3];
   double duration_;
   double effective_datarate_[3];
@@ -376,7 +381,6 @@
   }
 }
 
-#if CONFIG_NON420
 // Check basic rate targeting,
 TEST_P(DatarateTestVP9, BasicRateTargeting444) {
   ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
@@ -405,7 +409,6 @@
         << cfg_.rc_target_bitrate << " "<< effective_datarate_;
   }
 }
-#endif
 
 // Check that (1) the first dropped frame gets earlier and earlier
 // as the drop frame threshold is increased, and (2) that the total number of
@@ -493,10 +496,7 @@
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
-  // TODO(marpan): For now keep frame dropper off. Need to investigate an
-  // issue (rate-mismatch) that occcurs at speed 3 and low bitrate (200k) when
-  // frame dropper is on.
-  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_dropframe_thresh = 1;
   cfg_.rc_min_quantizer = 0;
   cfg_.rc_max_quantizer = 63;
   cfg_.rc_end_usage = VPX_CBR;
@@ -529,8 +529,53 @@
     }
   }
 }
+
+// Check basic rate targeting for 3 temporal layers, with frame dropping.
+// Only for one (low) bitrate with lower max_quantizer, and somewhat higher
+// frame drop threshold, to force frame dropping.
+TEST_P(DatarateTestVP9, BasicRateTargeting3TemporalLayersFrameDropping) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  // Set frame drop threshold and rc_max_quantizer to force some frame drops.
+  cfg_.rc_dropframe_thresh = 20;
+  cfg_.rc_max_quantizer = 45;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  // 40-20-40 bitrate allocation for 3 temporal layers.
+  cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.85)
+        << " The datarate for the file is lower than target by too much, "
+            "for layer: " << j;
+    ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.15)
+        << " The datarate for the file is greater than target by too much, "
+            "for layer: " << j;
+    // Expect some frame drops in this test: for this 200-frame test,
+    // expect at least 10% and not more than 50% drops.
+    ASSERT_GE(num_drops_, 20);
+    ASSERT_LE(num_drops_, 100);
+  }
+}
+
 VP8_INSTANTIATE_TEST_CASE(DatarateTest, ALL_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9,
                           ::testing::Values(::libvpx_test::kOnePassGood),
-                          ::testing::Range(1, 5));
+                          ::testing::Range(2, 5));
 }  // namespace
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 8d115fa..cb5562e 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -512,6 +512,14 @@
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct16x16_c,
+                   &vp9_idct16x16_256_add_neon, 0)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16DCT,
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index a2608ac..013f451 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -248,6 +248,16 @@
         make_tuple(&vp9_fdct32x32_c, &vp9_idct32x32_1024_add_c, 0),
         make_tuple(&vp9_fdct32x32_rd_c, &vp9_idct32x32_1024_add_c, 1)));
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_neon, 0),
+        make_tuple(&vp9_fdct32x32_rd_c,
+                   &vp9_idct32x32_1024_add_neon, 1)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans32x32Test,
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 44a0fc0..2734a45 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -76,6 +76,15 @@
     return detail ? detail : vpx_codec_error(&decoder_);
   }
 
+  // Passes the external frame buffer information to libvpx.
+  vpx_codec_err_t SetFrameBufferFunctions(
+      vpx_get_frame_buffer_cb_fn_t cb_get,
+      vpx_release_frame_buffer_cb_fn_t cb_release, void *user_priv) {
+    InitOnce();
+    return vpx_codec_set_frame_buffer_functions(
+        &decoder_, cb_get, cb_release, user_priv);
+  }
+
  protected:
   virtual vpx_codec_iface_t* CodecInterface() const = 0;
 
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 30c20e9..4cd9efb 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -16,8 +16,8 @@
 
 namespace {
 
-const int kMaxErrorFrames = 8;
-const int kMaxDroppableFrames = 8;
+const int kMaxErrorFrames = 12;
+const int kMaxDroppableFrames = 12;
 
 class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
@@ -175,6 +175,10 @@
   }
 }
 
+// Check for successful decoding and no encoder/decoder mismatch
+// if we lose (i.e., drop before decoding) a set of droppable
+// frames (i.e., frames that don't update any reference buffers).
+// Check both isolated and consecutive loss.
 TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
@@ -186,14 +190,18 @@
   init_flags_ = VPX_CODEC_USE_PSNR;
 
   libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                     timebase.den, timebase.num, 0, 30);
+                                     timebase.den, timebase.num, 0, 40);
 
   // Error resilient mode ON.
   cfg_.g_error_resilient = 1;
+  cfg_.kf_mode = VPX_KF_DISABLED;
 
-  // Set an arbitrary set of error frames same as droppable frames
-  unsigned int num_droppable_frames = 2;
-  unsigned int droppable_frame_list[] = {5, 16};
+  // Set an arbitrary set of error frames same as droppable frames.
+  // In addition to isolated loss/drop, add a long consecutive series
+  // (of size 9) of dropped frames.
+  unsigned int num_droppable_frames = 11;
+  unsigned int droppable_frame_list[] = {5, 16, 22, 23, 24, 25, 26, 27, 28,
+                                         29, 30};
   SetDroppableFrames(num_droppable_frames, droppable_frame_list);
   SetErrorFrames(num_droppable_frames, droppable_frame_list);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
@@ -202,7 +210,7 @@
             << GetMismatchFrames() << "\n";
   EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
 
-  // reset previously set error/droppable frames
+  // Reset the previously set error/droppable frames.
   Reset();
 
 #if 0
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
new file mode 100644
index 0000000..2e7adc1
--- /dev/null
+++ b/test/external_frame_buffer_test.cc
@@ -0,0 +1,466 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+
+namespace {
+
+const int kVideoNameParam = 1;
+const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
+
+struct ExternalFrameBuffer {
+  uint8_t *data;
+  size_t size;
+  int in_use;
+};
+
+// Class to manipulate a list of external frame buffers.
+class ExternalFrameBufferList {
+ public:
+  ExternalFrameBufferList()
+      : num_buffers_(0),
+        ext_fb_list_(NULL) {}
+
+  virtual ~ExternalFrameBufferList() {
+    for (int i = 0; i < num_buffers_; ++i) {
+      delete [] ext_fb_list_[i].data;
+    }
+    delete [] ext_fb_list_;
+  }
+
+  // Creates the list to hold the external buffers. Returns true on success.
+  bool CreateBufferList(int num_buffers) {
+    if (num_buffers < 0)
+      return false;
+
+    num_buffers_ = num_buffers;
+    ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
+    EXPECT_TRUE(ext_fb_list_ != NULL);
+    memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
+    return true;
+  }
+
+  // Searches the frame buffer list for a free frame buffer. Makes sure
+  // that the frame buffer is at least |min_size| in bytes. Marks that the
+  // frame buffer is in use by libvpx. Finally sets |fb| to point to the
+  // external frame buffer. Returns < 0 on an error.
+  int GetFreeFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    const int idx = FindFreeBufferIndex();
+    if (idx == num_buffers_)
+      return -1;
+
+    if (ext_fb_list_[idx].size < min_size) {
+      delete [] ext_fb_list_[idx].data;
+      ext_fb_list_[idx].data = new uint8_t[min_size];
+      ext_fb_list_[idx].size = min_size;
+    }
+
+    SetFrameBuffer(idx, fb);
+    return 0;
+  }
+
+  // Test function that will not allocate any data for the frame buffer.
+  // Returns < 0 on an error.
+  int GetZeroFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    const int idx = FindFreeBufferIndex();
+    if (idx == num_buffers_)
+      return -1;
+
+    if (ext_fb_list_[idx].size < min_size) {
+      delete [] ext_fb_list_[idx].data;
+      ext_fb_list_[idx].data = NULL;
+      ext_fb_list_[idx].size = min_size;
+    }
+
+    SetFrameBuffer(idx, fb);
+    return 0;
+  }
+
+  // Marks the external frame buffer that |fb| is pointing to as free.
+  // Returns < 0 on an error.
+  int ReturnFrameBuffer(vpx_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer*>(fb->priv);
+    EXPECT_TRUE(ext_fb != NULL);
+    EXPECT_EQ(1, ext_fb->in_use);
+    ext_fb->in_use = 0;
+    return 0;
+  }
+
+  // Checks that the ximage data is contained within the external frame buffer
+  // private data passed back in the ximage.
+  void CheckXImageFrameBuffer(const vpx_image_t *img) {
+    if (img->fb_priv != NULL) {
+      const struct ExternalFrameBuffer *const ext_fb =
+          reinterpret_cast<ExternalFrameBuffer*>(img->fb_priv);
+
+      ASSERT_TRUE(img->planes[0] >= ext_fb->data &&
+                  img->planes[0] < (ext_fb->data + ext_fb->size));
+    }
+  }
+
+ private:
+  // Returns the index of the first free frame buffer. Returns |num_buffers_|
+  // if there are no free frame buffers.
+  int FindFreeBufferIndex() {
+    int i;
+    // Find a free frame buffer.
+    for (i = 0; i < num_buffers_; ++i) {
+      if (!ext_fb_list_[i].in_use)
+        break;
+    }
+    return i;
+  }
+
+  // Sets |fb| to an external frame buffer. idx is the index into the frame
+  // buffer list.
+  void SetFrameBuffer(int idx, vpx_codec_frame_buffer_t *fb) {
+    ASSERT_TRUE(fb != NULL);
+    fb->data = ext_fb_list_[idx].data;
+    fb->size = ext_fb_list_[idx].size;
+    ASSERT_EQ(0, ext_fb_list_[idx].in_use);
+    ext_fb_list_[idx].in_use = 1;
+    fb->priv = &ext_fb_list_[idx];
+  }
+
+  int num_buffers_;
+  ExternalFrameBuffer *ext_fb_list_;
+};
+
+// Callback used by libvpx to request the application to return a frame
+// buffer of at least |min_size| in bytes.
+int get_vp9_frame_buffer(void *user_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->GetFreeFrameBuffer(min_size, fb);
+}
+
+// Callback used by libvpx to tell the application that |fb| is not needed
+// anymore.
+int release_vp9_frame_buffer(void *user_priv,
+                             vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->ReturnFrameBuffer(fb);
+}
+
+// Callback will not allocate data for frame buffer.
+int get_vp9_zero_frame_buffer(void *user_priv, size_t min_size,
+                              vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->GetZeroFrameBuffer(min_size, fb);
+}
+
+// Callback will allocate one less byte than |min_size|.
+int get_vp9_one_less_byte_frame_buffer(void *user_priv, size_t min_size,
+                                       vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->GetFreeFrameBuffer(min_size - 1, fb);
+}
+
+// Callback will not release the external frame buffer.
+int do_not_release_vp9_frame_buffer(void *user_priv,
+                                    vpx_codec_frame_buffer_t *fb) {
+  (void)user_priv;
+  (void)fb;
+  return 0;
+}
+
+// Class for testing passing in external frame buffers to libvpx.
+class ExternalFrameBufferMD5Test
+    : public ::libvpx_test::DecoderTest,
+      public ::libvpx_test::CodecTestWithParam<const char*> {
+ protected:
+  ExternalFrameBufferMD5Test()
+      : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)),
+        md5_file_(NULL),
+        num_buffers_(0) {}
+
+  virtual ~ExternalFrameBufferMD5Test() {
+    if (md5_file_ != NULL)
+      fclose(md5_file_);
+  }
+
+  virtual void PreDecodeFrameHook(
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    if (num_buffers_ > 0 && video.frame_number() == 0) {
+      // Have libvpx use frame buffers we create.
+      ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+      ASSERT_EQ(VPX_CODEC_OK,
+                decoder->SetFrameBufferFunctions(
+                    GetVp9FrameBuffer, ReleaseVP9FrameBuffer, this));
+    }
+  }
+
+  void OpenMD5File(const std::string &md5_file_name_) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
+        << md5_file_name_;
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     const unsigned int frame_number) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+
+    // Read correct md5 checksums.
+    const int res = fscanf(md5_file_, "%s  %s", expected_md5, junk);
+    ASSERT_NE(EOF, res) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    const char *const actual_md5 = md5_res.Get();
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+  }
+
+  // Callback to get a free external frame buffer. Return value < 0 is an
+  // error.
+  static int GetVp9FrameBuffer(void *user_priv, size_t min_size,
+                               vpx_codec_frame_buffer_t *fb) {
+    ExternalFrameBufferMD5Test *const md5Test =
+        reinterpret_cast<ExternalFrameBufferMD5Test*>(user_priv);
+    return md5Test->fb_list_.GetFreeFrameBuffer(min_size, fb);
+  }
+
+  // Callback to release an external frame buffer. Return value < 0 is an
+  // error.
+  static int ReleaseVP9FrameBuffer(void *user_priv,
+                                   vpx_codec_frame_buffer_t *fb) {
+    ExternalFrameBufferMD5Test *const md5Test =
+        reinterpret_cast<ExternalFrameBufferMD5Test*>(user_priv);
+    return md5Test->fb_list_.ReturnFrameBuffer(fb);
+  }
+
+  void set_num_buffers(int num_buffers) { num_buffers_ = num_buffers; }
+  int num_buffers() const { return num_buffers_; }
+
+ private:
+  FILE *md5_file_;
+  int num_buffers_;
+  ExternalFrameBufferList fb_list_;
+};
+
+// Class for testing passing in external frame buffers to libvpx.
+class ExternalFrameBufferTest : public ::testing::Test {
+ protected:
+  ExternalFrameBufferTest()
+      : video_(NULL),
+        decoder_(NULL),
+        num_buffers_(0) {}
+
+  virtual void SetUp() {
+    video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    vpx_codec_dec_cfg_t cfg = {0};
+    decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+  }
+
+  virtual void TearDown() {
+    delete decoder_;
+    delete video_;
+  }
+
+  // Passes the external frame buffer information to libvpx.
+  vpx_codec_err_t SetFrameBufferFunctions(
+      int num_buffers,
+      vpx_get_frame_buffer_cb_fn_t cb_get,
+      vpx_release_frame_buffer_cb_fn_t cb_release) {
+    if (num_buffers > 0) {
+      num_buffers_ = num_buffers;
+      EXPECT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+    }
+
+    return decoder_->SetFrameBufferFunctions(cb_get, cb_release, &fb_list_);
+  }
+
+  vpx_codec_err_t DecodeOneFrame() {
+    const vpx_codec_err_t res =
+        decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+    CheckDecodedFrames();
+    if (res == VPX_CODEC_OK)
+      video_->Next();
+    return res;
+  }
+
+  vpx_codec_err_t DecodeRemainingFrames() {
+    for (; video_->cxdata() != NULL; video_->Next()) {
+      const vpx_codec_err_t res =
+          decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+      if (res != VPX_CODEC_OK)
+        return res;
+      CheckDecodedFrames();
+    }
+    return VPX_CODEC_OK;
+  }
+
+ private:
+  void CheckDecodedFrames() {
+    libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
+    const vpx_image_t *img = NULL;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next()) != NULL) {
+      fb_list_.CheckXImageFrameBuffer(img);
+    }
+  }
+
+  libvpx_test::WebMVideoSource *video_;
+  libvpx_test::VP9Decoder *decoder_;
+  int num_buffers_;
+  ExternalFrameBufferList fb_list_;
+};
+
+// This test runs through the set of test vectors, and decodes them.
+// Libvpx will call into the application to allocate a frame buffer when
+// needed. The md5 checksums are computed for each frame in the video file.
+// If the md5 checksums match the correct md5 data, the test passes.
+// Otherwise, the test fails.
+TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
+  const std::string filename = GET_PARAM(kVideoNameParam);
+  libvpx_test::CompressedVideoSource *video = NULL;
+
+  // Number of buffers equals #VP9_MAXIMUM_REF_BUFFERS +
+  // #VPX_MAXIMUM_WORK_BUFFERS + four jitter buffers.
+  const int jitter_buffers = 4;
+  const int num_buffers =
+      VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+  set_num_buffers(num_buffers);
+
+#if CONFIG_VP8_DECODER
+  // Tell compiler we are not using kVP8TestVectors.
+  (void)libvpx_test::kVP8TestVectors;
+#endif
+
+  // Open compressed video file.
+  if (filename.substr(filename.length() - 3, 3) == "ivf") {
+    video = new libvpx_test::IVFVideoSource(filename);
+  } else {
+    video = new libvpx_test::WebMVideoSource(filename);
+  }
+  ASSERT_TRUE(video != NULL);
+  video->Init();
+
+  // Construct md5 file name.
+  const std::string md5_filename = filename + ".md5";
+  OpenMD5File(md5_filename);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  delete video;
+}
+
+TEST_F(ExternalFrameBufferTest, MinFrameBuffers) {
+  // Minimum number of external frame buffers for VP9 is
+  // #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS.
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, EightJitterBuffers) {
+  // Number of buffers equals #VP9_MAXIMUM_REF_BUFFERS +
+  // #VPX_MAXIMUM_WORK_BUFFERS + eight jitter buffers.
+  const int jitter_buffers = 8;
+  const int num_buffers =
+      VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) {
+  // Minimum number of external frame buffers for VP9 is
+  // #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS. Most files will
+  // only use 5 frame buffers at one time.
+  const int num_buffers = 2;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NoRelease) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
+                                    do_not_release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NullRealloc) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(num_buffers, get_vp9_zero_frame_buffer,
+                                    release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, ReallocOneLessByte) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_one_less_byte_frame_buffer,
+                release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, NullGetFunction) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_INVALID_PARAM,
+            SetFrameBufferFunctions(num_buffers, NULL,
+                                    release_vp9_frame_buffer));
+}
+
+TEST_F(ExternalFrameBufferTest, NullReleaseFunction) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_INVALID_PARAM,
+            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, NULL));
+}
+
+TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());
+  ASSERT_EQ(VPX_CODEC_ERROR,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+}
+
+VP9_INSTANTIATE_TEST_CASE(ExternalFrameBufferMD5Test,
+                          ::testing::ValuesIn(libvpx_test::kVP9TestVectors));
+}  // namespace
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index dc66687..127775c 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -286,6 +286,21 @@
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct4x4_c,
+                   &vp9_idct4x4_16_add_neon, 0)));
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_NEON, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 98aabe6..6f2d7d1 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -313,6 +313,20 @@
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_neon, 0)));
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_NEON, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8DCT,
diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h
index 3fbafbd..824a39d 100644
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@@ -94,14 +94,14 @@
   virtual const uint8_t *cxdata() const {
     return end_of_file_ ? NULL : compressed_frame_buf_;
   }
-  virtual const unsigned int frame_size() const { return frame_sz_; }
-  virtual const unsigned int frame_number() const { return frame_; }
+  virtual size_t frame_size() const { return frame_sz_; }
+  virtual unsigned int frame_number() const { return frame_; }
 
  protected:
   std::string file_name_;
   FILE *input_file_;
   uint8_t *compressed_frame_buf_;
-  unsigned int frame_sz_;
+  size_t frame_sz_;
   unsigned int frame_;
   bool end_of_file_;
 };
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 2a32410..8849ce6 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -140,6 +140,30 @@
         make_tuple(&vp9_idct4x4_16_add_c,
                    &vp9_idct4x4_1_add_c,
                    TX_4X4, 1)));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, PartialIDctTest,
+    ::testing::Values(
+        make_tuple(&vp9_idct32x32_1024_add_c,
+                   &vp9_idct32x32_1_add_neon,
+                   TX_32X32, 1),
+        make_tuple(&vp9_idct16x16_256_add_c,
+                   &vp9_idct16x16_10_add_neon,
+                   TX_16X16, 10),
+        make_tuple(&vp9_idct16x16_256_add_c,
+                   &vp9_idct16x16_1_add_neon,
+                   TX_16X16, 1),
+        make_tuple(&vp9_idct8x8_64_add_c,
+                   &vp9_idct8x8_10_add_neon,
+                   TX_8X8, 10),
+        make_tuple(&vp9_idct8x8_64_add_c,
+                   &vp9_idct8x8_1_add_neon,
+                   TX_8X8, 1),
+        make_tuple(&vp9_idct4x4_16_add_c,
+                   &vp9_idct4x4_1_add_neon,
+                   TX_4X4, 1)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, PartialIDctTest,
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 4a91b0b..401fa1d 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -296,6 +296,8 @@
 
 using std::tr1::make_tuple;
 
+//------------------------------------------------------------------------------
+// C functions
 #if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c;
 const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c;
@@ -364,16 +366,20 @@
                         make_tuple(8, 4, sad_8x4x4d_c),
                         make_tuple(4, 8, sad_4x8x4d_c),
                         make_tuple(4, 4, sad_4x4x4d_c)));
-#endif
+#endif  // CONFIG_VP9_ENCODER
 
-// ARM tests
+//------------------------------------------------------------------------------
+// ARM functions
 #if HAVE_MEDIA
+#if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_armv6 = vp8_sad16x16_armv6;
 INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values(
                         make_tuple(16, 16, sad_16x16_armv6)));
-
 #endif
+#endif
+
 #if HAVE_NEON
+#if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_neon = vp8_sad16x16_neon;
 const sad_m_by_n_fn_t sad_8x16_neon = vp8_sad8x16_neon;
 const sad_m_by_n_fn_t sad_16x8_neon = vp8_sad16x8_neon;
@@ -386,8 +392,10 @@
                         make_tuple(8, 8, sad_8x8_neon),
                         make_tuple(4, 4, sad_4x4_neon)));
 #endif
+#endif
 
-// X86 tests
+//------------------------------------------------------------------------------
+// x86 functions
 #if HAVE_MMX
 #if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_mmx = vp8_sad16x16_mmx;
@@ -437,9 +445,9 @@
 INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
                         make_tuple(4, 8, sad_4x8x4d_sse),
                         make_tuple(4, 4, sad_4x4x4d_sse)));
-#endif
-#endif
-#endif
+#endif  // CONFIG_USE_X86INC
+#endif  // CONFIG_VP9_ENCODER
+#endif  // HAVE_SSE
 
 #if HAVE_SSE2
 #if CONFIG_VP8_ENCODER
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
index 3434662..1b2f03f 100644
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc
@@ -193,6 +193,16 @@
         make_tuple(8, 8, sixtap_8x8_c),
         make_tuple(8, 4, sixtap_8x4_c),
         make_tuple(4, 4, sixtap_4x4_c)));
+#if HAVE_NEON
+const sixtap_predict_fn_t sixtap_16x16_neon = vp8_sixtap_predict16x16_neon;
+const sixtap_predict_fn_t sixtap_8x8_neon = vp8_sixtap_predict8x8_neon;
+const sixtap_predict_fn_t sixtap_8x4_neon = vp8_sixtap_predict8x4_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_neon),
+        make_tuple(8, 8, sixtap_8x8_neon),
+        make_tuple(8, 4, sixtap_8x4_neon)));
+#endif
 #if HAVE_MMX
 const sixtap_predict_fn_t sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx;
 const sixtap_predict_fn_t sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx;
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 9e242a2..3efb955 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -105,6 +105,11 @@
 INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
                         ::testing::Values(vp8_subtract_b_c));
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
+                        ::testing::Values(vp8_subtract_b_neon));
+#endif
+
 #if HAVE_MMX
 INSTANTIATE_TEST_CASE_P(MMX, SubtractBlockTest,
                         ::testing::Values(vp8_subtract_b_mmx));
diff --git a/test/svc_test.cc b/test/svc_test.cc
index 75659d5..2e56534 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -177,20 +177,48 @@
   codec_initialized_ = true;
 }
 
-TEST_F(SvcTest, SetQuantizers) {
-  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30");
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_quantizers(&svc_, NULL);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
+TEST_F(SvcTest, SetKeyFrameQuantizersOption) {
   svc_.spatial_layers = 2;
-  res = vpx_svc_set_quantizers(&svc_, "40");
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_,
+                                       "quantizers-keyframe=not-quantizers");
   EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  res = vpx_svc_set_quantizers(&svc_, "40,30");
+  vpx_svc_set_options(&svc_, "quantizers-keyframe=40,45");
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
+}
+
+TEST_F(SvcTest, SetQuantizers) {
+  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30", 0);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_quantizers(&svc_, NULL, 0);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  svc_.spatial_layers = 2;
+  res = vpx_svc_set_quantizers(&svc_, "40", 0);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
+}
+
+TEST_F(SvcTest, SetKeyFrameQuantizers) {
+  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,31", 1);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_quantizers(&svc_, NULL, 1);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_quantizers(&svc_, "40,30", 1);
   EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
@@ -221,7 +249,7 @@
 TEST_F(SvcTest, FirstFrameHasLayers) {
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
+  vpx_svc_set_quantizers(&svc_, "40,30", 0);
 
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -248,7 +276,7 @@
 TEST_F(SvcTest, EncodeThreeFrames) {
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
+  vpx_svc_set_quantizers(&svc_, "40,30", 0);
 
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -301,7 +329,7 @@
 TEST_F(SvcTest, GetLayerResolution) {
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
+  vpx_svc_set_quantizers(&svc_, "40,30", 0);
 
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index a287731..6f718ef 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -576,3 +576,15 @@
 54638c38009198c38c8f3b25c182b709b6c1fd2e  vp90-2-09-lf_deltas.webm.md5
 510d95f3beb3b51c572611fdaeeece12277dac30  vp90-2-10-show-existing-frame.webm
 14d631096f4bfa2d71f7f739aec1448fb3c33bad  vp90-2-10-show-existing-frame.webm.md5
+b4318e75f73a6a08992c7326de2fb589c2a794c7  vp90-2-11-size-351x287.webm
+b3c48382cf7d0454e83a02497c229d27720f9e20  vp90-2-11-size-351x287.webm.md5
+8e0096475ea2535bac71d3e2fc09e0c451c444df  vp90-2-11-size-351x288.webm
+19e003804ec1dfc5464813b32339a15d5ba7b42f  vp90-2-11-size-351x288.webm.md5
+40cd1d6a188d7a88b21ebac1e573d3f270ab261e  vp90-2-11-size-352x287.webm
+68f515abe3858fc1eded46c8e6b2f727d43b5331  vp90-2-11-size-352x287.webm.md5
+9a510769ff23db410880ec3029d433e87d17f7fc  vp90-2-12-droppable_1.ivf
+952eaac6eefa6f62179ed1db3e922fd42fecc624  vp90-2-12-droppable_1.ivf.md5
+9a510769ff23db410880ec3029d433e87d17f7fc  vp90-2-12-droppable_2.ivf
+92a756469fa438220524e7fa6ac1d38c89514d17  vp90-2-12-droppable_2.ivf.md5
+c21e97e4ba486520118d78b01a5cb6e6dc33e190  vp90-2-12-droppable_3.ivf
+601abc9e4176c70f82ac0381365e9b151fdd24cd  vp90-2-12-droppable_3.ivf.md5
diff --git a/test/test.mk b/test/test.mk
index a65decf..bf6d055 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -1,53 +1,56 @@
+LIBVPX_TEST_SRCS-yes += acm_random.h
 LIBVPX_TEST_SRCS-yes += clear_system_state.h
+LIBVPX_TEST_SRCS-yes += codec_factory.h
+LIBVPX_TEST_SRCS-yes += md5_helper.h
 LIBVPX_TEST_SRCS-yes += register_state_check.h
 LIBVPX_TEST_SRCS-yes += test.mk
-LIBVPX_TEST_SRCS-yes += acm_random.h
-LIBVPX_TEST_SRCS-yes += md5_helper.h
-LIBVPX_TEST_SRCS-yes += codec_factory.h
 LIBVPX_TEST_SRCS-yes += test_libvpx.cc
+LIBVPX_TEST_SRCS-yes += test_vectors.cc
+LIBVPX_TEST_SRCS-yes += test_vectors.h
 LIBVPX_TEST_SRCS-yes += util.h
 LIBVPX_TEST_SRCS-yes += video_source.h
-LIBVPX_TEST_SRCS-yes += test_vectors.h
-LIBVPX_TEST_SRCS-yes += test_vectors.cc
 
 ##
 ## BLACK BOX TESTS
 ##
 ## Black box tests only use the public API.
 ##
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += datarate_test.cc
-
-LIBVPX_TEST_SRCS-yes                   += encode_test_driver.cc
-LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += datarate_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
+
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
+
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
 
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
+LIBVPX_TEST_SRCS-yes                   += encode_test_driver.cc
+LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h
 
 ## WebM Parsing
-NESTEGG_SRCS                           += ../nestegg/halloc/halloc.h
-NESTEGG_SRCS                           += ../nestegg/halloc/src/align.h
-NESTEGG_SRCS                           += ../nestegg/halloc/src/halloc.c
-NESTEGG_SRCS                           += ../nestegg/halloc/src/hlist.h
-NESTEGG_SRCS                           += ../nestegg/include/nestegg/nestegg.h
-NESTEGG_SRCS                           += ../nestegg/src/nestegg.c
+NESTEGG_SRCS                           += ../third_party/nestegg/halloc/halloc.h
+NESTEGG_SRCS                           += ../third_party/nestegg/halloc/src/align.h
+NESTEGG_SRCS                           += ../third_party/nestegg/halloc/src/halloc.c
+NESTEGG_SRCS                           += ../third_party/nestegg/halloc/src/hlist.h
+NESTEGG_SRCS                           += ../third_party/nestegg/include/nestegg/nestegg.h
+NESTEGG_SRCS                           += ../third_party/nestegg/src/nestegg.c
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += $(NESTEGG_SRCS)
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
 
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += test_vector_test.cc
+
 # Currently we only support decoder perf tests for vp9
 ifeq ($(CONFIG_DECODE_PERF_TESTS)$(CONFIG_VP9_DECODER), yesyes)
 LIBVPX_TEST_SRCS-yes                   += decode_perf_test.cc
@@ -69,17 +72,16 @@
 LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
 endif
 
+LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
+
 LIBVPX_TEST_SRCS-yes                   += idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 
 endif # VP8
 
@@ -88,29 +90,30 @@
 
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
-LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
-
 # IDCT test currently depends on FDCT function
 LIBVPX_TEST_SRCS-yes                   += idct8x8_test.cc
 LIBVPX_TEST_SRCS-yes                   += partial_idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += superframe_test.cc
 LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
+LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
+
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
-
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 
 endif # VP9
 
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
 
-endif
+endif # CONFIG_SHARED
 
 
 ##
@@ -121,128 +124,128 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5
@@ -516,6 +519,8 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm
@@ -644,40 +649,50 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
 
 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # BBB VP9 streams
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 3227f52..aba8a3c 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -150,20 +150,18 @@
   "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
   "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
   "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
-  "vp90-2-05-resize.ivf",        "vp90-2-06-bilinear.webm",
-  "vp90-2-07-frame_parallel.webm",
-  "vp90-2-08-tile_1x2_frame_parallel.webm", "vp90-2-08-tile_1x2.webm",
-  "vp90-2-08-tile_1x4_frame_parallel.webm", "vp90-2-08-tile_1x4.webm",
-  "vp90-2-08-tile_1x8_frame_parallel.webm", "vp90-2-08-tile_1x8.webm",
-  "vp90-2-08-tile-4x4.webm", "vp90-2-08-tile-4x1.webm",
-  "vp90-2-09-subpixel-00.ivf",
-  "vp90-2-02-size-lf-1920x1080.webm",
-  "vp90-2-09-aq2.webm",
-  "vp90-2-09-lf_deltas.webm",
-  "vp90-2-10-show-existing-frame.webm",
-#if CONFIG_NON420
+  "vp90-2-05-resize.ivf", "vp90-2-06-bilinear.webm",
+  "vp90-2-07-frame_parallel.webm", "vp90-2-08-tile_1x2_frame_parallel.webm",
+  "vp90-2-08-tile_1x2.webm", "vp90-2-08-tile_1x4_frame_parallel.webm",
+  "vp90-2-08-tile_1x4.webm", "vp90-2-08-tile_1x8_frame_parallel.webm",
+  "vp90-2-08-tile_1x8.webm", "vp90-2-08-tile-4x4.webm",
+  "vp90-2-08-tile-4x1.webm", "vp90-2-09-subpixel-00.ivf",
+  "vp90-2-02-size-lf-1920x1080.webm", "vp90-2-09-aq2.webm",
+  "vp90-2-09-lf_deltas.webm", "vp90-2-10-show-existing-frame.webm",
+  "vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
+  "vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
+  "vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
   "vp91-2-04-yv444.webm"
-#endif
 };
 #endif  // CONFIG_VP9_DECODER
 
diff --git a/test/test_vectors.h b/test/test_vectors.h
index eb592de..d5ecc96 100644
--- a/test/test_vectors.h
+++ b/test/test_vectors.h
@@ -21,11 +21,7 @@
 #endif
 
 #if CONFIG_VP9_DECODER
-#if CONFIG_NON420
-const int kNumVp9TestVectors = 217;
-#else
-const int kNumVp9TestVectors = 216;
-#endif
+const int kNumVp9TestVectors = 223;
 
 extern const char *kVP9TestVectors[kNumVp9TestVectors];
 #endif  // CONFIG_VP9_DECODER
diff --git a/test/variance_test.cc b/test/variance_test.cc
index b9144ff..c9bf13a 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -307,6 +307,19 @@
                       make_tuple(4, 3, variance16x8_c),
                       make_tuple(4, 4, variance16x16_c)));
 
+#if HAVE_NEON
+const vp8_variance_fn_t variance8x8_neon = vp8_variance8x8_neon;
+const vp8_variance_fn_t variance8x16_neon = vp8_variance8x16_neon;
+const vp8_variance_fn_t variance16x8_neon = vp8_variance16x8_neon;
+const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP8VarianceTest,
+    ::testing::Values(make_tuple(3, 3, variance8x8_neon),
+                      make_tuple(3, 4, variance8x16_neon),
+                      make_tuple(4, 3, variance16x8_neon),
+                      make_tuple(4, 4, variance16x16_neon)));
+#endif
+
 #if HAVE_MMX
 const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
 const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx;
diff --git a/test/video_source.h b/test/video_source.h
index 3d01d39..6d1855a 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -184,9 +184,9 @@
 
   virtual const uint8_t *cxdata() const = 0;
 
-  virtual const unsigned int frame_size() const = 0;
+  virtual size_t frame_size() const = 0;
 
-  virtual const unsigned int frame_number() const = 0;
+  virtual unsigned int frame_number() const = 0;
 };
 
 }  // namespace libvpx_test
diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc
index 2282687..ad7ba44 100644
--- a/test/vp9_lossless_test.cc
+++ b/test/vp9_lossless_test.cc
@@ -73,7 +73,6 @@
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
-#if CONFIG_NON420
 TEST_P(LossLessTest, TestLossLessEncoding444) {
   libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 10);
 
@@ -90,7 +89,6 @@
   const double psnr_lossless = GetMinPsnr();
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
-#endif
 
 VP9_INSTANTIATE_TEST_CASE(LossLessTest, ALL_TEST_MODES);
 }  // namespace
diff --git a/test/webm_video_source.h b/test/webm_video_source.h
index 53b0ba2..9d88ae3 100644
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@@ -14,7 +14,7 @@
 #include <cstdlib>
 #include <new>
 #include <string>
-#include "nestegg/include/nestegg/nestegg.h"
+#include "third_party/nestegg/include/nestegg/nestegg.h"
 #include "test/video_source.h"
 
 namespace libvpx_test {
@@ -108,7 +108,7 @@
 
     nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb,
                      input_file_};
-    ASSERT_FALSE(nestegg_init(&nestegg_ctx_, io, NULL))
+    ASSERT_FALSE(nestegg_init(&nestegg_ctx_, io, NULL, -1))
         << "nestegg_init failed";
 
     unsigned int n;
@@ -169,8 +169,8 @@
   virtual const uint8_t *cxdata() const {
     return end_of_file_ ? NULL : buf_;
   }
-  virtual const unsigned int frame_size() const { return buf_sz_; }
-  virtual const unsigned int frame_number() const { return frame_; }
+  virtual size_t frame_size() const { return buf_sz_; }
+  virtual unsigned int frame_number() const { return frame_; }
 
  protected:
   std::string file_name_;
diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h
index 20d2be0..7419043 100644
--- a/test/y4m_video_source.h
+++ b/test/y4m_video_source.h
@@ -35,14 +35,11 @@
 
   virtual ~Y4mVideoSource() {
     vpx_img_free(img_.get());
-    y4m_input_close(&y4m_);
-    if (input_file_)
-      fclose(input_file_);
+    CloseSource();
   }
 
   virtual void Begin() {
-    if (input_file_)
-      fclose(input_file_);
+    CloseSource();
     input_file_ = OpenTestDataFile(file_name_);
     ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
         << file_name_;
@@ -89,6 +86,15 @@
   }
 
  protected:
+  void CloseSource() {
+    y4m_input_close(&y4m_);
+    y4m_ = y4m_input();
+    if (input_file_ != NULL) {
+      fclose(input_file_);
+      input_file_ = NULL;
+    }
+  }
+
   std::string file_name_;
   FILE *input_file_;
   testing::internal::scoped_ptr<vpx_image_t> img_;
diff --git a/third_party/nestegg/0001-include-paths.diff b/third_party/nestegg/0001-include-paths.diff
new file mode 100644
index 0000000..a704ebd
--- /dev/null
+++ b/third_party/nestegg/0001-include-paths.diff
@@ -0,0 +1,41 @@
+diff --git a/nestegg/halloc/src/halloc.c b/nestegg/halloc/src/halloc.c
+index 5758fc0..837b3ff 100644
+--- a/nestegg/halloc/src/halloc.c
++++ b/nestegg/halloc/src/halloc.c
+@@ -15,7 +15,7 @@
+ #include <stdlib.h>  /* realloc */
+ #include <string.h>  /* memset & co */
+ 
+-#include "halloc.h"
++#include "third_party/nestegg/halloc/halloc.h"
+ #include "align.h"
+ #include "hlist.h"
+ 
+diff --git a/nestegg/include/nestegg/nestegg.h b/nestegg/include/nestegg/nestegg.h
+index ff13728..c18d1d3 100644
+--- a/nestegg/include/nestegg/nestegg.h
++++ b/nestegg/include/nestegg/nestegg.h
+@@ -7,7 +7,7 @@
+ #if !defined(NESTEGG_671cac2a_365d_ed69_d7a3_4491d3538d79)
+ #define NESTEGG_671cac2a_365d_ed69_d7a3_4491d3538d79
+ 
+-#include <nestegg/nestegg-stdint.h>
++#include "vpx/vpx_integer.h"
+ 
+ #if defined(__cplusplus)
+ extern "C" {
+diff --git a/nestegg/src/nestegg.c b/nestegg/src/nestegg.c
+index daf1eed..4fb10e7 100644
+--- a/nestegg/src/nestegg.c
++++ b/nestegg/src/nestegg.c
+@@ -8,8 +8,8 @@
+ #include <stdlib.h>
+ #include <string.h>
+ 
+-#include "halloc.h"
+-#include "nestegg/nestegg.h"
++#include "third_party/nestegg/halloc/halloc.h"
++#include "third_party/nestegg/include/nestegg/nestegg.h"
+ 
+ /* EBML Elements */
+ #define ID_EBML                 0x1a45dfa3
diff --git a/third_party/nestegg/0002-ne_read_simple-uninitialized_variable.diff b/third_party/nestegg/0002-ne_read_simple-uninitialized_variable.diff
new file mode 100644
index 0000000..c3bc9e5
--- /dev/null
+++ b/third_party/nestegg/0002-ne_read_simple-uninitialized_variable.diff
@@ -0,0 +1,21 @@
+diff --git a/nestegg/src/nestegg.c b/nestegg/src/nestegg.c
+index 4fb10e7..b6bc460 100644
+--- a/nestegg/src/nestegg.c
++++ b/nestegg/src/nestegg.c
+@@ -934,7 +934,7 @@ static int
+ ne_read_simple(nestegg * ctx, struct ebml_element_desc * desc, size_t length)
+ {
+   struct ebml_type * storage;
+-  int r;
++  int r = 0;
+ 
+   storage = (struct ebml_type *) (ctx->ancestor->data + desc->offset);
+ 
+@@ -968,7 +968,6 @@ ne_read_simple(nestegg * ctx, struct ebml_element_desc * desc, size_t length)
+   case TYPE_MASTER:
+   case TYPE_UNKNOWN:
+     assert(0);
+-    r = 0;
+     break;
+   }
+ 
diff --git a/third_party/nestegg/AUTHORS b/third_party/nestegg/AUTHORS
new file mode 100644
index 0000000..7d2c612
--- /dev/null
+++ b/third_party/nestegg/AUTHORS
@@ -0,0 +1,3 @@
+Matthew Gregan <kinetik@flim.org>
+Steve Workman <sjhworkman@gmail.com>
+Paul Adenot <paul@paul.cx>
diff --git a/nestegg/INSTALL b/third_party/nestegg/INSTALL
similarity index 100%
rename from nestegg/INSTALL
rename to third_party/nestegg/INSTALL
diff --git a/nestegg/LICENSE b/third_party/nestegg/LICENSE
similarity index 100%
rename from nestegg/LICENSE
rename to third_party/nestegg/LICENSE
diff --git a/nestegg/README b/third_party/nestegg/README
similarity index 100%
rename from nestegg/README
rename to third_party/nestegg/README
diff --git a/third_party/nestegg/README.webm b/third_party/nestegg/README.webm
new file mode 100644
index 0000000..c931168
--- /dev/null
+++ b/third_party/nestegg/README.webm
@@ -0,0 +1,16 @@
+URL: https://github.com/kinetiknz/nestegg.git
+Version: f46223191d8116a36bf299b5b9793fcb798417b1
+License: ISC-style
+License File: LICENSE
+
+Description:
+The source under the halloc/ directory is licensed under a BSD license. See
+halloc/halloc.h for details.
+
+Local Modifications:
+- delete unnecessary docs and build files
+- 0001-include-paths.diff
+  include path modifications for the libvpx build system
+- 0002-ne_read_simple-uninitialized_variable.diff
+  fixes:
+nestegg.c|975 col 6| warning: ‘r’ may be used uninitialized in this function [-Wuninitialized]
diff --git a/nestegg/TODO b/third_party/nestegg/TODO
similarity index 100%
rename from nestegg/TODO
rename to third_party/nestegg/TODO
diff --git a/nestegg/halloc/README b/third_party/nestegg/halloc/README
similarity index 100%
rename from nestegg/halloc/README
rename to third_party/nestegg/halloc/README
diff --git a/nestegg/halloc/halloc.h b/third_party/nestegg/halloc/halloc.h
similarity index 100%
rename from nestegg/halloc/halloc.h
rename to third_party/nestegg/halloc/halloc.h
diff --git a/nestegg/halloc/src/align.h b/third_party/nestegg/halloc/src/align.h
similarity index 100%
rename from nestegg/halloc/src/align.h
rename to third_party/nestegg/halloc/src/align.h
diff --git a/nestegg/halloc/src/halloc.c b/third_party/nestegg/halloc/src/halloc.c
similarity index 98%
rename from nestegg/halloc/src/halloc.c
rename to third_party/nestegg/halloc/src/halloc.c
index 38fd6c1..8860d73 100644
--- a/nestegg/halloc/src/halloc.c
+++ b/third_party/nestegg/halloc/src/halloc.c
@@ -15,7 +15,7 @@
 #include <stdlib.h>  /* realloc */
 #include <string.h>  /* memset & co */
 
-#include "../halloc.h"
+#include "third_party/nestegg/halloc/halloc.h"
 #include "align.h"
 #include "hlist.h"
 
diff --git a/nestegg/halloc/src/hlist.h b/third_party/nestegg/halloc/src/hlist.h
similarity index 100%
rename from nestegg/halloc/src/hlist.h
rename to third_party/nestegg/halloc/src/hlist.h
diff --git a/nestegg/halloc/src/macros.h b/third_party/nestegg/halloc/src/macros.h
similarity index 100%
rename from nestegg/halloc/src/macros.h
rename to third_party/nestegg/halloc/src/macros.h
diff --git a/nestegg/include/nestegg/nestegg.h b/third_party/nestegg/include/nestegg/nestegg.h
similarity index 76%
rename from nestegg/include/nestegg/nestegg.h
rename to third_party/nestegg/include/nestegg/nestegg.h
index 6510694..c18d1d3 100644
--- a/nestegg/include/nestegg/nestegg.h
+++ b/third_party/nestegg/include/nestegg/nestegg.h
@@ -4,12 +4,12 @@
  * This program is made available under an ISC-style license.  See the
  * accompanying file LICENSE for details.
  */
-#ifndef   NESTEGG_671cac2a_365d_ed69_d7a3_4491d3538d79
-#define   NESTEGG_671cac2a_365d_ed69_d7a3_4491d3538d79
+#if !defined(NESTEGG_671cac2a_365d_ed69_d7a3_4491d3538d79)
+#define NESTEGG_671cac2a_365d_ed69_d7a3_4491d3538d79
 
 #include "vpx/vpx_integer.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus)
 extern "C" {
 #endif
 
@@ -17,10 +17,10 @@
 
     @section intro Introduction
 
-    This is the documentation fot the <tt>libnestegg</tt> C API.
+    This is the documentation for the <tt>libnestegg</tt> C API.
     <tt>libnestegg</tt> is a demultiplexing library for <a
-    href="http://www.matroska.org/">Matroska</a> and <a
-    href="http://www.webmproject.org/">WebMedia</a> media files.
+    href="http://www.webmproject.org/code/specs/container/">WebM</a>
+    media files.
 
     @section example Example code
 
@@ -68,6 +68,13 @@
 #define NESTEGG_CODEC_VP8    0 /**< Track uses Google On2 VP8 codec. */
 #define NESTEGG_CODEC_VORBIS 1 /**< Track uses Xiph Vorbis codec. */
 #define NESTEGG_CODEC_VP9    2 /**< Track uses Google On2 VP9 codec. */
+#define NESTEGG_CODEC_OPUS   3 /**< Track uses Xiph Opus codec. */
+
+#define NESTEGG_VIDEO_MONO              0 /**< Track is mono video. */
+#define NESTEGG_VIDEO_STEREO_LEFT_RIGHT 1 /**< Track is side-by-side stereo video.  Left first. */
+#define NESTEGG_VIDEO_STEREO_BOTTOM_TOP 2 /**< Track is top-bottom stereo video.  Right first. */
+#define NESTEGG_VIDEO_STEREO_TOP_BOTTOM 3 /**< Track is top-bottom stereo video.  Left first. */
+#define NESTEGG_VIDEO_STEREO_RIGHT_LEFT 11 /**< Track is side-by-side stereo video.  Right first. */
 
 #define NESTEGG_SEEK_SET 0 /**< Seek offset relative to beginning of stream. */
 #define NESTEGG_SEEK_CUR 1 /**< Seek offset relative to current position in stream. */
@@ -114,6 +121,10 @@
 
 /** Parameters specific to a video track. */
 typedef struct {
+  unsigned int stereo_mode;    /**< Video mode.  One of #NESTEGG_VIDEO_MONO,
+                                    #NESTEGG_VIDEO_STEREO_LEFT_RIGHT,
+                                    #NESTEGG_VIDEO_STEREO_BOTTOM_TOP, or
+                                    #NESTEGG_VIDEO_STEREO_TOP_BOTTOM. */
   unsigned int width;          /**< Width of the video frame in pixels. */
   unsigned int height;         /**< Height of the video frame in pixels. */
   unsigned int display_width;  /**< Display width of the video frame in pixels. */
@@ -129,6 +140,8 @@
   double rate;           /**< Sampling rate in Hz. */
   unsigned int channels; /**< Number of audio channels. */
   unsigned int depth;    /**< Bits per sample. */
+  uint64_t  codec_delay; /**< Nanoseconds that must be discarded from the start. */
+  uint64_t  seek_preroll;/**< Nanoseconds that must be discarded after a seek. */
 } nestegg_audio_params;
 
 /** Logging callback function pointer. */
@@ -140,9 +153,10 @@
     @param context  Storage for the new nestegg context.  @see nestegg_destroy
     @param io       User supplied IO context.
     @param callback Optional logging callback function pointer.  May be NULL.
+    @param max_offset Optional maximum offset to be read. Set -1 to ignore.
     @retval  0 Success.
     @retval -1 Error. */
-int nestegg_init(nestegg ** context, nestegg_io io, nestegg_log callback);
+int nestegg_init(nestegg ** context, nestegg_io io, nestegg_log callback, int64_t max_offset);
 
 /** Destroy a nestegg context and free associated memory.
     @param context #nestegg context to be freed.  @see nestegg_init */
@@ -171,6 +185,29 @@
     @retval -1 Error. */
 int nestegg_track_count(nestegg * context, unsigned int * tracks);
 
+/** Query the start and end offset for a particular cluster.
+    @param context     Stream context initialized by #nestegg_init.
+    @param cluster_num Zero-based cluster number; order they appear in cues.
+    @param max_offset  Optional maximum offset to be read. Set -1 to ignore.
+    @param start_pos   Starting offset of the cluster. -1 means non-existent.
+    @param end_pos     Ending offset of the cluster. -1 means non-existent or
+                       final cluster.
+    @param tstamp      Starting timestamp of the cluster.
+    @retval  0 Success.
+    @retval -1 Error. */
+int nestegg_get_cue_point(nestegg * context, unsigned int cluster_num,
+                          int64_t max_offset, int64_t * start_pos,
+                          int64_t * end_pos, uint64_t * tstamp);
+
+/** Seek to @a offset.  Stream will seek directly to offset.
+    Should be used to seek to the start of a resync point, i.e. cluster; the
+    parser will not be able to understand other offsets.
+    @param context Stream context initialized by #nestegg_init.
+    @param offset  Absolute offset in bytes.
+    @retval  0 Success.
+    @retval -1 Error. */
+int nestegg_offset_seek(nestegg * context, uint64_t offset);
+
 /** Seek @a track to @a tstamp.  Stream seek will terminate at the earliest
     key point in the stream at or before @a tstamp.  Other tracks in the
     stream will output packets with unspecified but nearby timestamps.
@@ -286,7 +323,30 @@
 int nestegg_packet_data(nestegg_packet * packet, unsigned int item,
                         unsigned char ** data, size_t * length);
 
-#ifdef __cplusplus
+/** Returns the discard padding for the given packet.
+    @param packet  Packet initialized by #nestegg_read_packet.
+    @param discard_padding pointer to store discard padding in.
+    @retval  0 Success.
+    @retval -1 Error. */
+int nestegg_packet_discard_padding(nestegg_packet * packet,
+                                   int64_t * discard_padding);
+
+/** Query the presence of cues.
+    @param context  Stream context initialized by #nestegg_init.
+    @retval 0 The media has no cues.
+    @retval 1 The media has cues. */
+int nestegg_has_cues(nestegg * context);
+
+/**
+ * Try to determine if the buffer looks like the beginning of a WebM file.
+ *
+ * @param buffer A buffer containing the beginning of a media file.
+ * @param length The size of the buffer.
+ * @retval 0 The file is not a WebM file.
+ * @retval 1 The file is a WebM file. */
+int nestegg_sniff(unsigned char const * buffer, size_t length);
+
+#if defined(__cplusplus)
 }
 #endif
 
diff --git a/nestegg/src/nestegg.c b/third_party/nestegg/src/nestegg.c
similarity index 77%
rename from nestegg/src/nestegg.c
rename to third_party/nestegg/src/nestegg.c
index ae87e8f..30e0e2b 100644
--- a/nestegg/src/nestegg.c
+++ b/third_party/nestegg/src/nestegg.c
@@ -8,8 +8,8 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "nestegg/halloc/halloc.h"
-#include "nestegg/include/nestegg/nestegg.h"
+#include "third_party/nestegg/halloc/halloc.h"
+#include "third_party/nestegg/include/nestegg/nestegg.h"
 
 /* EBML Elements */
 #define ID_EBML                 0x1a45dfa3
@@ -25,7 +25,7 @@
 #define ID_VOID                 0xec
 #define ID_CRC32                0xbf
 
-/* WebMedia Elements */
+/* WebM Elements */
 #define ID_SEGMENT              0x18538067
 
 /* Seek Head Elements */
@@ -49,6 +49,7 @@
 #define ID_BLOCK                0xa1
 #define ID_BLOCK_DURATION       0x9b
 #define ID_REFERENCE_BLOCK      0xfb
+#define ID_DISCARD_PADDING      0x75a2
 
 /* Tracks Elements */
 #define ID_TRACKS               0x1654ae6b
@@ -63,9 +64,12 @@
 #define ID_LANGUAGE             0x22b59c
 #define ID_CODEC_ID             0x86
 #define ID_CODEC_PRIVATE        0x63a2
+#define ID_CODEC_DELAY          0x56aa
+#define ID_SEEK_PREROLL         0x56bb
 
 /* Video Elements */
 #define ID_VIDEO                0xe0
+#define ID_STEREO_MODE          0x53b8
 #define ID_PIXEL_WIDTH          0xb0
 #define ID_PIXEL_HEIGHT         0xba
 #define ID_PIXEL_CROP_BOTTOM    0x54aa
@@ -129,6 +133,7 @@
 #define TRACK_ID_VP8            "V_VP8"
 #define TRACK_ID_VP9            "V_VP9"
 #define TRACK_ID_VORBIS         "A_VORBIS"
+#define TRACK_ID_OPUS           "A_OPUS"
 
 enum vint_mask {
   MASK_NONE,
@@ -192,6 +197,7 @@
 struct block_group {
   struct ebml_type duration;
   struct ebml_type reference_block;
+  struct ebml_type discard_padding;
 };
 
 struct cluster {
@@ -200,6 +206,7 @@
 };
 
 struct video {
+  struct ebml_type stereo_mode;
   struct ebml_type pixel_width;
   struct ebml_type pixel_height;
   struct ebml_type pixel_crop_bottom;
@@ -227,6 +234,8 @@
   struct ebml_type language;
   struct ebml_type codec_id;
   struct ebml_type codec_private;
+  struct ebml_type codec_delay;
+  struct ebml_type seek_preroll;
   struct video video;
   struct audio audio;
 };
@@ -274,6 +283,7 @@
   struct list_node * ancestor;
   uint64_t last_id;
   uint64_t last_size;
+  int last_valid;
 };
 
 struct frame {
@@ -289,6 +299,7 @@
   struct pool_ctx * alloc_pool;
   uint64_t last_id;
   uint64_t last_size;
+  int last_valid;
   struct list_node * ancestor;
   struct ebml ebml;
   struct segment segment;
@@ -300,6 +311,7 @@
   uint64_t track;
   uint64_t timecode;
   struct frame * frame;
+  int64_t discard_padding;
 };
 
 /* Element Descriptor */
@@ -341,7 +353,7 @@
   E_LAST
 };
 
-/* WebMedia Element Lists */
+/* WebM Element Lists */
 static struct ebml_element_desc ne_seek_elements[] = {
   E_FIELD(ID_SEEK_ID, TYPE_BINARY, struct seek, id),
   E_FIELD(ID_SEEK_POSITION, TYPE_UINT, struct seek, position),
@@ -363,6 +375,7 @@
   E_SUSPEND(ID_BLOCK, TYPE_BINARY),
   E_FIELD(ID_BLOCK_DURATION, TYPE_UINT, struct block_group, duration),
   E_FIELD(ID_REFERENCE_BLOCK, TYPE_INT, struct block_group, reference_block),
+  E_FIELD(ID_DISCARD_PADDING, TYPE_INT, struct block_group, discard_padding),
   E_LAST
 };
 
@@ -374,6 +387,7 @@
 };
 
 static struct ebml_element_desc ne_video_elements[] = {
+  E_FIELD(ID_STEREO_MODE, TYPE_UINT, struct video, stereo_mode),
   E_FIELD(ID_PIXEL_WIDTH, TYPE_UINT, struct video, pixel_width),
   E_FIELD(ID_PIXEL_HEIGHT, TYPE_UINT, struct video, pixel_height),
   E_FIELD(ID_PIXEL_CROP_BOTTOM, TYPE_UINT, struct video, pixel_crop_bottom),
@@ -403,6 +417,8 @@
   E_FIELD(ID_LANGUAGE, TYPE_STRING, struct track_entry, language),
   E_FIELD(ID_CODEC_ID, TYPE_STRING, struct track_entry, codec_id),
   E_FIELD(ID_CODEC_PRIVATE, TYPE_BINARY, struct track_entry, codec_private),
+  E_FIELD(ID_CODEC_DELAY, TYPE_UINT, struct track_entry, codec_delay),
+  E_FIELD(ID_SEEK_PREROLL, TYPE_UINT, struct track_entry, seek_preroll),
   E_SINGLE_MASTER(ID_VIDEO, TYPE_MASTER, struct track_entry, video),
   E_SINGLE_MASTER(ID_AUDIO, TYPE_MASTER, struct track_entry, audio),
   E_LAST
@@ -660,7 +676,7 @@
   } value;
   int r;
 
-  /* length == 10 not implemented */
+  /* Length == 10 not implemented. */
   if (length != 4 && length != 8)
     return -1;
   r = ne_read_uint(io, &value.u, length);
@@ -808,6 +824,7 @@
   s->ancestor = ctx->ancestor;
   s->last_id = ctx->last_id;
   s->last_size = ctx->last_size;
+  s->last_valid = ctx->last_valid;
   return 0;
 }
 
@@ -822,6 +839,7 @@
   ctx->ancestor = s->ancestor;
   ctx->last_id = s->last_id;
   ctx->last_size = s->last_size;
+  ctx->last_valid = s->last_valid;
   return 0;
 }
 
@@ -830,7 +848,7 @@
 {
   int r;
 
-  if (ctx->last_id && ctx->last_size) {
+  if (ctx->last_valid) {
     if (id)
       *id = ctx->last_id;
     if (size)
@@ -851,6 +869,8 @@
   if (size)
     *size = ctx->last_size;
 
+  ctx->last_valid = 1;
+
   return 1;
 }
 
@@ -863,8 +883,7 @@
   if (r != 1)
     return r;
 
-  ctx->last_id = 0;
-  ctx->last_size = 0;
+  ctx->last_valid = 0;
 
   return 1;
 }
@@ -915,7 +934,7 @@
 ne_read_simple(nestegg * ctx, struct ebml_element_desc * desc, size_t length)
 {
   struct ebml_type * storage;
-  int r;
+  int r = 0;
 
   storage = (struct ebml_type *) (ctx->ancestor->data + desc->offset);
 
@@ -930,8 +949,6 @@
   ctx->log(ctx, NESTEGG_LOG_DEBUG, "element %llx (%s) -> %p (%u)",
            desc->id, desc->name, storage, desc->offset);
 
-  r = -1;
-
   switch (desc->type) {
   case TYPE_UINT:
     r = ne_read_uint(ctx->io, &storage->v.u, length);
@@ -961,29 +978,26 @@
 }
 
 static int
-ne_parse(nestegg * ctx, struct ebml_element_desc * top_level)
+ne_parse(nestegg * ctx, struct ebml_element_desc * top_level, int64_t max_offset)
 {
   int r;
   int64_t * data_offset;
-  uint64_t id, size;
+  uint64_t id, size, peeked_id;
   struct ebml_element_desc * element;
 
-  /* loop until we need to return:
-     - hit suspend point
-     - parse complete
-     - error occurred */
-
-  /* loop over elements at current level reading them if sublevel found,
-     push ctx onto stack and continue if sublevel ended, pop ctx off stack
-     and continue */
-
   if (!ctx->ancestor)
     return -1;
 
   for (;;) {
+    if (max_offset > 0 && ne_io_tell(ctx->io) >= max_offset) {
+      /* Reached end of offset allowed for parsing - return gracefully */
+      r = 1;
+      break;
+    }
     r = ne_peek_element(ctx, &id, &size);
     if (r != 1)
       break;
+    peeked_id = id;
 
     element = ne_find_element(id, ctx->ancestor->node);
     if (element) {
@@ -997,6 +1011,7 @@
       r = ne_read_element(ctx, &id, &size);
       if (r != 1)
         break;
+      assert(id == peeked_id);
 
       if (element->flags & DESC_FLAG_OFFSET) {
         data_offset = (int64_t *) (ctx->ancestor->data + element->data_offset);
@@ -1106,7 +1121,7 @@
   if (*read + sum > block)
     return -1;
 
-  /* last frame is the remainder of the block */
+  /* Last frame is the remainder of the block. */
   sizes[i] = block - *read - sum;
   return 1;
 }
@@ -1143,7 +1158,7 @@
   if (*read + sum > block)
     return -1;
 
-  /* last frame is the remainder of the block */
+  /* Last frame is the remainder of the block. */
   sizes[i] = block - *read - sum;
   return 1;
 }
@@ -1159,6 +1174,37 @@
   return scale;
 }
 
+static int
+ne_map_track_number_to_index(nestegg * ctx,
+                             unsigned int track_number,
+                             unsigned int * track_index)
+{
+  struct ebml_list_node * node;
+  struct track_entry * t_entry;
+  uint64_t t_number = 0;
+
+  if (!track_index)
+    return -1;
+  *track_index = 0;
+
+  if (track_number == 0)
+    return -1;
+
+  node = ctx->segment.tracks.track_entry.head;
+  while (node) {
+    assert(node->id == ID_TRACK_ENTRY);
+    t_entry = node->data;
+    if (ne_get_uint(t_entry->number, &t_number) != 0)
+      return -1;
+    if (t_number == track_number)
+      return 0;
+    *track_index += 1;
+    node = node->next;
+  }
+
+  return -1;
+}
+
 static struct track_entry *
 ne_find_track_entry(nestegg * ctx, unsigned int track)
 {
@@ -1187,8 +1233,8 @@
   struct frame * f, * last;
   struct track_entry * entry;
   double track_scale;
-  uint64_t track, length, frame_sizes[256], cluster_tc, flags, frames, tc_scale, total;
-  unsigned int i, lacing;
+  uint64_t track_number, length, frame_sizes[256], cluster_tc, flags, frames, tc_scale, total;
+  unsigned int i, lacing, track;
   size_t consumed = 0;
 
   *data = NULL;
@@ -1196,11 +1242,11 @@
   if (block_size > LIMIT_BLOCK)
     return -1;
 
-  r = ne_read_vint(ctx->io, &track, &length);
+  r = ne_read_vint(ctx->io, &track_number, &length);
   if (r != 1)
     return r;
 
-  if (track == 0 || track > ctx->track_count)
+  if (track_number == 0)
     return -1;
 
   consumed += length;
@@ -1219,8 +1265,8 @@
 
   frames = 0;
 
-  /* flags are different between block and simpleblock, but lacing is
-     encoded the same way */
+  /* Flags are different between Block and SimpleBlock, but lacing is
+     encoded the same way. */
   lacing = (flags & BLOCK_FLAGS_LACING) >> 1;
 
   switch (lacing) {
@@ -1266,14 +1312,17 @@
     break;
   }
 
-  /* sanity check unlaced frame sizes against total block size. */
+  /* Sanity check unlaced frame sizes against total block size. */
   total = consumed;
   for (i = 0; i < frames; ++i)
     total += frame_sizes[i];
   if (total > block_size)
     return -1;
 
-  entry = ne_find_track_entry(ctx, (unsigned int)(track - 1));
+  if (ne_map_track_number_to_index(ctx, track_number, &track) != 0)
+    return -1;
+
+  entry = ne_find_track_entry(ctx, track);
   if (!entry)
     return -1;
 
@@ -1291,8 +1340,8 @@
     return -1;
 
   pkt = ne_alloc(sizeof(*pkt));
-  pkt->track = track - 1;
-  pkt->timecode = (uint64_t)(abs_timecode * tc_scale * track_scale);
+  pkt->track = track;
+  pkt->timecode = abs_timecode * tc_scale * track_scale;
 
   ctx->log(ctx, NESTEGG_LOG_DEBUG, "%sblock t %lld pts %f f %llx frames: %llu",
            block_id == ID_BLOCK ? "" : "simple", pkt->track, pkt->timecode / 1e9, flags, frames);
@@ -1326,6 +1375,35 @@
   return 1;
 }
 
+static int
+ne_read_discard_padding(nestegg * ctx, nestegg_packet * pkt)
+{
+  int r;
+  uint64_t id, size;
+  struct ebml_element_desc * element;
+  struct ebml_type * storage;
+
+  r = ne_peek_element(ctx, &id, &size);
+  if (r != 1)
+    return r;
+
+  if (id != ID_DISCARD_PADDING)
+    return 1;
+
+  element = ne_find_element(id, ctx->ancestor->node);
+  if (!element)
+    return 1;
+
+  r = ne_read_simple(ctx, element, size);
+  if (r != 1)
+    return r;
+  storage = (struct ebml_type *) (ctx->ancestor->data + element->offset);
+  pkt->discard_padding = storage->v.i;
+
+  return 1;
+}
+
+
 static uint64_t
 ne_buf_read_id(unsigned char const * p, size_t length)
 {
@@ -1369,8 +1447,33 @@
   return NULL;
 }
 
+static struct cue_track_positions *
+ne_find_cue_position_for_track(nestegg * ctx, struct ebml_list_node * node, unsigned int track)
+{
+  struct cue_track_positions * pos = NULL;
+  uint64_t track_number;
+  unsigned int t;
+
+  while (node) {
+    assert(node->id == ID_CUE_TRACK_POSITIONS);
+    pos = node->data;
+    if (ne_get_uint(pos->track, &track_number) != 0)
+      return NULL;
+
+    if (ne_map_track_number_to_index(ctx, track_number, &t) != 0)
+      return NULL;
+
+    if (t == track)
+      return pos;
+
+    node = node->next;
+  }
+
+  return NULL;
+}
+
 static struct cue_point *
-ne_find_cue_point_for_tstamp(struct ebml_list_node * cue_point, uint64_t scale, uint64_t tstamp)
+ne_find_cue_point_for_tstamp(nestegg * ctx, struct ebml_list_node * cue_point, unsigned int track, uint64_t scale, uint64_t tstamp)
 {
   uint64_t time;
   struct cue_point * c, * prev = NULL;
@@ -1385,7 +1488,9 @@
     if (ne_get_uint(c->time, &time) == 0 && time * scale > tstamp)
       break;
 
-    prev = cue_point->data;
+    if (ne_find_cue_position_for_track(ctx, c->cue_track_positions.head, track) != NULL)
+      prev = c;
+
     cue_point = cue_point->next;
   }
 
@@ -1395,7 +1500,6 @@
 static int
 ne_is_suspend_element(uint64_t id)
 {
-  /* this could search the tree of elements for DESC_FLAG_SUSPEND */
   if (id == ID_SIMPLE_BLOCK || id == ID_BLOCK)
     return 1;
   return 0;
@@ -1408,14 +1512,180 @@
     return;
 }
 
+static int
+ne_init_cue_points(nestegg * ctx, int64_t max_offset)
+{
+  int r;
+  struct ebml_list_node * node = ctx->segment.cues.cue_point.head;
+  struct seek * found;
+  uint64_t seek_pos, id;
+  struct saved_state state;
+
+  /* Make sure cue points are loaded.  If none are, find the Cues element via
+     the seek head and parse it.  Returns 0 on success and -1 on failure. */
+  if (!node) {
+    found = ne_find_seek_for_id(ctx->segment.seek_head.head, ID_CUES);
+    if (!found)
+      return -1; /* the seek head has no entry for Cues */
+
+    if (ne_get_uint(found->position, &seek_pos) != 0)
+      return -1;
+
+    /* Save the parser state so it can be restored after the detour. */
+    r = ne_ctx_save(ctx, &state);
+    if (r != 0)
+      return -1;
+
+    /* Seek to the Cues element; seek_pos is relative to segment_offset. */
+    r = ne_io_seek(ctx->io, ctx->segment_offset + seek_pos, NESTEGG_SEEK_SET);
+    if (r != 0)
+      return -1; /* NOTE(review): exits without ne_ctx_restore -- confirm */
+    ctx->last_valid = 0;
+
+    r = ne_read_element(ctx, &id, NULL);
+    if (r != 1)
+      return -1; /* NOTE(review): exits without ne_ctx_restore -- confirm */
+
+    if (id != ID_CUES)
+      return -1; /* NOTE(review): exits without ne_ctx_restore -- confirm */
+
+    ctx->ancestor = NULL;
+    ne_ctx_push(ctx, ne_top_level_elements, ctx);
+    ne_ctx_push(ctx, ne_segment_elements, &ctx->segment);
+    ne_ctx_push(ctx, ne_cues_elements, &ctx->segment.cues);
+    /* parser will run until end of cues element. */
+    ctx->log(ctx, NESTEGG_LOG_DEBUG, "seek: parsing cue elements");
+    r = ne_parse(ctx, ne_cues_elements, max_offset);
+    while (ctx->ancestor)
+      ne_ctx_pop(ctx);
+
+    /* Reset parser state to original state and seek back to old position. */
+    if (ne_ctx_restore(ctx, &state) != 0)
+      return -1;
+
+    if (r < 0)
+      return -1; /* parse failed; the state was already restored above */
+
+    node = ctx->segment.cues.cue_point.head;
+    if (!node)
+      return -1; /* parsing did not fail but produced no cue points */
+  }
+
+  return 0;
+}
+
+/* Three functions that implement the nestegg_io interface, operating on a
+ * sniff_buffer (a caller-supplied byte range plus a read cursor). */
+struct sniff_buffer {
+  unsigned char const * buffer; /* start of the bytes being sniffed */
+  size_t length;                /* number of bytes available at buffer */
+  int64_t offset;               /* current read position within buffer */
+};
+
+/* nestegg_io read callback for a sniff_buffer: copies length bytes, advances
+   the offset and returns 1, or returns 0 untouched if fewer bytes remain. */
+static int
+ne_buffer_read(void * buffer, size_t length, void * user_data)
+{
+  struct sniff_buffer * sb = user_data;
+  size_t remaining = sb->length - sb->offset;
+
+  if (length > remaining)
+    return 0;
+
+  memcpy(buffer, sb->buffer + sb->offset, length);
+  sb->offset += length;
+
+  return 1;
+}
+
+/* nestegg_io seek callback for a sniff_buffer.  Returns 0 on success, or -1
+   with the offset left unchanged when whence is not NESTEGG_SEEK_SET/CUR/END
+   or the target lies outside [0, length]. */
+static int
+ne_buffer_seek(int64_t offset, int whence, void * user_data)
+{
+  struct sniff_buffer * sb = user_data;
+  int64_t target;
+
+  switch (whence) {
+    case NESTEGG_SEEK_SET: target = offset; break;
+    case NESTEGG_SEEK_CUR: target = sb->offset + offset; break;
+    case NESTEGG_SEEK_END: target = (int64_t) sb->length + offset; break;
+    /* An unknown whence used to be a silent no-op reported as success. */
+    default: return -1;
+  }
+
+  /* A target equal to length (one past the last byte) is accepted. */
+  if (target < 0 || target > (int64_t) sb->length)
+    return -1;
+
+  sb->offset = target;
+  return 0;
+}
+
+static int64_t
+ne_buffer_tell(void * user_data)
+{
+  /* nestegg_io tell callback: report the sniff_buffer's current offset. */
+  return ((struct sniff_buffer *) user_data)->offset;
+}
+
+/* Decide whether the stream behind io looks like WebM: its first element must
+   be an EBML header and the parsed doctype must be "webm".  Returns 1 on a
+   match, 0 otherwise, and -1 if any of the read/seek/tell callbacks is
+   missing.  max_offset is passed through to ne_parse. */
+static int
+ne_match_webm(nestegg_io io, int64_t max_offset)
+{
+  int matched;
+  uint64_t id;
+  char * doctype;
+  nestegg * ctx;
+
+  if (!io.read || !io.seek || !io.tell)
+    return -1;
+
+  /* Build a throwaway context around a private copy of io.
+     NOTE(review): neither ne_alloc result is checked before it is
+     dereferenced; confirm whether ne_alloc can return NULL. */
+  ctx = ne_alloc(sizeof(*ctx));
+  ctx->io = ne_alloc(sizeof(*ctx->io));
+  *ctx->io = io;
+  ctx->alloc_pool = ne_pool_init();
+  ctx->log = ne_null_log_callback;
+
+  /* Anything other than a readable EBML element at the start of the stream
+     means "no match". */
+  if (ne_peek_element(ctx, &id, NULL) != 1 || id != ID_EBML) {
+    nestegg_destroy(ctx);
+    return 0;
+  }
+
+  ne_ctx_push(ctx, ne_top_level_elements, ctx);
+
+  /* The result of ne_parse is intentionally ignored: it may fail merely
+     because max_offset is not on a valid element end point.  Only the EBML
+     ID (checked above) and the doctype (checked below) matter. */
+  ne_parse(ctx, NULL, max_offset);
+
+  /* A doctype that cannot be read counts as "no match" as well. */
+  matched = ne_get_string(ctx->ebml.doctype, &doctype) == 0 &&
+            strcmp(doctype, "webm") == 0;
+
+  nestegg_destroy(ctx);
+
+  return matched;
+}
+
 int
-nestegg_init(nestegg ** context, nestegg_io io, nestegg_log callback)
+nestegg_init(nestegg ** context, nestegg_io io, nestegg_log callback, int64_t max_offset)
 {
   int r;
   uint64_t id, version, docversion;
   struct ebml_list_node * track;
   char * doctype;
-  nestegg * ctx = NULL;
+  nestegg * ctx;
 
   if (!(io.read && io.seek && io.tell))
     return -1;
@@ -1445,7 +1715,7 @@
 
   ne_ctx_push(ctx, ne_top_level_elements, ctx);
 
-  r = ne_parse(ctx, NULL);
+  r = ne_parse(ctx, NULL, max_offset);
 
   if (r != 1) {
     nestegg_destroy(ctx);
@@ -1531,98 +1801,130 @@
 }
 
 int
-nestegg_track_seek(nestegg * ctx, unsigned int track, uint64_t tstamp)
+nestegg_get_cue_point(nestegg * ctx, unsigned int cluster_num, int64_t max_offset,
+                      int64_t * start_pos, int64_t * end_pos, uint64_t * tstamp)
 {
-  int r;
+  int range_obtained = 0;
+  unsigned int cluster_count = 0;
   struct cue_point * cue_point;
   struct cue_track_positions * pos;
-  struct saved_state state;
-  struct seek * found;
-  uint64_t seek_pos, tc_scale, t, id;
-  struct ebml_list_node * node = ctx->segment.cues.cue_point.head;
+  uint64_t seek_pos, track_number, tc_scale, time;
+  struct ebml_list_node * cues_node = ctx->segment.cues.cue_point.head;
+  struct ebml_list_node * cue_pos_node = NULL;
+  unsigned int track = 0, track_count = 0, track_index;
 
-  /* If there are no cues loaded, check for cues element in the seek head
-     and load it. */
-  if (!node) {
-    found = ne_find_seek_for_id(ctx->segment.seek_head.head, ID_CUES);
-    if (!found)
-      return -1;
+  if (!start_pos || !end_pos || !tstamp)
+    return -1;
 
-    if (ne_get_uint(found->position, &seek_pos) != 0)
-      return -1;
+  /* Initialise return values */
+  *start_pos = -1;
+  *end_pos = -1;
+  *tstamp = 0;
 
-    /* Save old parser state. */
-    r = ne_ctx_save(ctx, &state);
-    if (r != 0)
-      return -1;
-
-    /* Seek and set up parser state for segment-level element (Cues). */
-    r = ne_io_seek(ctx->io, ctx->segment_offset + seek_pos, NESTEGG_SEEK_SET);
-    if (r != 0)
-      return -1;
-    ctx->last_id = 0;
-    ctx->last_size = 0;
-
-    r = ne_read_element(ctx, &id, NULL);
-    if (r != 1)
-      return -1;
-
-    if (id != ID_CUES)
-      return -1;
-
-    ctx->ancestor = NULL;
-    ne_ctx_push(ctx, ne_top_level_elements, ctx);
-    ne_ctx_push(ctx, ne_segment_elements, &ctx->segment);
-    ne_ctx_push(ctx, ne_cues_elements, &ctx->segment.cues);
-    /* parser will run until end of cues element. */
-    ctx->log(ctx, NESTEGG_LOG_DEBUG, "seek: parsing cue elements");
-    r = ne_parse(ctx, ne_cues_elements);
-    while (ctx->ancestor)
-      ne_ctx_pop(ctx);
-
-    /* Reset parser state to original state and seek back to old position. */
-    if (ne_ctx_restore(ctx, &state) != 0)
-      return -1;
-
-    if (r < 0)
+  if (!cues_node) {
+    ne_init_cue_points(ctx, max_offset);
+    cues_node = ctx->segment.cues.cue_point.head;
+    /* Verify cues have been added to context. */
+    if (!cues_node)
       return -1;
   }
 
+  nestegg_track_count(ctx, &track_count);
+
   tc_scale = ne_get_timecode_scale(ctx);
 
-  cue_point = ne_find_cue_point_for_tstamp(ctx->segment.cues.cue_point.head, tc_scale, tstamp);
-  if (!cue_point)
-    return -1;
+  while (cues_node && !range_obtained) {
+    assert(cues_node->id == ID_CUE_POINT);
+    cue_point = cues_node->data;
+    cue_pos_node = cue_point->cue_track_positions.head;
+    while (cue_pos_node) {
+      assert(cue_pos_node->id == ID_CUE_TRACK_POSITIONS);
+      pos = cue_pos_node->data;
+      for (track = 0; track < track_count; track++) {
+        if (ne_get_uint(pos->track, &track_number) != 0)
+          return -1;
 
-  node = cue_point->cue_track_positions.head;
+        if (ne_map_track_number_to_index(ctx, track_number, &track_index) != 0)
+          return -1;
 
-  seek_pos = 0;
-
-  while (node) {
-    assert(node->id == ID_CUE_TRACK_POSITIONS);
-    pos = node->data;
-    if (ne_get_uint(pos->track, &t) == 0 && t - 1 == track) {
-      if (ne_get_uint(pos->cluster_position, &seek_pos) != 0)
-        return -1;
-      break;
+        if (track_index == track) {
+          if (ne_get_uint(pos->cluster_position, &seek_pos) != 0)
+            return -1;
+          if (cluster_count == cluster_num) {
+            *start_pos = ctx->segment_offset+seek_pos;
+            if (ne_get_uint(cue_point->time, &time) != 0)
+              return -1;
+            *tstamp = time * tc_scale;
+          } else if (cluster_count == cluster_num+1) {
+            *end_pos = (ctx->segment_offset+seek_pos)-1;
+            range_obtained = 1;
+            break;
+          }
+          cluster_count++;
+        }
+      }
+      cue_pos_node = cue_pos_node->next;
     }
-    node = node->next;
+    cues_node = cues_node->next;
   }
 
+  return 0;
+}
+
+int
+nestegg_offset_seek(nestegg * ctx, uint64_t offset)
+{
+  int r;
+
   /* Seek and set up parser state for segment-level element (Cluster). */
-  r = ne_io_seek(ctx->io, ctx->segment_offset + seek_pos, NESTEGG_SEEK_SET);
+  r = ne_io_seek(ctx->io, offset, NESTEGG_SEEK_SET);
   if (r != 0)
     return -1;
-  ctx->last_id = 0;
-  ctx->last_size = 0;
+  ctx->last_valid = 0;
 
   while (ctx->ancestor)
     ne_ctx_pop(ctx);
 
   ne_ctx_push(ctx, ne_top_level_elements, ctx);
   ne_ctx_push(ctx, ne_segment_elements, &ctx->segment);
+
+  return 0;
+}
+
+int
+nestegg_track_seek(nestegg * ctx, unsigned int track, uint64_t tstamp)
+{
+  int r;
+  struct cue_point * cue_point;
+  struct cue_track_positions * pos;
+  uint64_t seek_pos, tc_scale;
+
+  /* If there are no cues loaded, check for cues element in the seek head
+     and load it. */
+  if (!ctx->segment.cues.cue_point.head) {
+    r = ne_init_cue_points(ctx, -1);
+    if (r != 0)
+      return -1;
+  }
+
+  tc_scale = ne_get_timecode_scale(ctx);
+
+  cue_point = ne_find_cue_point_for_tstamp(ctx, ctx->segment.cues.cue_point.head,
+                                           track, tc_scale, tstamp);
+  if (!cue_point)
+    return -1;
+
+  pos = ne_find_cue_position_for_track(ctx, cue_point->cue_track_positions.head, track);
+  if (pos == NULL)
+    return -1;
+
+  if (ne_get_uint(pos->cluster_position, &seek_pos) != 0)
+    return -1;
+
+  /* Seek and set up parser state for segment-level element (Cluster). */
+  r = nestegg_offset_seek(ctx, ctx->segment_offset + seek_pos);
   ctx->log(ctx, NESTEGG_LOG_DEBUG, "seek: parsing cluster elements");
-  r = ne_parse(ctx, NULL);
+  r = ne_parse(ctx, NULL, -1);
   if (r != 1)
     return -1;
 
@@ -1676,6 +1978,9 @@
   if (strcmp(codec_id, TRACK_ID_VORBIS) == 0)
     return NESTEGG_CODEC_VORBIS;
 
+  if (strcmp(codec_id, TRACK_ID_OPUS) == 0)
+    return NESTEGG_CODEC_OPUS;
+
   return -1;
 }
 
@@ -1728,34 +2033,40 @@
   if (!entry)
     return -1;
 
-  if (nestegg_track_codec_id(ctx, track) != NESTEGG_CODEC_VORBIS)
+  if (nestegg_track_codec_id(ctx, track) != NESTEGG_CODEC_VORBIS
+    && nestegg_track_codec_id(ctx, track) != NESTEGG_CODEC_OPUS)
     return -1;
 
   if (ne_get_binary(entry->codec_private, &codec_private) != 0)
     return -1;
 
-  p = codec_private.data;
-  count = *p++ + 1;
+  if (nestegg_track_codec_id(ctx, track) == NESTEGG_CODEC_VORBIS) {
+      p = codec_private.data;
+      count = *p++ + 1;
 
-  if (count > 3)
-    return -1;
+      if (count > 3)
+        return -1;
 
-  i = 0;
-  total = 0;
-  while (--count) {
-    sizes[i] = ne_xiph_lace_value(&p);
-    total += sizes[i];
-    i += 1;
+      i = 0;
+      total = 0;
+      while (--count) {
+        sizes[i] = ne_xiph_lace_value(&p);
+        total += sizes[i];
+        i += 1;
+      }
+      sizes[i] = codec_private.length - total - (p - codec_private.data);
+
+      for (i = 0; i < item; ++i) {
+        if (sizes[i] > LIMIT_FRAME)
+          return -1;
+        p += sizes[i];
+      }
+      *data = p;
+      *length = sizes[item];
+  } else {
+    *data = codec_private.data;
+    *length = codec_private.length;
   }
-  sizes[i] = codec_private.length - total - (p - codec_private.data);
-
-  for (i = 0; i < item; ++i) {
-    if (sizes[i] > LIMIT_FRAME)
-      return -1;
-    p += sizes[i];
-  }
-  *data = p;
-  *length = sizes[item];
 
   return 0;
 }
@@ -1776,37 +2087,43 @@
   if (nestegg_track_type(ctx, track) != NESTEGG_TRACK_VIDEO)
     return -1;
 
+  value = 0;
+  ne_get_uint(entry->video.stereo_mode, &value);
+  if (value <= NESTEGG_VIDEO_STEREO_TOP_BOTTOM ||
+      value == NESTEGG_VIDEO_STEREO_RIGHT_LEFT)
+    params->stereo_mode = value;
+
   if (ne_get_uint(entry->video.pixel_width, &value) != 0)
     return -1;
-  params->width = (unsigned int)value;
+  params->width = value;
 
   if (ne_get_uint(entry->video.pixel_height, &value) != 0)
     return -1;
-  params->height = (unsigned int)value;
+  params->height = value;
 
   value = 0;
   ne_get_uint(entry->video.pixel_crop_bottom, &value);
-  params->crop_bottom = (unsigned int)value;
+  params->crop_bottom = value;
 
   value = 0;
   ne_get_uint(entry->video.pixel_crop_top, &value);
-  params->crop_top = (unsigned int)value;
+  params->crop_top = value;
 
   value = 0;
   ne_get_uint(entry->video.pixel_crop_left, &value);
-  params->crop_left = (unsigned int)value;
+  params->crop_left = value;
 
   value = 0;
   ne_get_uint(entry->video.pixel_crop_right, &value);
-  params->crop_right = (unsigned int)value;
+  params->crop_right = value;
 
   value = params->width;
   ne_get_uint(entry->video.display_width, &value);
-  params->display_width = (unsigned int)value;
+  params->display_width = value;
 
   value = params->height;
   ne_get_uint(entry->video.display_height, &value);
-  params->display_height = (unsigned int)value;
+  params->display_height = value;
 
   return 0;
 }
@@ -1832,11 +2149,19 @@
 
   value = 1;
   ne_get_uint(entry->audio.channels, &value);
-  params->channels = (unsigned int)value;
+  params->channels = value;
 
   value = 16;
   ne_get_uint(entry->audio.bit_depth, &value);
-  params->depth = (unsigned int)value;
+  params->depth = value;
+
+  value = 0;
+  ne_get_uint(entry->codec_delay, &value);
+  params->codec_delay = value;
+
+  value = 0;
+  ne_get_uint(entry->seek_preroll, &value);
+  params->seek_preroll = value;
 
   return 0;
 }
@@ -1854,19 +2179,26 @@
     if (r != 1)
       return r;
 
-    /* any suspend fields must be handled here */
+    /* Any DESC_FLAG_SUSPEND fields must be handled here. */
     if (ne_is_suspend_element(id)) {
       r = ne_read_element(ctx, &id, &size);
       if (r != 1)
         return r;
 
-      /* the only suspend fields are blocks and simple blocks, which we
+      /* The only DESC_FLAG_SUSPEND fields are Blocks and SimpleBlocks, which we
          handle directly. */
       r = ne_read_block(ctx, id, size, pkt);
+      if (r != 1)
+        return r;
+
+      r = ne_read_discard_padding(ctx, *pkt);
+      if (r != 1)
+        return r;
+
       return r;
     }
 
-    r =  ne_parse(ctx, NULL);
+    r =  ne_parse(ctx, NULL, -1);
     if (r != 1)
       return r;
   }
@@ -1892,7 +2224,7 @@
 int
 nestegg_packet_track(nestegg_packet * pkt, unsigned int * track)
 {
-  *track = (unsigned int)pkt->track;
+  *track = pkt->track;
   return 0;
 }
 
@@ -1904,6 +2236,13 @@
 }
 
 int
+nestegg_packet_discard_padding(nestegg_packet * pkt, int64_t * discard_padding)
+{
+  *discard_padding = pkt->discard_padding; /* copy the value stored on pkt */
+  return 0; /* always 0; neither pointer is checked for NULL */
+}
+
+int
 nestegg_packet_count(nestegg_packet * pkt, unsigned int * count)
 {
   struct frame * f = pkt->frame;
@@ -1940,3 +2279,28 @@
 
   return -1;
 }
+
+int
+nestegg_has_cues(nestegg * ctx)
+{
+  if (ctx->segment.cues.cue_point.head) return 1; /* cues already loaded */
+  return ne_find_seek_for_id(ctx->segment.seek_head.head, ID_CUES) != NULL;
+}
+
+int
+nestegg_sniff(unsigned char const * buffer, size_t length)
+{
+  /* Run ne_match_webm over the caller's bytes via an in-memory nestegg_io. */
+  struct sniff_buffer sb;
+  nestegg_io mem_io;
+
+  sb.buffer = buffer;
+  sb.length = length;
+  sb.offset = 0;
+  mem_io.userdata = &sb;
+  mem_io.read = ne_buffer_read;
+  mem_io.seek = ne_buffer_seek;
+  mem_io.tell = ne_buffer_tell;
+  return ne_match_webm(mem_io, length);
+}
+
diff --git a/nestegg/test/test.c b/third_party/nestegg/test/test.c
similarity index 88%
rename from nestegg/test/test.c
rename to third_party/nestegg/test/test.c
index 210b640..cc0753d 100644
--- a/nestegg/test/test.c
+++ b/third_party/nestegg/test/test.c
@@ -15,9 +15,10 @@
 #define SEEK_TEST
 
 static int
-stdio_read(void * p, size_t length, void * fp)
+stdio_read(void * p, size_t length, void * file)
 {
   size_t r;
+  FILE * fp = file;
 
   r = fread(p, length, 1, fp);
   if (r == 0 && feof(fp))
@@ -26,8 +27,9 @@
 }
 
 static int
-stdio_seek(int64_t offset, int whence, void * fp)
+stdio_seek(int64_t offset, int whence, void * file)
 {
+  FILE * fp = file;
   return fseek(fp, offset, whence);
 }
 
@@ -43,7 +45,7 @@
   va_list ap;
   char const * sev = NULL;
 
-#ifndef DEBUG
+#if !defined(DEBUG)
   if (severity < NESTEGG_LOG_WARNING)
     return;
 #endif
@@ -102,32 +104,32 @@
   io.userdata = fp;
 
   ctx = NULL;
-  r = nestegg_init(&ctx, io, log_callback);
+  r = nestegg_init(&ctx, io, log_callback, -1);
   if (r != 0)
     return EXIT_FAILURE;
 
   nestegg_track_count(ctx, &tracks);
   nestegg_duration(ctx, &duration);
-#ifdef DEBUG
+#if defined(DEBUG)
   fprintf(stderr, "media has %u tracks and duration %fs\n", tracks, duration / 1e9);
 #endif
 
   for (i = 0; i < tracks; ++i) {
     type = nestegg_track_type(ctx, i);
-#ifdef DEBUG
+#if defined(DEBUG)
     fprintf(stderr, "track %u: type: %d codec: %d", i,
             type, nestegg_track_codec_id(ctx, i));
 #endif
     nestegg_track_codec_data_count(ctx, i, &data_items);
     for (j = 0; j < data_items; ++j) {
       nestegg_track_codec_data(ctx, i, j, &codec_data, &length);
-#ifdef DEBUG
+#if defined(DEBUG)
       fprintf(stderr, " (%p, %u)", codec_data, (unsigned int) length);
 #endif
     }
     if (type == NESTEGG_TRACK_VIDEO) {
       nestegg_track_video_params(ctx, i, &vparams);
-#ifdef DEBUG
+#if defined(DEBUG)
       fprintf(stderr, " video: %ux%u (d: %ux%u %ux%ux%ux%u)",
               vparams.width, vparams.height,
               vparams.display_width, vparams.display_height,
@@ -135,23 +137,23 @@
 #endif
     } else if (type == NESTEGG_TRACK_AUDIO) {
       nestegg_track_audio_params(ctx, i, &aparams);
-#ifdef DEBUG
+#if defined(DEBUG)
       fprintf(stderr, " audio: %.2fhz %u bit %u channels",
               aparams.rate, aparams.depth, aparams.channels);
 #endif
     }
-#ifdef DEBUG
+#if defined(DEBUG)
     fprintf(stderr, "\n");
 #endif
   }
 
-#ifdef SEEK_TEST
-#ifdef DEBUG
+#if defined(SEEK_TEST)
+#if defined(DEBUG)
   fprintf(stderr, "seek to middle\n");
 #endif
   r = nestegg_track_seek(ctx, 0, duration / 2);
   if (r == 0) {
-#ifdef DEBUG
+#if defined(DEBUG)
     fprintf(stderr, "middle ");
 #endif
     r = nestegg_read_packet(ctx, &pkt);
@@ -159,23 +161,23 @@
       nestegg_packet_track(pkt, &track);
       nestegg_packet_count(pkt, &cnt);
       nestegg_packet_tstamp(pkt, &tstamp);
-#ifdef DEBUG
+#if defined(DEBUG)
       fprintf(stderr, "* t %u pts %f frames %u\n", track, tstamp / 1e9, cnt);
 #endif
       nestegg_free_packet(pkt);
     } else {
-#ifdef DEBUG
+#if defined(DEBUG)
       fprintf(stderr, "middle seek failed\n");
 #endif
     }
   }
 
-#ifdef DEBUG
+#if defined(DEBUG)
   fprintf(stderr, "seek to ~end\n");
 #endif
   r = nestegg_track_seek(ctx, 0, duration - (duration / 10));
   if (r == 0) {
-#ifdef DEBUG
+#if defined(DEBUG)
     fprintf(stderr, "end ");
 #endif
     r = nestegg_read_packet(ctx, &pkt);
@@ -183,23 +185,23 @@
       nestegg_packet_track(pkt, &track);
       nestegg_packet_count(pkt, &cnt);
       nestegg_packet_tstamp(pkt, &tstamp);
-#ifdef DEBUG
+#if defined(DEBUG)
       fprintf(stderr, "* t %u pts %f frames %u\n", track, tstamp / 1e9, cnt);
 #endif
       nestegg_free_packet(pkt);
     } else {
-#ifdef DEBUG
+#if defined(DEBUG)
       fprintf(stderr, "end seek failed\n");
 #endif
     }
   }
 
-#ifdef DEBUG
+#if defined(DEBUG)
   fprintf(stderr, "seek to ~start\n");
 #endif
   r = nestegg_track_seek(ctx, 0, duration / 10);
   if (r == 0) {
-#ifdef DEBUG
+#if defined(DEBUG)
     fprintf(stderr, "start ");
 #endif
     r = nestegg_read_packet(ctx, &pkt);
@@ -207,12 +209,12 @@
       nestegg_packet_track(pkt, &track);
       nestegg_packet_count(pkt, &cnt);
       nestegg_packet_tstamp(pkt, &tstamp);
-#ifdef DEBUG
+#if defined(DEBUG)
       fprintf(stderr, "* t %u pts %f frames %u\n", track, tstamp / 1e9, cnt);
 #endif
       nestegg_free_packet(pkt);
     } else {
-#ifdef DEBUG
+#if defined(DEBUG)
       fprintf(stderr, "start seek failed\n");
 #endif
     }
@@ -224,17 +226,17 @@
     nestegg_packet_count(pkt, &pkt_cnt);
     nestegg_packet_tstamp(pkt, &pkt_tstamp);
 
-#ifdef DEBUG
+#if defined(DEBUG)
     fprintf(stderr, "t %u pts %f frames %u: ", pkt_track, pkt_tstamp / 1e9, pkt_cnt);
 #endif
 
     for (i = 0; i < pkt_cnt; ++i) {
       nestegg_packet_data(pkt, i, &ptr, &size);
-#ifdef DEBUG
+#if defined(DEBUG)
       fprintf(stderr, "%u ", (unsigned int) size);
 #endif
     }
-#ifdef DEBUG
+#if defined(DEBUG)
     fprintf(stderr, "\n");
 #endif
 
diff --git a/tools_common.c b/tools_common.c
index f0e1606..4f2ac74 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -8,13 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "tools_common.h"
-
+#include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
+#include "./tools_common.h"
+
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
 #include "vpx/vp8cx.h"
 #endif
@@ -77,26 +78,6 @@
   exit(EXIT_FAILURE);
 }
 
-uint16_t mem_get_le16(const void *data) {
-  uint16_t val;
-  const uint8_t *mem = (const uint8_t*)data;
-
-  val = mem[1] << 8;
-  val |= mem[0];
-  return val;
-}
-
-uint32_t mem_get_le32(const void *data) {
-  uint32_t val;
-  const uint8_t *mem = (const uint8_t*)data;
-
-  val = mem[3] << 24;
-  val |= mem[2] << 16;
-  val |= mem[1] << 8;
-  val |= mem[0];
-  return val;
-}
-
 int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
   FILE *f = input_ctx->file;
   struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
@@ -273,3 +254,14 @@
   return 1;
 }
 
+// TODO(dkovalev) change sse_to_psnr signature: double -> int64_t
+// Returns 10 * log10(samples * peak^2 / sse) capped at 100.0; a non-positive
+// (or NaN) sse also yields 100.0.
+double sse_to_psnr(double samples, double peak, double sse) {
+  static const double kMaxPSNR = 100.0;
+  double psnr;
+
+  if (!(sse > 0.0)) return kMaxPSNR;
+  psnr = 10.0 * log10(samples * peak * peak / sse);
+  return psnr > kMaxPSNR ? kMaxPSNR : psnr;
+}
diff --git a/tools_common.h b/tools_common.h
index 2e90259..58894de 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -118,9 +118,6 @@
 /* The tool including this file must define usage_exit() */
 void usage_exit();
 
-uint16_t mem_get_le16(const void *data);
-uint32_t mem_get_le32(const void *data);
-
 int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
 
 typedef struct VpxInterface {
@@ -145,6 +142,8 @@
 void vpx_img_write(const vpx_image_t *img, FILE *file);
 int vpx_img_read(vpx_image_t *img, FILE *file);
 
+double sse_to_psnr(double samples, double peak, double sse);
+
 #ifdef __cplusplus
 }  /* extern "C" */
 #endif
diff --git a/video_reader.c b/video_reader.c
index 4be7483..39c7edb 100644
--- a/video_reader.c
+++ b/video_reader.c
@@ -14,6 +14,8 @@
 #include "./ivfdec.h"
 #include "./video_reader.h"
 
+#include "vpx_ports/mem_ops.h"
+
 static const char *const kIVFSignature = "DKIF";
 
 struct VpxVideoReaderStruct {
diff --git a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
index dc84c30..3991957 100644
--- a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
@@ -53,7 +53,7 @@
     orr     r6, r6, r7          ; differences of all 4 pixels
     ; calculate total sum
     adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; substract negative differences from sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -77,7 +77,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -101,7 +101,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -127,7 +127,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
diff --git a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
index adc353d..915ee49 100644
--- a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
@@ -51,7 +51,7 @@
     orr     r8, r8, r10         ; differences of all 4 pixels
     ; calculate total sum
     add    r4, r4, r6           ; add positive differences to sum
-    sub    r4, r4, r7           ; substract negative differences from sum
+    sub    r4, r4, r7           ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r7, r8              ; byte (two pixels) to halfwords
@@ -77,7 +77,7 @@
 
     ; calculate total sum
     add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; substract negative differences from sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r7, r8              ; byte (two pixels) to halfwords
diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
index dd2ce68..3668dc5 100644
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -58,7 +58,7 @@
     orr     r6, r6, r7          ; differences of all 4 pixels
     ; calculate total sum
     adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; substract negative differences from sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -89,7 +89,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -120,7 +120,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -153,7 +153,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
index f972d9b..b4e0959 100644
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -69,7 +69,7 @@
     orr     r6, r6, r7          ; differences of all 4 pixels
     ; calculate total sum
     adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; substract negative differences from sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -111,7 +111,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -153,7 +153,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -195,7 +195,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
index f5da9c0..10863e2 100644
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -59,7 +59,7 @@
     orr     r6, r6, r7          ; differences of all 4 pixels
     ; calculate total sum
     adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; substract negative differences from sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -90,7 +90,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -121,7 +121,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -154,7 +154,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
diff --git a/vp8/common/arm/neon/copymem16x16_neon.asm b/vp8/common/arm/neon/copymem16x16_neon.asm
deleted file mode 100644
index bda4b96..0000000
--- a/vp8/common/arm/neon/copymem16x16_neon.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_copy_mem16x16_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem16x16_neon| PROC
-
-    vld1.u8     {q0}, [r0], r1
-    vld1.u8     {q1}, [r0], r1
-    vld1.u8     {q2}, [r0], r1
-    vst1.u8     {q0}, [r2], r3
-    vld1.u8     {q3}, [r0], r1
-    vst1.u8     {q1}, [r2], r3
-    vld1.u8     {q4}, [r0], r1
-    vst1.u8     {q2}, [r2], r3
-    vld1.u8     {q5}, [r0], r1
-    vst1.u8     {q3}, [r2], r3
-    vld1.u8     {q6}, [r0], r1
-    vst1.u8     {q4}, [r2], r3
-    vld1.u8     {q7}, [r0], r1
-    vst1.u8     {q5}, [r2], r3
-    vld1.u8     {q8}, [r0], r1
-    vst1.u8     {q6}, [r2], r3
-    vld1.u8     {q9}, [r0], r1
-    vst1.u8     {q7}, [r2], r3
-    vld1.u8     {q10}, [r0], r1
-    vst1.u8     {q8}, [r2], r3
-    vld1.u8     {q11}, [r0], r1
-    vst1.u8     {q9}, [r2], r3
-    vld1.u8     {q12}, [r0], r1
-    vst1.u8     {q10}, [r2], r3
-    vld1.u8     {q13}, [r0], r1
-    vst1.u8     {q11}, [r2], r3
-    vld1.u8     {q14}, [r0], r1
-    vst1.u8     {q12}, [r2], r3
-    vld1.u8     {q15}, [r0], r1
-    vst1.u8     {q13}, [r2], r3
-    vst1.u8     {q14}, [r2], r3
-    vst1.u8     {q15}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp8_copy_mem16x16_neon|
-
-    END
diff --git a/vp8/common/arm/neon/copymem8x4_neon.asm b/vp8/common/arm/neon/copymem8x4_neon.asm
deleted file mode 100644
index 35c0f67..0000000
--- a/vp8/common/arm/neon/copymem8x4_neon.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_copy_mem8x4_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem8x4_neon| PROC
-    vld1.u8     {d0}, [r0], r1
-    vld1.u8     {d1}, [r0], r1
-    vst1.u8     {d0}, [r2], r3
-    vld1.u8     {d2}, [r0], r1
-    vst1.u8     {d1}, [r2], r3
-    vld1.u8     {d3}, [r0], r1
-    vst1.u8     {d2}, [r2], r3
-    vst1.u8     {d3}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp8_copy_mem8x4_neon|
-
-    END
diff --git a/vp8/common/arm/neon/copymem8x8_neon.asm b/vp8/common/arm/neon/copymem8x8_neon.asm
deleted file mode 100644
index 1f5b941..0000000
--- a/vp8/common/arm/neon/copymem8x8_neon.asm
+++ /dev/null
@@ -1,43 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_copy_mem8x8_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem8x8_neon| PROC
-
-    vld1.u8     {d0}, [r0], r1
-    vld1.u8     {d1}, [r0], r1
-    vst1.u8     {d0}, [r2], r3
-    vld1.u8     {d2}, [r0], r1
-    vst1.u8     {d1}, [r2], r3
-    vld1.u8     {d3}, [r0], r1
-    vst1.u8     {d2}, [r2], r3
-    vld1.u8     {d4}, [r0], r1
-    vst1.u8     {d3}, [r2], r3
-    vld1.u8     {d5}, [r0], r1
-    vst1.u8     {d4}, [r2], r3
-    vld1.u8     {d6}, [r0], r1
-    vst1.u8     {d5}, [r2], r3
-    vld1.u8     {d7}, [r0], r1
-    vst1.u8     {d6}, [r2], r3
-    vst1.u8     {d7}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp8_copy_mem8x8_neon|
-
-    END
diff --git a/vp8/common/arm/neon/copymem_neon.c b/vp8/common/arm/neon/copymem_neon.c
new file mode 100644
index 0000000..deced11
--- /dev/null
+++ b/vp8/common/arm/neon/copymem_neon.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+/* Copies a block 8 bytes wide and 4 rows tall from src to dst. */
+void vp8_copy_mem8x4_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    int rows_left = 4;
+
+    while (rows_left-- > 0) {
+        /* One 8-byte vector load and store per row. */
+        vst1_u8(dst, vld1_u8(src));
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+/* Copies a block 8 bytes wide and 8 rows tall from src to dst. */
+void vp8_copy_mem8x8_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    int rows_left = 8;
+
+    while (rows_left-- > 0) {
+        /* One 8-byte vector load and store per row. */
+        vst1_u8(dst, vld1_u8(src));
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+/* Copies a block 16 bytes wide and 16 rows tall from src to dst. */
+void vp8_copy_mem16x16_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    int rows_left = 16;
+
+    while (rows_left-- > 0) {
+        /* One 16-byte (quad-register) vector load and store per row. */
+        vst1q_u8(dst, vld1q_u8(src));
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/vp8/common/arm/neon/dc_only_idct_add_neon.asm
deleted file mode 100644
index 79ff02c..0000000
--- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,54 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dc_only_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
-;                            int pred_stride, unsigned char *dst_ptr,
-;                            int dst_stride)
-
-; r0  input_dc
-; r1  pred_ptr
-; r2  pred_stride
-; r3  dst_ptr
-; sp  dst_stride
-
-|vp8_dc_only_idct_add_neon| PROC
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    ldr             r12, [sp]
-    vdup.16         q0, r0
-
-    vld1.32         {d2[0]}, [r1], r2
-    vld1.32         {d2[1]}, [r1], r2
-    vld1.32         {d4[0]}, [r1], r2
-    vld1.32         {d4[1]}, [r1]
-
-    vaddw.u8        q1, q0, d2
-    vaddw.u8        q2, q0, d4
-
-    vqmovun.s16     d2, q1
-    vqmovun.s16     d4, q2
-
-    vst1.32         {d2[0]}, [r3], r12
-    vst1.32         {d2[1]}, [r3], r12
-    vst1.32         {d4[0]}, [r3], r12
-    vst1.32         {d4[1]}, [r3]
-
-    bx              lr
-
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.c b/vp8/common/arm/neon/dc_only_idct_add_neon.c
new file mode 100644
index 0000000..ad5f41d
--- /dev/null
+++ b/vp8/common/arm/neon/dc_only_idct_add_neon.c
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+/* Adds the rounded DC term, (input_dc + 4) >> 3, to a 4x4 block of
+ * predictor bytes and stores the result, saturated to 8 bits, at dst_ptr. */
+void vp8_dc_only_idct_add_neon(
+        int16_t input_dc,
+        unsigned char *pred_ptr,
+        int pred_stride,
+        unsigned char *dst_ptr,
+        int dst_stride) {
+    const uint16_t dc = ((input_dc + 4) >> 3);
+    const uint16x8_t dc_vec = vdupq_n_u16(dc);
+    uint32x2_t rows = vdup_n_u32(0);
+    uint32x2_t out;
+    int pair;
+
+    /* Each pass handles two 4-byte rows packed into one 64-bit vector. */
+    for (pair = 0; pair < 2; ++pair) {
+        rows = vld1_lane_u32((const uint32_t *)pred_ptr, rows, 0);
+        pred_ptr += pred_stride;
+        rows = vld1_lane_u32((const uint32_t *)pred_ptr, rows, 1);
+        pred_ptr += pred_stride;
+
+        /* Widening add wraps mod 2^16; the signed narrow then clamps. */
+        out = vreinterpret_u32_u8(vqmovun_s16(vreinterpretq_s16_u16(
+                  vaddw_u8(dc_vec, vreinterpret_u8_u32(rows)))));
+        vst1_lane_u32((uint32_t *)dst_ptr, out, 0);
+        dst_ptr += dst_stride;
+        vst1_lane_u32((uint32_t *)dst_ptr, out, 1);
+        dst_ptr += dst_stride;
+    }
+}
diff --git a/vp8/common/arm/neon/dequant_idct_neon.asm b/vp8/common/arm/neon/dequant_idct_neon.asm
deleted file mode 100644
index 602cce6..0000000
--- a/vp8/common/arm/neon/dequant_idct_neon.asm
+++ /dev/null
@@ -1,131 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequant_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_idct_add_neon(short *input, short *dq,
-;                           unsigned char *dest, int stride)
-; r0    short *input,
-; r1    short *dq,
-; r2    unsigned char *dest
-; r3    int stride
-
-|vp8_dequant_idct_add_neon| PROC
-    vld1.16         {q3, q4}, [r0]
-    vld1.16         {q5, q6}, [r1]
-
-    add             r1, r2, r3              ; r1 = dest + stride
-    lsl             r3, #1                  ; 2x stride
-
-    vld1.32         {d14[0]}, [r2], r3
-    vld1.32         {d14[1]}, [r1], r3
-    vld1.32         {d15[0]}, [r2]
-    vld1.32         {d15[1]}, [r1]
-
-    adr             r12, cospi8sqrt2minus1  ; pointer to the first constant
-
-    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
-    vmul.i16        q2, q4, q6
-
-;|short_idct4x4llm_neon| PROC
-    vld1.16         {d0}, [r12]
-    vswp            d3, d4                  ;q2(vp[4] vp[12])
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-; memset(input, 0, 32) -- 32bytes
-    vmov.i16        q14, #0
-
-    vswp            d3, d4
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vmov            q15, q14
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vst1.16         {q14, q15}, [r0]
-
-    vrshr.s16       d2, d2, #3
-    vrshr.s16       d3, d3, #3
-    vrshr.s16       d4, d4, #3
-    vrshr.s16       d5, d5, #3
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vaddw.u8        q1, q1, d14
-    vaddw.u8        q2, q2, d15
-
-    sub             r2, r2, r3
-    sub             r1, r1, r3
-
-    vqmovun.s16     d0, q1
-    vqmovun.s16     d1, q2
-
-    vst1.32         {d0[0]}, [r2], r3
-    vst1.32         {d0[1]}, [r1], r3
-    vst1.32         {d1[0]}, [r2]
-    vst1.32         {d1[1]}, [r1]
-
-    bx             lr
-
-    ENDP           ; |vp8_dequant_idct_add_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b4e7b
-sinpi8sqrt2       DCD 0x8a8c8a8c
-
-    END
diff --git a/vp8/common/arm/neon/dequant_idct_neon.c b/vp8/common/arm/neon/dequant_idct_neon.c
new file mode 100644
index 0000000..58e1192
--- /dev/null
+++ b/vp8/common/arm/neon/dequant_idct_neon.c
@@ -0,0 +1,142 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;   /* 0x4e7b */
+static const int16_t sinpi8sqrt2       = -30068;  /* 35468 (0x8a8c) as int16_t */
+/* Dequantizes a 4x4 block, inverse-transforms it and adds it into dst. */
+void vp8_dequant_idct_add_neon(
+        int16_t *input,
+        int16_t *dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0;
+    int32x2_t d14, d15;
+    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+    int16x8_t q1, q2, q3, q4, q5, q6;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x2x2_t d2tmp0, d2tmp1;
+    int16x4x2_t d2tmp2, d2tmp3;
+
+    d14 = d15 = vdup_n_s32(0);
+
+    // load the 16 input coefficients, zeroing them in memory as we go
+    q3 = vld1q_s16(input);
+    vst1q_s16(input, qEmpty);
+    input += 8;
+    q4 = vld1q_s16(input);
+    vst1q_s16(input, qEmpty);
+
+    // load the 16 dq factors
+    q5 = vld1q_s16(dq);
+    dq += 8;
+    q6 = vld1q_s16(dq);
+
+    // load src from dst: four rows of 4 bytes, two rows per vector
+    dst0 = dst;
+    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
+    // dequantize: element-wise 16-bit multiply (low 16 bits kept)
+    q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
+                                         vreinterpretq_u16_s16(q5)));
+    q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
+                                         vreinterpretq_u16_s16(q6)));
+
+    d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));  // a1
+    d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));  // b1
+
+    q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
+
+    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+    // vqdmulh doubles the product; the shift by 1 below undoes that
+    q3 = vshrq_n_s16(q3, 1);
+    q4 = vshrq_n_s16(q4, 1);
+    // both factors act as (value - 1.0) in Q16 (sinpi8sqrt2 is negative as
+    q3 = vqaddq_s16(q3, q2);  // an int16_t), so q2 is added back to each
+    q4 = vqaddq_s16(q4, q2);
+
+    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));  // c1
+    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));  // d1
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+    // transpose the 4x4 intermediate (32-bit then 16-bit lane swaps)
+    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                      vreinterpret_s16_s32(d2tmp1.val[0]));
+    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                      vreinterpret_s16_s32(d2tmp1.val[1]));
+
+    // loop 2: the same butterfly, applied to the transposed data
+    q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
+
+    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);  // a1
+    d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);  // b1
+
+    q3 = vshrq_n_s16(q3, 1);
+    q4 = vshrq_n_s16(q4, 1);
+
+    q3 = vqaddq_s16(q3, q2);
+    q4 = vqaddq_s16(q4, q2);
+
+    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));  // c1
+    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));  // d1
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+    // rounding shift: (x + 4) >> 3
+    d2 = vrshr_n_s16(d2, 3);
+    d3 = vrshr_n_s16(d3, 3);
+    d4 = vrshr_n_s16(d4, 3);
+    d5 = vrshr_n_s16(d5, 3);
+    // transpose again
+    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                      vreinterpret_s16_s32(d2tmp1.val[0]));
+    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                      vreinterpret_s16_s32(d2tmp1.val[1]));
+
+    q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
+    q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
+    // add the bytes loaded from dst (widened to 16 bits) to the result
+    q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
+                                        vreinterpret_u8_s32(d14)));
+    q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
+                                        vreinterpret_u8_s32(d15)));
+    // narrow to 8 bits with unsigned saturation
+    d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
+    d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
+    // store the four 4-byte rows back to dst
+    dst0 = dst;
+    vst1_lane_s32((int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 1);
+    return;
+}
diff --git a/vp8/common/arm/neon/dequantizeb_neon.asm b/vp8/common/arm/neon/dequantizeb_neon.asm
deleted file mode 100644
index c8e0c31..0000000
--- a/vp8/common/arm/neon/dequantizeb_neon.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequantize_b_loop_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    short *Q,
-; r1    short *DQC
-; r2    short *DQ
-|vp8_dequantize_b_loop_neon| PROC
-    vld1.16         {q0, q1}, [r0]
-    vld1.16         {q2, q3}, [r1]
-
-    vmul.i16        q4, q0, q2
-    vmul.i16        q5, q1, q3
-
-    vst1.16         {q4, q5}, [r2]
-
-    bx             lr
-
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/dequantizeb_neon.c b/vp8/common/arm/neon/dequantizeb_neon.c
new file mode 100644
index 0000000..60f69c8
--- /dev/null
+++ b/vp8/common/arm/neon/dequantizeb_neon.c
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+/* Computes DQ[i] = Q[i] * DQC[i] (16-bit, wrapping) for i = 0..15. */
+void vp8_dequantize_b_loop_neon(
+        int16_t *Q,
+        int16_t *DQC,
+        int16_t *DQ) {
+    const int16x8x2_t coeffs  = vld2q_s16(Q);
+    const int16x8x2_t factors = vld2q_s16(DQC);
+    int16x8x2_t product;
+
+    /* Both inputs are de-interleaved identically; vst2q re-interleaves, so
+     * the element order seen by the caller is unchanged. */
+    product.val[0] = vmulq_s16(coeffs.val[0], factors.val[0]);
+    product.val[1] = vmulq_s16(coeffs.val[1], factors.val[1]);
+    vst2q_s16(DQ, product);
+}
diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
index f388d24..88a07b9 100644
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -527,7 +527,7 @@
         pxor        mm7,        [GLOBAL(t80)]       ; unoffset
         ; mm7 = q1
 
-        ; tranpose and write back
+        ; transpose and write back
         ; mm1 =    72 62 52 42 32 22 12 02
         ; mm6 =    73 63 53 43 33 23 13 03
         ; mm3 =    74 64 54 44 34 24 14 04
@@ -1289,7 +1289,7 @@
         pxor        mm6, [GLOBAL(t80)]          ; mm6 = 71 61 51 41 31 21 11 01
         pxor        mm3, [GLOBAL(t80)]          ; mm3 = 76 66 56 46 36 26 15 06
 
-        ; tranpose and write back
+        ; transpose and write back
         movq        mm0,    [rdx]               ; mm0 = 70 60 50 40 30 20 10 00
         movq        mm1,    mm0                 ; mm0 = 70 60 50 40 30 20 10 00
 
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index a66753b..1913abc 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -958,7 +958,7 @@
         ; start work on filters
         B_FILTER 2
 
-        ; tranpose and write back - only work on q1, q0, p0, p1
+        ; transpose and write back - only work on q1, q0, p0, p1
         BV_TRANSPOSE
         ; store 16-line result
 
@@ -1023,7 +1023,7 @@
         ; start work on filters
         B_FILTER 2
 
-        ; tranpose and write back - only work on q1, q0, p0, p1
+        ; transpose and write back - only work on q1, q0, p0, p1
         BV_TRANSPOSE
 
         lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index afcda9f..98e5a71 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -2444,10 +2444,10 @@
         find_next_key_frame(cpi, &this_frame_copy);
 
         /* Special case: Error error_resilient_mode mode does not make much
-         * sense for two pass but with its current meaning but this code is
+         * sense for two pass but with its current meaning this code is
          * designed to stop outlandish behaviour if someone does set it when
          * using two pass. It effectively disables GF groups. This is
-         * temporary code till we decide what should really happen in this
+         * temporary code until we decide what should really happen in this
          * case.
          */
         if (cpi->oxcf.error_resilient_mode)
@@ -2773,7 +2773,7 @@
         kf_group_intra_err += this_frame->intra_error;
         kf_group_coded_err += this_frame->coded_error;
 
-        /* load a the next frame's stats */
+        /* Load the next frame's stats. */
         vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
         input_stats(cpi, this_frame);
 
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 07138ec..849a0ed 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -19,7 +19,7 @@
 #include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
-#include "psnr.h"
+#include "vpx/internal/vpx_psnr.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/extend.h"
 #include "ratectrl.h"
@@ -2170,10 +2170,12 @@
                                               8.0 / 1000.0  / time_encoded;
                         double samples = 3.0 / 2 * cpi->frames_in_layer[i] *
                                          lst_yv12->y_width * lst_yv12->y_height;
-                        double total_psnr = vp8_mse2psnr(samples, 255.0,
-                                                  cpi->total_error2[i]);
-                        double total_psnr2 = vp8_mse2psnr(samples, 255.0,
-                                                  cpi->total_error2_p[i]);
+                        double total_psnr =
+                            vpx_sse_to_psnr(samples, 255.0,
+                                            cpi->total_error2[i]);
+                        double total_psnr2 =
+                            vpx_sse_to_psnr(samples, 255.0,
+                                            cpi->total_error2_p[i]);
                         double total_ssim = 100 * pow(cpi->sum_ssim[i] /
                                                       cpi->sum_weights[i], 8.0);
 
@@ -2190,9 +2192,9 @@
                 {
                     double samples = 3.0 / 2 * cpi->count *
                                         lst_yv12->y_width * lst_yv12->y_height;
-                    double total_psnr = vp8_mse2psnr(samples, 255.0,
-                                                         cpi->total_sq_error);
-                    double total_psnr2 = vp8_mse2psnr(samples, 255.0,
+                    double total_psnr = vpx_sse_to_psnr(samples, 255.0,
+                                                        cpi->total_sq_error);
+                    double total_psnr2 = vpx_sse_to_psnr(samples, 255.0,
                                                          cpi->total_sq_error2);
                     double total_ssim = 100 * pow(cpi->summed_quality /
                                                       cpi->summed_weights, 8.0);
@@ -2522,8 +2524,8 @@
     pkt.data.psnr.samples[3] = width * height;
 
     for (i = 0; i < 4; i++)
-        pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0,
-                                             (double)(pkt.data.psnr.sse[i]));
+        pkt.data.psnr.psnr[i] = vpx_sse_to_psnr(pkt.data.psnr.samples[i], 255.0,
+                                                (double)(pkt.data.psnr.sse[i]));
 
     vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
 }
@@ -2681,8 +2683,8 @@
     VP8_COMMON *cm = &cpi->common;
 
     /* Do we need to apply resampling for one pass cbr.
-     * In one pass this is more limited than in two pass cbr
-     * The test and any change is only made one per key frame sequence
+     * In one pass this is more limited than in two pass cbr.
+     * The test and any change is only made once per key frame sequence.
      */
     if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER))
     {
@@ -2705,7 +2707,7 @@
             cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
         }
 
-        /* Get the new hieght and width */
+        /* Get the new height and width */
         Scale2Ratio(cm->horiz_scale, &hr, &hs);
         Scale2Ratio(cm->vert_scale, &vr, &vs);
         new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
@@ -5284,11 +5286,11 @@
 
                 sq_error = (double)(ye + ue + ve);
 
-                frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
+                frame_psnr = vpx_sse_to_psnr(t_samples, 255.0, sq_error);
 
-                cpi->total_y += vp8_mse2psnr(y_samples, 255.0, (double)ye);
-                cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, (double)ue);
-                cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, (double)ve);
+                cpi->total_y += vpx_sse_to_psnr(y_samples, 255.0, (double)ye);
+                cpi->total_u += vpx_sse_to_psnr(uv_samples, 255.0, (double)ue);
+                cpi->total_v += vpx_sse_to_psnr(uv_samples, 255.0, (double)ve);
                 cpi->total_sq_error += sq_error;
                 cpi->total  += frame_psnr;
 #if CONFIG_POSTPROC
@@ -5311,14 +5313,14 @@
 
                     sq_error2 = (double)(ye + ue + ve);
 
-                    frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error2);
+                    frame_psnr2 = vpx_sse_to_psnr(t_samples, 255.0, sq_error2);
 
-                    cpi->totalp_y += vp8_mse2psnr(y_samples,
-                                                  255.0, (double)ye);
-                    cpi->totalp_u += vp8_mse2psnr(uv_samples,
-                                                  255.0, (double)ue);
-                    cpi->totalp_v += vp8_mse2psnr(uv_samples,
-                                                  255.0, (double)ve);
+                    cpi->totalp_y += vpx_sse_to_psnr(y_samples,
+                                                     255.0, (double)ye);
+                    cpi->totalp_u += vpx_sse_to_psnr(uv_samples,
+                                                     255.0, (double)ue);
+                    cpi->totalp_v += vpx_sse_to_psnr(uv_samples,
+                                                     255.0, (double)ve);
                     cpi->total_sq_error2 += sq_error2;
                     cpi->totalp  += frame_psnr2;
 
diff --git a/vp8/encoder/psnr.c b/vp8/encoder/psnr.c
deleted file mode 100644
index b3a3d95..0000000
--- a/vp8/encoder/psnr.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/yv12config.h"
-#include "math.h"
-#include "vp8/common/systemdependent.h" /* for vp8_clear_system_state() */
-
-#define MAX_PSNR 100
-
-double vp8_mse2psnr(double Samples, double Peak, double Mse)
-{
-    double psnr;
-
-    if ((double)Mse > 0.0)
-        psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
-    else
-        psnr = MAX_PSNR;      /* Limit to prevent / 0 */
-
-    if (psnr > MAX_PSNR)
-        psnr = MAX_PSNR;
-
-    return psnr;
-}
diff --git a/vp8/encoder/psnr.h b/vp8/encoder/psnr.h
deleted file mode 100644
index 0c6c088..0000000
--- a/vp8/encoder/psnr.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP8_ENCODER_PSNR_H_
-#define VP8_ENCODER_PSNR_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern double vp8_mse2psnr(double Samples, double Peak, double Mse);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP8_ENCODER_PSNR_H_
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 7e3af71..513b2bf 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -16,7 +16,6 @@
 #include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
-#include "psnr.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/extend.h"
 #include "ratectrl.h"
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index a2127c9..ac91d7a 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -159,10 +159,6 @@
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
 
 # common (neon)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/iwalsh_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfilter_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
@@ -177,10 +173,8 @@
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict16x16_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/save_reg_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_blk_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/variance_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
@@ -189,6 +183,10 @@
 
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon.c
 
 
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index ce789e2..4c896b1 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -1265,6 +1265,7 @@
         "vp8.fpf"           /* first pass filename */
 #endif
         VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
+        {0},                /* ss_target_bitrate */
         1,                  /* ts_number_layers */
         {0},                /* ts_target_bitrate */
         {0},                /* ts_rate_decimator */
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 871b8d3..0b4c4cb 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -929,6 +929,7 @@
         vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
         vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
         vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
+        NOT_IMPLEMENTED,
     },
     { /* encoder functions */
         NOT_IMPLEMENTED,
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index cd091f3..d7c6dd1 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -50,7 +50,6 @@
 VP8_CX_SRCS-yes += encoder/modecosts.h
 VP8_CX_SRCS-yes += encoder/onyx_int.h
 VP8_CX_SRCS-yes += encoder/pickinter.h
-VP8_CX_SRCS-yes += encoder/psnr.h
 VP8_CX_SRCS-yes += encoder/quantize.h
 VP8_CX_SRCS-yes += encoder/ratectrl.h
 VP8_CX_SRCS-yes += encoder/rdopt.h
@@ -61,7 +60,6 @@
 VP8_CX_SRCS-yes += encoder/onyx_if.c
 VP8_CX_SRCS-yes += encoder/pickinter.c
 VP8_CX_SRCS-yes += encoder/picklpf.c
-VP8_CX_SRCS-yes += encoder/psnr.c
 VP8_CX_SRCS-yes += encoder/quantize.c
 VP8_CX_SRCS-yes += encoder/ratectrl.c
 VP8_CX_SRCS-yes += encoder/rdopt.c
diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
index 388a7d7..72e933e 100644
--- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
@@ -72,7 +72,7 @@
     ;   reg1 = output[first_offset]
     ;   reg2 = output[second_offset]
     ;   for proper address calculation, the last offset used when manipulating
-    ;   output, wethere reading or storing) must be passed in. use 0 for first
+    ;   output (whether reading or storing) must be passed in. use 0 for first
     ;   use.
     MACRO
     LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
@@ -88,7 +88,7 @@
     ;   output[first_offset] = reg1
     ;   output[second_offset] = reg2
     ;   for proper address calculation, the last offset used when manipulating
-    ;   output, wethere reading or storing) must be passed in. use 0 for first
+    ;   output (whether reading or storing) must be passed in. use 0 for first
     ;   use.
     MACRO
     STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
@@ -242,7 +242,7 @@
     ; TODO(cd): have special case to re-use constants when they are similar for
     ;           consecutive butterflies
     ; TODO(cd): have special case when both constants are the same, do the
-    ;           additions/substractions before the multiplies.
+    ;           additions/subtractions before the multiplies.
     ; generate the constants
     ;   generate scalar constants
     mov             r8,  #$first_constant  & 0xFF00
@@ -260,7 +260,7 @@
     vmull.s16 q11, $regB, d31
     vmull.s16 q12, $regC, d31
     ; (used) five for intermediate (q8-q12), one for constants (q15)
-    ; do some addition/substractions (to get back two register)
+    ; do some additions/subtractions (to get back two registers)
     vsub.s32  q8, q8, q10
     vsub.s32  q9, q9, q11
     ; do more multiplications (ordered for maximum latency hiding)
@@ -268,7 +268,7 @@
     vmull.s16 q11, $regA, d30
     vmull.s16 q15, $regB, d30
     ; (used) six for intermediate (q8-q12, q15)
-    ; do more addition/substractions
+    ; do more additions/subtractions
     vadd.s32  q11, q12, q11
     vadd.s32  q10, q10, q15
     ; (used) four for intermediate (q8-q11)
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
index 93d3af3..b41f566 100644
--- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
@@ -576,6 +576,7 @@
     vld1.s16        {q14,q15}, [r0]!
 
     push            {r0-r10}
+    vpush           {d8-d15}
 
     ; transpose the input data
     TRANSPOSE8X8
@@ -636,6 +637,7 @@
     IADST8X8_1D
 
 end_vp9_iht8x8_64_add_neon
+    vpop           {d8-d15}
     pop            {r0-r10}
 
     ; ROUND_POWER_OF_TWO(temp_out[j], 5)
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 4a49964..dc9856f 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -315,8 +315,8 @@
     vdup.u16            q2, r2
     vadd.s16            q1, q1, q3
     vadd.s16            q2, q2, q3
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vqmovun.s16         d0, q1
+    vqmovun.s16         d1, q2
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d1[0]}, [r0], r1
 
@@ -327,8 +327,8 @@
     vdup.u16            q2, r2
     vadd.s16            q1, q1, q3
     vadd.s16            q2, q2, q3
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vqmovun.s16         d0, q1
+    vqmovun.s16         d1, q2
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d1[0]}, [r0], r1
     bx                  lr
@@ -372,10 +372,10 @@
     vadd.s16            q8, q3, q8
     vadd.s16            q9, q3, q9
 
-    vqshrun.s16         d0, q0, #0
-    vqshrun.s16         d1, q1, #0
-    vqshrun.s16         d2, q8, #0
-    vqshrun.s16         d3, q9, #0
+    vqmovun.s16         d0, q0
+    vqmovun.s16         d1, q1
+    vqmovun.s16         d2, q8
+    vqmovun.s16         d3, q9
 
     vst1.64             {d0}, [r0], r1
     vst1.64             {d1}, [r0], r1
@@ -394,10 +394,10 @@
     vadd.s16            q8, q3, q8
     vadd.s16            q9, q3, q9
 
-    vqshrun.s16         d0, q0, #0
-    vqshrun.s16         d1, q1, #0
-    vqshrun.s16         d2, q8, #0
-    vqshrun.s16         d3, q9, #0
+    vqmovun.s16         d0, q0
+    vqmovun.s16         d1, q1
+    vqmovun.s16         d2, q8
+    vqmovun.s16         d3, q9
 
     vst1.64             {d0}, [r0], r1
     vst1.64             {d1}, [r0], r1
@@ -445,10 +445,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vdup.16             q0, d20[2]                  ; proload next 2 rows data
     vdup.16             q8, d20[3]
     vst1.64             {d2,d3}, [r0], r1
@@ -459,10 +459,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vdup.16             q0, d21[0]                  ; proload next 2 rows data
     vdup.16             q8, d21[1]
     vst1.64             {d2,d3}, [r0], r1
@@ -472,10 +472,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vdup.16             q0, d21[2]                  ; proload next 2 rows data
     vdup.16             q8, d21[3]
     vst1.64             {d2,d3}, [r0], r1
@@ -486,12 +486,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
-    vdup.16             q0, d20[2]
-    vdup.16             q8, d20[3]
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vld1.8              {d18}, [r3]!                  ; preload 8 left into r12
     vmovl.u8            q10, d18
     vst1.64             {d2,d3}, [r0], r1
@@ -544,19 +542,19 @@
     vadd.s16            q13, q0, q9
     vadd.s16            q14, q0, q10
     vadd.s16            q15, q0, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vdup.16             q1, d6[2]
     vdup.16             q2, d6[3]
     vst1.64             {d24-d27}, [r0], r1
@@ -566,19 +564,19 @@
     vadd.s16            q13, q1, q9
     vadd.s16            q14, q1, q10
     vadd.s16            q15, q1, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vdup.16             q0, d7[0]
     vdup.16             q2, d7[1]
     vst1.64             {d24-d27}, [r0], r1
@@ -588,19 +586,19 @@
     vadd.s16            q13, q0, q9
     vadd.s16            q14, q0, q10
     vadd.s16            q15, q0, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vdup.16             q0, d7[2]
     vdup.16             q2, d7[3]
     vst1.64             {d24-d27}, [r0], r1
@@ -610,20 +608,20 @@
     vadd.s16            q13, q0, q9
     vadd.s16            q14, q0, q10
     vadd.s16            q15, q0, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
     vld1.8              {d0}, [r3]!                   ; preload 8 left pixels
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vmovl.u8            q3, d0
     vst1.64             {d24-d27}, [r0], r1
 
diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c
deleted file mode 100644
index 536febb..0000000
--- a/vp9/common/generic/vp9_systemdependent.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_onyxc_int.h"
-
-void vp9_machine_specific_config(VP9_COMMON *cm) {
-  (void)cm;
-  vp9_rtcd();
-}
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 6f77199..ff4b7c1 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -201,7 +201,7 @@
 }
 
 void vp9_create_common(VP9_COMMON *cm) {
-  vp9_machine_specific_config(cm);
+  vp9_rtcd();
 }
 
 void vp9_remove_common(VP9_COMMON *cm) {
diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c
index d918bed..e1d1318 100644
--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -16,8 +16,7 @@
     if (!left_mi || is_inter_block(&left_mi->mbmi))
       return DC_PRED;
 
-    return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
-                                             : left_mi->mbmi.mode;
+    return get_y_mode(left_mi, b + 1);
   } else {
     assert(b == 1 || b == 3);
     return cur_mi->bmi[b - 1].as_mode;
@@ -30,8 +29,7 @@
     if (!above_mi || is_inter_block(&above_mi->mbmi))
       return DC_PRED;
 
-    return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
-                                              : above_mi->mbmi.mode;
+    return get_y_mode(above_mi, b + 2);
   } else {
     assert(b == 2 || b == 3);
     return cur_mi->bmi[b - 2].as_mode;
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 6086323..2a0ebfb 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -144,6 +144,11 @@
   b_mode_info bmi[4];
 } MODE_INFO;
 
+static INLINE MB_PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
+  return mi->mbmi.sb_type < BLOCK_8X8 ? mi->bmi[block].as_mode
+                                      : mi->mbmi.mode;
+}
+
 static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
   return mbmi->ref_frame[0] > INTRA_FRAME;
 }
@@ -255,13 +260,11 @@
 static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
                                       const MACROBLOCKD *xd, int ib) {
   const MODE_INFO *const mi = xd->mi_8x8[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
+  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
     return DCT_DCT;
 
-  return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ? mi->bmi[ib].as_mode
-                                                 : mbmi->mode];
+  return mode2txfm_map[get_y_mode(mi, ib)];
 }
 
 static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
@@ -328,13 +331,6 @@
                       BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
                       int aoff, int loff);
 
-
-static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
-                             TX_SIZE tx_size) {
-  const int eob_max = 16 << (tx_size << 1);
-  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 13e954e..bc12f9a 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -16,7 +16,7 @@
 #include "vpx/vpx_integer.h"
 
 
-DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]) = {
+const uint8_t vp9_coefband_trans_8x8plus[1024] = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
   4, 4, 4, 4, 4, 5,
   // beyond MAXBAND_INDEX+1 all values are filled as 5
@@ -85,11 +85,11 @@
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 };
 
-DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]) = {
+const uint8_t vp9_coefband_trans_4x4[16] = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
 };
 
-DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]) = {
+const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = {
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index d6b380f..bd5086a 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -42,7 +42,7 @@
 
 #define ENTROPY_NODES 11
 
-extern DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]);
 
 #define EOB_MODEL_TOKEN 3
 extern const vp9_tree_index vp9_coefmodel_tree[];
@@ -116,8 +116,8 @@
 // This macro is currently unused but may be used by certain implementations
 #define MAXBAND_INDEX 21
 
-extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]);
-extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]);
 
 static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
   return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
@@ -177,13 +177,11 @@
 static const INLINE scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
                                          PLANE_TYPE type, int block_idx) {
   const MODE_INFO *const mi = xd->mi_8x8[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-  if (is_inter_block(mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
+  if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
     return &vp9_default_scan_orders[tx_size];
   } else {
-    const MB_PREDICTION_MODE mode =
-        mbmi->sb_type < BLOCK_8X8 ? mi->bmi[block_idx].as_mode : mbmi->mode;
+    const MB_PREDICTION_MODE mode = get_y_mode(mi, block_idx);
     return &vp9_scan_orders[tx_size][mode2txfm_map[mode]];
   }
 }
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 25cba7f..8921539 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -465,8 +465,10 @@
     cm->frame_contexts[cm->frame_context_idx] = cm->fc;
   }
 
-  vpx_memset(cm->prev_mip, 0,
-             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
+  if (frame_is_intra_only(cm))
+    vpx_memset(cm->prev_mip, 0,
+               cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
+
   vpx_memset(cm->mip, 0,
              cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
 
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 546f603..7474a88 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -10,12 +10,9 @@
 
 #include <assert.h>
 
-#include "vpx_ports/mem.h"
-
 #include "vp9/common/vp9_filter.h"
 
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_bilinear_filters[SUBPEL_SHIFTS]) = {
+const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = {
   { 0, 0, 0, 128,   0, 0, 0, 0 },
   { 0, 0, 0, 120,   8, 0, 0, 0 },
   { 0, 0, 0, 112,  16, 0, 0, 0 },
@@ -35,8 +32,7 @@
 };
 
 // Lagrangian interpolation filter
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS] = {
   { 0,   0,   0, 128,   0,   0,   0,  0},
   { 0,   1,  -5, 126,   8,  -3,   1,  0},
   { -1,   3, -10, 122,  18,  -6,   2,  0},
@@ -56,8 +52,7 @@
 };
 
 // DCT based filter
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
+const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS] = {
   {0,   0,   0, 128,   0,   0,   0, 0},
   {-1,   3,  -7, 127,   8,  -3,   1, 0},
   {-2,   5, -13, 125,  17,  -6,   3, -1},
@@ -77,8 +72,7 @@
 };
 
 // freqmultiplier = 0.5
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
+const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS] = {
   { 0,  0,  0, 128,  0,  0,  0,  0},
   {-3, -1, 32,  64, 38,  1, -3,  0},
   {-2, -2, 29,  63, 41,  2, -3,  0},
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 15610d7..29d3867 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -13,6 +13,8 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
 
 #ifdef __cplusplus
 extern "C" {
@@ -37,10 +39,14 @@
 
 const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter);
 
-extern const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS];
-extern const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
-extern const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
-extern const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
+DECLARE_ALIGNED(256, extern const InterpKernel,
+                vp9_bilinear_filters[SUBPEL_SHIFTS]);
+DECLARE_ALIGNED(256, extern const InterpKernel,
+                vp9_sub_pel_filters_8[SUBPEL_SHIFTS]);
+DECLARE_ALIGNED(256, extern const InterpKernel,
+                vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]);
+DECLARE_ALIGNED(256, extern const InterpKernel,
+                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]);
 
 // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
 // filter kernel as a 2 tap filter.
diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c
index d903ed6..dffeb8a 100644
--- a/vp9/common/vp9_frame_buffers.c
+++ b/vp9/common/vp9_frame_buffers.c
@@ -42,7 +42,7 @@
   int i;
   InternalFrameBufferList *const int_fb_list =
       (InternalFrameBufferList *)cb_priv;
-  if (int_fb_list == NULL || fb == NULL)
+  if (int_fb_list == NULL)
     return -1;
 
   // Find a free frame buffer.
@@ -73,12 +73,8 @@
 }
 
 int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
-  InternalFrameBuffer *int_fb;
+  InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
   (void)cb_priv;
-  if (fb == NULL)
-    return -1;
-
-  int_fb = (InternalFrameBuffer *)fb->priv;
   int_fb->in_use = 0;
   return 0;
 }
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 04f8934..868a66a 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -262,9 +262,9 @@
     int lvl_seg = default_filt_lvl;
     if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
       const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
-      lvl_seg = seg->abs_delta == SEGMENT_ABSDATA
-                  ? data
-                  : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER);
+      lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ?
+                      data : default_filt_lvl + data,
+                      0, MAX_LOOP_FILTER);
     }
 
     if (!lf->mode_ref_delta_enabled) {
@@ -868,7 +868,6 @@
   assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
 }
 
-#if CONFIG_NON420
 static uint8_t build_lfi(const loop_filter_info_n *lfi_n,
                      const MB_MODE_INFO *mbmi) {
   const int seg = mbmi->segment_id;
@@ -1046,7 +1045,6 @@
     dst->buf += 8 * dst->stride;
   }
 }
-#endif
 
 void vp9_filter_block_plane(VP9_COMMON *const cm,
                             struct macroblockd_plane *const plane,
@@ -1206,10 +1204,8 @@
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
   LOOP_FILTER_MASK lfm;
-#if CONFIG_NON420
   int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
       xd->plane[1].subsampling_x == 1);
-#endif
 
   for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
     MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride;
@@ -1220,22 +1216,16 @@
       setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
-#if CONFIG_NON420
       if (use_420)
-#endif
         vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col,
                        cm->mode_info_stride, &lfm);
 
       for (plane = 0; plane < num_planes; ++plane) {
-#if CONFIG_NON420
         if (use_420)
-#endif
           vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
-#if CONFIG_NON420
         else
           filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
                                     mi_row, mi_col);
-#endif
       }
     }
   }
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index ff02622..e5f3fed 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -186,17 +186,17 @@
 
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
-void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                          const TileInfo *const tile,
-                          MODE_INFO *mi, const MODE_INFO *prev_mi,
-                          MV_REFERENCE_FRAME ref_frame,
-                          int_mv *mv_ref_list,
-                          int block_idx,
-                          int mi_row, int mi_col) {
+static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                             const TileInfo *const tile,
+                             MODE_INFO *mi, const MODE_INFO *prev_mi,
+                             MV_REFERENCE_FRAME ref_frame,
+                             int_mv *mv_ref_list,
+                             int block_idx, int mi_row, int mi_col) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
-  const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
+  const MB_MODE_INFO *const prev_mbmi = cm->coding_use_prev_mi && prev_mi ?
+      &prev_mi->mbmi : NULL;
   int different_ref_found = 0;
   int context_counter = 0;
 
@@ -290,6 +290,16 @@
     clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
 }
 
+void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                                    const TileInfo *const tile,
+                                    MODE_INFO *mi, const MODE_INFO *prev_mi,
+                                    MV_REFERENCE_FRAME ref_frame,
+                                    int_mv *mv_ref_list,
+                                    int mi_row, int mi_col) {
+  find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, mv_ref_list, -1,
+                   mi_row, mi_col);
+}
+
 static void lower_mv_precision(MV *mv, int allow_hp) {
   const int use_hp = allow_hp && vp9_use_mv_hp(mv);
   if (!use_hp) {
@@ -324,8 +334,8 @@
 
   assert(MAX_MV_REF_CANDIDATES == 2);
 
-  vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref],
-                       mv_list, block, mi_row, mi_col);
+  find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref],
+                   mv_list, block, mi_row, mi_col);
 
   near->as_int = 0;
   switch (block) {
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index f99952f..04cb000 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -17,35 +17,9 @@
 extern "C" {
 #endif
 
-
-void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                          const TileInfo *const tile,
-                          MODE_INFO *mi, const MODE_INFO *prev_mi,
-                          MV_REFERENCE_FRAME ref_frame,
-                          int_mv *mv_ref_list,
-                          int block_idx,
-                          int mi_row, int mi_col);
-
-static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                                    const TileInfo *const tile,
-                                    MODE_INFO *mi, const MODE_INFO *prev_mi,
-                                    MV_REFERENCE_FRAME ref_frame,
-                                    int_mv *mv_ref_list,
-                                    int mi_row, int mi_col) {
-  vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame,
-                       mv_ref_list, -1, mi_row, mi_col);
-}
-
-#define LEFT_TOP_MARGIN     ((VP9_ENC_BORDER_IN_PIXELS  \
-                            - VP9_INTERP_EXTEND) << 3)
-#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS  \
-                            - VP9_INTERP_EXTEND) << 3)
-
-// check a list of motion vectors by sad score using a number rows of pixels
-// above and a number cols of pixels in the left to select the one with best
-// score to use as ref motion vector
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
-                           int_mv *mvlist, int_mv *nearest, int_mv *near);
+#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\
+                                VP9_INTERP_EXTEND) << 3)
 
 // TODO(jingning): this mv clamping function should be block size dependent.
 static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
@@ -55,6 +29,19 @@
                xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
+void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                      const TileInfo *const tile,
+                      MODE_INFO *mi, const MODE_INFO *prev_mi,
+                      MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list,
+                      int mi_row, int mi_col);
+
+// Check a list of motion vectors by SAD score, using a number of rows of
+// pixels above and a number of columns of pixels to the left, to select the
+// one with the best score to use as the reference motion vector.
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
+                           int_mv *mvlist, int_mv *nearest, int_mv *near);
+
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
                                    const TileInfo *const tile,
                                    int block, int ref, int mi_row, int mi_col,
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index ac39a98..2220868 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -112,7 +112,6 @@
     int auto_key;  // autodetect cut scenes and set the keyframes
     int key_freq;  // maximum distance to key frame.
 
-    int allow_lag;  // allow lagged compression (if 0 lagin frames is ignored)
     int lag_in_frames;  // how many frames lag before we start encoding
 
     // ----------------------------------------------------------------
@@ -150,6 +149,8 @@
     // Spatial and temporal scalability.
     int ss_number_layers;  // Number of spatial layers.
     int ts_number_layers;  // Number of temporal layers.
+    // Bitrate allocation for spatial layers.
+    int ss_target_bitrate[VPX_SS_MAX_LAYERS];
     // Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
     int ts_target_bitrate[VPX_TS_MAX_LAYERS];
     int ts_rate_decimator[VPX_TS_MAX_LAYERS];
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 97983c5..e6d6ea7 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -224,6 +224,11 @@
   int error_resilient_mode;
   int frame_parallel_decoding_mode;
 
+  // Flag indicates if prev_mi can be used in coding:
+  //   0: encoder assumes decoder does not have prev_mi
+  //   1: encoder assumes decoder has and uses prev_mi
+  unsigned int coding_use_prev_mi;
+
   int log2_tile_cols, log2_tile_rows;
 
   // Private data associated with the frame buffer callbacks.
@@ -302,7 +307,6 @@
 static void set_prev_mi(VP9_COMMON *cm) {
   const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
                                        cm->height == cm->last_height &&
-                                       !cm->error_resilient_mode &&
                                        !cm->intra_only &&
                                        cm->last_show_frame;
   // Special case: set prev_mi to NULL when the previous mode info
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 7f9e563..6c7a0d3 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -39,7 +39,7 @@
   return above_sip + left_sip;
 }
 
-static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
+static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
                                                 const MACROBLOCKD *xd) {
   return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
 }
diff --git a/vp9/common/vp9_prob.c b/vp9/common/vp9_prob.c
index f9bc06e..a1befc6 100644
--- a/vp9/common/vp9_prob.c
+++ b/vp9/common/vp9_prob.c
@@ -10,7 +10,7 @@
 
 #include "vp9/common/vp9_prob.h"
 
-DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
+const uint8_t vp9_norm[256] = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 9fef8b1..def1255 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -134,9 +134,9 @@
                    int base_qindex) {
   if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
     const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
-    return seg->abs_delta == SEGMENT_ABSDATA ?
-                             data :  // Abs value
-                             clamp(base_qindex + data, 0, MAXQ);  // Delta value
+    const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ?
+        data : base_qindex + data;
+    return clamp(seg_qindex, 0, MAXQ);
   } else {
     return base_qindex;
   }
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 7576e7b..df603ad 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -139,9 +139,6 @@
   return clamped_mv;
 }
 
-// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
-// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
-// sizes smaller than 16x16 yet.
 static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
                                    int bw, int bh,
                                    int x, int y, int w, int h,
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 96ba3e4..71a41a9 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -382,34 +382,34 @@
       /* slower path if the block needs border extension */
       if (x0 + 2 * bs <= frame_width) {
         if (right_available && bs == 4) {
-          vpx_memcpy(above_row - 1, above_ref - 1, 2 * bs + 1);
+          vpx_memcpy(above_row, above_ref, 2 * bs);
         } else {
-          vpx_memcpy(above_row - 1, above_ref - 1, bs + 1);
+          vpx_memcpy(above_row, above_ref, bs);
           vpx_memset(above_row + bs, above_row[bs - 1], bs);
         }
       } else if (x0 + bs <= frame_width) {
         const int r = frame_width - x0;
         if (right_available && bs == 4) {
-          vpx_memcpy(above_row - 1, above_ref - 1, r + 1);
+          vpx_memcpy(above_row, above_ref, r);
           vpx_memset(above_row + r, above_row[r - 1],
                      x0 + 2 * bs - frame_width);
         } else {
-          vpx_memcpy(above_row - 1, above_ref - 1, bs + 1);
+          vpx_memcpy(above_row, above_ref, bs);
           vpx_memset(above_row + bs, above_row[bs - 1], bs);
         }
       } else if (x0 <= frame_width) {
         const int r = frame_width - x0;
         if (right_available && bs == 4) {
-          vpx_memcpy(above_row - 1, above_ref - 1, r + 1);
+          vpx_memcpy(above_row, above_ref, r);
           vpx_memset(above_row + r, above_row[r - 1],
                      x0 + 2 * bs - frame_width);
         } else {
-          vpx_memcpy(above_row - 1, above_ref - 1, r + 1);
+          vpx_memcpy(above_row, above_ref, r);
           vpx_memset(above_row + r, above_row[r - 1],
                      x0 + 2 * bs - frame_width);
         }
-        above_row[-1] = left_available ? above_ref[-1] : 129;
       }
+      above_row[-1] = left_available ? above_ref[-1] : 129;
     } else {
       /* faster path if the block does not need extension */
       if (bs == 4 && right_available && left_available) {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 6317103..83ee69b 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -386,7 +386,7 @@
 specialize vp9_variance4x4 mmx $sse2_x86inc
 
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc
+specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc avx2
 
 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
 specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc
@@ -416,7 +416,7 @@
 specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc
 
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc
+specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc avx2
 
 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
 specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc
@@ -683,7 +683,7 @@
 specialize vp9_get_mb_ss mmx sse2
 # ENCODEMB INVOKE
 
-prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
+prototype int64_t vp9_block_error "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
 specialize vp9_block_error $sse2_x86inc
 
 prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
@@ -737,7 +737,7 @@
 #
 # Motion search
 #
-prototype int vp9_full_search_sad "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, int n"
+prototype int vp9_full_search_sad "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, struct mv *best_mv"
 specialize vp9_full_search_sad sse3 sse4_1
 vp9_full_search_sad_sse3=vp9_full_search_sadx3
 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index 7455abc..72edbca5 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -76,9 +76,6 @@
 }
 #endif
 
-struct VP9Common;
-void vp9_machine_specific_config(struct VP9Common *cm);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index a2cf910..1b4904c 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -142,20 +142,29 @@
 #if HAVE_AVX2
 filter8_1dfunction vp9_filter_block1d16_v8_avx2;
 filter8_1dfunction vp9_filter_block1d16_h8_avx2;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+#else
 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+#endif
 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
 #define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3
@@ -183,12 +192,26 @@
 FUN_CONV_2D(, avx2);
 #endif
 #if HAVE_SSSE3
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
+#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
+#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+#else
 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#endif
 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm
index a7f6930..91055b9f 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -527,7 +527,7 @@
         pxor        mm7,        [GLOBAL(t80)]       ; unoffset
         ; mm7 = q1
 
-        ; tranpose and write back
+        ; transpose and write back
         ; mm1 =    72 62 52 42 32 22 12 02
         ; mm6 =    73 63 53 43 33 23 13 03
         ; mm3 =    74 64 54 44 34 24 14 04
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
index 0ffb1bc..efa960c 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
@@ -12,22 +12,25 @@
 #include "vpx_ports/mem.h"
 
 // filters for 16_h8 and 16_v8
-DECLARE_ALIGNED(32, const unsigned char, filt1_global_avx2[32])= {
+DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
   0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
 
-DECLARE_ALIGNED(32, const unsigned char, filt2_global_avx2[32])= {
+DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
   2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
 
-DECLARE_ALIGNED(32, const unsigned char, filt3_global_avx2[32])= {
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
   4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
 
-DECLARE_ALIGNED(32, const unsigned char, filt4_global_avx2[32])= {
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
-
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
 
 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
                                   unsigned int src_pixels_per_line,
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000..cf28d8d
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,490 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+// byte-shuffle masks used only by the 4_h8 convolution
+DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+// byte-shuffle masks for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned char *output_ptr,
+                                         unsigned int output_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, srcReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter into the first lane
+  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bit in the filter into the first lane
+  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the second 16 bits in the filter into the second lane
+  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the fourth 16 bits in the filter into the second lane
+  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+  // load the byte-shuffle masks (held in thirdFilters/forthFilters here)
+  thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
+  forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+    // filter the source buffer
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // extract the higher half (upper 8 bytes) of the register
+    srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
+    srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
+
+    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+    // add and saturate all the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    src_ptr+=src_pixels_per_line;
+
+    // save only 4 bytes
+    *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned char *output_ptr,
+                                         unsigned int output_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+    // filter the source buffer
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
+    srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+    // add and saturate all the results together
+    minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+
+    srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // save only 8 bytes
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
+                                          unsigned int src_pixels_per_line,
+                                          unsigned char *output_ptr,
+                                          unsigned int output_pitch,
+                                          unsigned int output_height,
+                                          int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+    // filter the source buffer
+    srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+    // read the next 16 bytes
+    // (part of which was already read by the earlier load)
+    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    // filter the source buffer
+    srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+    // shrink each 16 bits to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pitch,
+                                         unsigned char *output_ptr,
+                                         unsigned int out_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  for (i = 0; i < output_height; i++) {
+    // load the first 8 bytes
+    srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+    // load the next 8 bytes in stride of src_pitch
+    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
+    srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
+    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+
+    // merge the result together
+    srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+
+    // load the next 8 bytes in stride of src_pitch
+    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
+    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
+    srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
+    srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+
+    // merge the result together
+    srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+    // add and saturate the results together
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // save only 8 bytes convolve result
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}
+
+void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
+                                          unsigned int src_pitch,
+                                          unsigned char *output_ptr,
+                                          unsigned int out_pitch,
+                                          unsigned int output_height,
+                                          int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+  unsigned int i;
+  // Vertical 8-tap filter, 16 bytes wide: output row i is computed from
+  // source rows i..i+7, which lie src_pitch bytes apart.
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);  // 64 per 16-bit lane
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // saturate the eight 16-bit taps to signed bytes; packing the register
+  // with itself puts the same 8 bytes in both 64-bit halves.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // broadcast taps 0 and 1 (bytes 0-1) to every 16-bit lane
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // broadcast taps 2 and 3 (bytes 2-3) to every 16-bit lane
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // broadcast taps 4 and 5 (bytes 4-5) to every 16-bit lane
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // broadcast taps 6 and 7 (bytes 6-7) to every 16-bit lane
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  for (i = 0; i < output_height; i++) {
+    // load 16 bytes of row 0
+    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
+    // load 16 bytes each of rows 1, 6 and 7
+    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
+    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
+    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+
+    // interleave rows 0/1 and rows 6/7 byte-wise (low and high 8 columns)
+    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
+    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+
+    // multiply each byte pair by its two taps and sum into 16-bit lanes
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
+    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+
+    // saturating add of the tap 0/1 and tap 6/7 partial sums
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+    // load 16 bytes each of rows 2 and 3
+    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
+    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
+
+    // interleave rows 2/3 byte-wise (low and high 8 columns)
+    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+    // multiply each byte pair by taps 2 and 3 and sum into 16-bit lanes
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
+
+    // load 16 bytes each of rows 4 and 5
+    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
+    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
+
+    // interleave rows 4/5 byte-wise (low and high 8 columns)
+    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+    // multiply each byte pair by taps 4 and 5 and sum into 16-bit lanes
+    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
+    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
+
+    // saturating add of the smaller of the two middle partial sums first
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));
+
+    // then the larger one, followed by the rounding term
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // arithmetic shift right by 7 in each 16-bit lane
+    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack to unsigned bytes with saturation: the low 8 bytes come from
+    // srcRegFilt5 (columns 0-7) and the high 8 bytes from srcRegFilt1
+    // (columns 8-15)
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // aligned 16-byte store; NOTE(review): confirm output_ptr alignment
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index b1900e6..56b993d 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -15,6 +15,7 @@
 #include "./vpx_scale_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
 #include "vpx_scale/vpx_scale.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -39,20 +40,16 @@
 #include "vp9/decoder/vp9_reader.h"
 #include "vp9/decoder/vp9_thread.h"
 
-static int read_be32(const uint8_t *p) {
-  return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
-}
-
 static int is_compound_reference_allowed(const VP9_COMMON *cm) {
   int i;
   for (i = 1; i < REFS_PER_FRAME; ++i)
-    if  (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
+    if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
       return 1;
 
   return 0;
 }
 
-static void setup_compound_reference(VP9_COMMON *cm) {
+static void setup_compound_reference_mode(VP9_COMMON *cm) {
   if (cm->ref_frame_sign_bias[LAST_FRAME] ==
           cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
     cm->comp_fixed_ref = ALTREF_FRAME;
@@ -116,33 +113,34 @@
       vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
 }
 
-static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, vp9_reader *r) {
+static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
+                                                vp9_reader *r) {
   if (is_compound_reference_allowed(cm)) {
-    REFERENCE_MODE mode = vp9_read_bit(r);
-    if (mode)
-      mode += vp9_read_bit(r);
-    setup_compound_reference(cm);
-    return mode;
+    return vp9_read_bit(r) ? (vp9_read_bit(r) ? REFERENCE_MODE_SELECT
+                                              : COMPOUND_REFERENCE)
+                           : SINGLE_REFERENCE;
   } else {
     return SINGLE_REFERENCE;
   }
 }
 
-static void read_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
+static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
+  FRAME_CONTEXT *const fc = &cm->fc;
   int i;
+
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
-    for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-      vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
+    for (i = 0; i < COMP_INTER_CONTEXTS; ++i)
+      vp9_diff_update_prob(r, &fc->comp_inter_prob[i]);
 
   if (cm->reference_mode != COMPOUND_REFERENCE)
-    for (i = 0; i < REF_CONTEXTS; i++) {
-      vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
-      vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
+    for (i = 0; i < REF_CONTEXTS; ++i) {
+      vp9_diff_update_prob(r, &fc->single_ref_prob[i][0]);
+      vp9_diff_update_prob(r, &fc->single_ref_prob[i][1]);
     }
 
   if (cm->reference_mode != SINGLE_REFERENCE)
-    for (i = 0; i < REF_CONTEXTS; i++)
-      vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
+    for (i = 0; i < REF_CONTEXTS; ++i)
+      vp9_diff_update_prob(r, &fc->comp_ref_prob[i]);
 }
 
 static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) {
@@ -289,10 +287,8 @@
   MACROBLOCKD *const xd = args->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
   MODE_INFO *const mi = xd->mi_8x8[0];
-  const MB_PREDICTION_MODE mode = (plane == 0)
-          ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[block].as_mode
-                                            : mi->mbmi.mode)
-          : mi->mbmi.uv_mode;
+  const MB_PREDICTION_MODE mode = (plane == 0) ? get_y_mode(mi, block)
+                                               : mi->mbmi.uv_mode;
   int x, y;
   uint8_t *dst;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
@@ -350,9 +346,9 @@
 
   xd->mi_8x8 = cm->mi_grid_visible + offset;
   xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset;
-  // Special case: if prev_mi is NULL, the previous mode info context
-  // cannot be used.
-  xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
+
+  xd->last_mi = cm->coding_use_prev_mi && cm->prev_mi ?
+      xd->prev_mi_8x8[0] : NULL;
 
   xd->mi_8x8[0] = xd->mi_stream + offset - tile_offset;
   xd->mi_8x8[0]->mbmi.sb_type = bsize;
@@ -836,7 +832,7 @@
       vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
                          "Truncated packet or corrupt tile length");
 
-    size = read_be32(*data);
+    size = mem_get_be32(*data);
     *data += 4;
 
     if (size > (size_t)(data_end - *data))
@@ -1120,6 +1116,12 @@
   if (cm->show_existing_frame) {
     // Show an existing frame directly.
     const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
+
+    if (cm->frame_bufs[frame_to_show].ref_count < 1)
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Buffer %d does not contain a decoded frame",
+                         frame_to_show);
+
     ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show);
     pbi->refresh_frame_flags = 0;
     cm->lf.filter_level = 0;
@@ -1203,9 +1205,11 @@
   }
 
   if (!cm->error_resilient_mode) {
+    cm->coding_use_prev_mi = 1;
     cm->refresh_frame_context = vp9_rb_read_bit(rb);
     cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
   } else {
+    cm->coding_use_prev_mi = 0;
     cm->refresh_frame_context = 0;
     cm->frame_parallel_decoding_mode = 1;
   }
@@ -1263,8 +1267,10 @@
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]);
 
-    cm->reference_mode = read_reference_mode(cm, &r);
-    read_reference_mode_probs(cm, &r);
+    cm->reference_mode = read_frame_reference_mode(cm, &r);
+    if (cm->reference_mode != SINGLE_REFERENCE)
+      setup_compound_reference_mode(cm);
+    read_frame_reference_mode_probs(cm, &r);
 
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
       for (i = 0; i < INTRA_MODES - 1; ++i)
@@ -1373,7 +1379,10 @@
   alloc_tile_storage(pbi, tile_rows, tile_cols);
 
   xd->mode_info_stride = cm->mode_info_stride;
-  set_prev_mi(cm);
+  if (cm->coding_use_prev_mi)
+    set_prev_mi(cm);
+  else
+    cm->prev_mi = NULL;
 
   setup_plane_dequants(cm, xd, cm->base_qindex);
   vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 856c8b5..0fb7a15 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -257,13 +257,18 @@
   mv->col = ref->col + diff.col;
 }
 
-static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, const MACROBLOCKD *xd,
-                                          vp9_reader *r) {
-  const int ctx = vp9_get_reference_mode_context(cm, xd);
-  const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]);
-  if (!cm->frame_parallel_decoding_mode)
-    ++cm->counts.comp_inter[ctx][mode];
-  return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
+static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
+                                                const MACROBLOCKD *xd,
+                                                vp9_reader *r) {
+  if (cm->reference_mode != REFERENCE_MODE_SELECT) {
+    return cm->reference_mode;  // frame-level mode applies; nothing is read
+  } else {
+    const int ctx = vp9_get_reference_mode_context(cm, xd);
+    const int bit = vp9_read(r, cm->fc.comp_inter_prob[ctx]);
+    if (!cm->frame_parallel_decoding_mode)
+      ++cm->counts.comp_inter[ctx][bit];
+    return bit;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
+  }
 }
 
 // Read the referncence frame
@@ -277,10 +282,7 @@
     ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
     ref_frame[1] = NONE;
   } else {
-    const REFERENCE_MODE mode = (cm->reference_mode == REFERENCE_MODE_SELECT)
-                                      ? read_reference_mode(cm, xd, r)
-                                      : cm->reference_mode;
-
+    const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
     // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
     if (mode == COMPOUND_REFERENCE) {
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
@@ -356,6 +358,11 @@
   mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
 }
 
+static INLINE int is_mv_valid(const MV *mv) {  // strict bounds on row and col
+  const int row_ok = MV_LOW < mv->row && mv->row < MV_UPP;
+  return row_ok && MV_LOW < mv->col && mv->col < MV_UPP;
+}
+
 static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
                             int_mv mv[2], int_mv ref_mv[2],
                             int_mv nearest_mv[2], int_mv near_mv[2],
@@ -367,14 +374,10 @@
     case NEWMV: {
       nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
                                             NULL : &cm->counts.mv;
-      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv,
-              &cm->fc.nmvc, mv_counts, allow_hp);
-      if (is_compound)
-        read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv,
-                &cm->fc.nmvc, mv_counts, allow_hp);
       for (i = 0; i < 1 + is_compound; ++i) {
-        ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW;
-        ret = ret && mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW;
+        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc.nmvc, mv_counts,
+                allow_hp);
+        ret = ret && is_mv_valid(&mv[i].as_mv);
       }
       break;
     }
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 01ee92f..14600e8 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -14,27 +14,27 @@
 
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
 
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_pragmas.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_tile_common.h"
 
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_subexp.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_write_bit_buffer.h"
 
 #ifdef ENTROPY_STATS
-vp9_coeff_stats tree_update_hist[TX_SIZES][PLANE_TYPES];
 extern unsigned int active_section;
 #endif
 
@@ -62,15 +62,8 @@
                   &inter_mode_encodings[INTER_OFFSET(mode)]);
 }
 
-static INLINE void write_be32(uint8_t *p, int value) {
-  p[0] = value >> 24;
-  p[1] = value >> 16;
-  p[2] = value >> 8;
-  p[3] = value;
-}
-
-void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
-                             int data, int max) {
+static void encode_unsigned_max(struct vp9_write_bit_buffer *wb,
+                                int data, int max) {
   vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
 }
 
@@ -116,15 +109,14 @@
   }
 }
 
-void vp9_update_skip_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w) {
   int k;
 
   for (k = 0; k < SKIP_CONTEXTS; ++k)
     vp9_cond_prob_diff_update(w, &cm->fc.skip_probs[k], cm->counts.skip[k]);
 }
 
-static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
-  VP9_COMMON *const cm = &cpi->common;
+static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) {
   int j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     prob_diff_update(vp9_switchable_interp_tree,
@@ -132,9 +124,8 @@
                      cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
-static void pack_mb_tokens(vp9_writer* const w,
-                           TOKENEXTRA **tp,
-                           const TOKENEXTRA *const stop) {
+static void pack_mb_tokens(vp9_writer *w,
+                           TOKENEXTRA **tp, const TOKENEXTRA *stop) {
   TOKENEXTRA *p = *tp;
 
   while (p < stop && p->token != EOSB_TOKEN) {
@@ -201,45 +192,40 @@
 }
 
 // This function encodes the reference frame
-static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mi = &xd->mi_8x8[0]->mbmi;
-  const int segment_id = mi->segment_id;
-  int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
-                                             SEG_LVL_REF_FRAME);
+static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  const int is_compound = has_second_ref(mbmi);
+  const int segment_id = mbmi->segment_id;
+
   // If segment level coding of this signal is disabled...
   // or the segment allows multiple reference frame options
-  if (!seg_ref_active) {
+  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    assert(!is_compound);
+    assert(mbmi->ref_frame[0] ==
+               vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+  } else {
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-      vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
-                vp9_get_reference_mode_prob(cm, xd));
+      vp9_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
     } else {
-      assert((mi->ref_frame[1] <= INTRA_FRAME) ==
-             (cm->reference_mode == SINGLE_REFERENCE));
+      assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
-    if (mi->ref_frame[1] > INTRA_FRAME) {
-      vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME,
+    if (is_compound) {
+      vp9_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
                 vp9_get_pred_prob_comp_ref_p(cm, xd));
     } else {
-      vp9_write(bc, mi->ref_frame[0] != LAST_FRAME,
-                vp9_get_pred_prob_single_ref_p1(cm, xd));
-      if (mi->ref_frame[0] != LAST_FRAME)
-        vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME,
-                  vp9_get_pred_prob_single_ref_p2(cm, xd));
+      const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
+      vp9_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
+      if (bit0) {
+        const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
+        vp9_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
+      }
     }
-  } else {
-    assert(mi->ref_frame[1] <= INTRA_FRAME);
-    assert(vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ==
-           mi->ref_frame[0]);
   }
-
-  // If using the prediction model we have nothing further to do because
-  // the reference frame is fully coded by the segment.
 }
 
 static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
@@ -247,15 +233,15 @@
   const nmv_context *nmvc = &cm->fc.nmvc;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct segmentation *seg = &cm->seg;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  const MV_REFERENCE_FRAME rf = mi->ref_frame[0];
-  const MV_REFERENCE_FRAME sec_rf = mi->ref_frame[1];
+  const struct segmentation *const seg = &cm->seg;
+  const MB_MODE_INFO *const mi = &m->mbmi;
+  const MV_REFERENCE_FRAME ref0 = mi->ref_frame[0];
+  const MV_REFERENCE_FRAME ref1 = mi->ref_frame[1];
   const MB_PREDICTION_MODE mode = mi->mode;
   const int segment_id = mi->segment_id;
-  int skip;
   const BLOCK_SIZE bsize = mi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
+  int skip;
 
 #ifdef ENTROPY_STATS
   active_section = 9;
@@ -276,15 +262,15 @@
   skip = write_skip(cpi, segment_id, m, bc);
 
   if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-    vp9_write(bc, rf != INTRA_FRAME, vp9_get_intra_inter_prob(cm, xd));
+    vp9_write(bc, ref0 != INTRA_FRAME, vp9_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
-      !(rf != INTRA_FRAME &&
+      !(ref0 != INTRA_FRAME &&
         (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
     write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc);
   }
 
-  if (rf == INTRA_FRAME) {
+  if (ref0 == INTRA_FRAME) {
 #ifdef ENTROPY_STATS
     active_section = 6;
 #endif
@@ -305,8 +291,8 @@
     write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]);
   } else {
     vp9_prob *mv_ref_p;
-    encode_ref_frame(cpi, bc);
-    mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mode_context[rf]];
+    write_ref_frames(cpi, bc);
+    mv_ref_p = cm->fc.inter_mode_probs[mi->mode_context[ref0]];
 
 #ifdef ENTROPY_STATS
     active_section = 3;
@@ -316,7 +302,7 @@
     if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
         write_inter_mode(bc, mode, mv_ref_p);
-        ++cm->counts.inter_mode[mi->mode_context[rf]][INTER_OFFSET(mode)];
+        ++cm->counts.inter_mode[mi->mode_context[ref0]][INTER_OFFSET(mode)];
       }
     }
 
@@ -336,21 +322,19 @@
       for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
         for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
           const int j = idy * 2 + idx;
-          const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
-          write_inter_mode(bc, blockmode, mv_ref_p);
-          ++cm->counts.inter_mode[mi->mode_context[rf]]
-                                 [INTER_OFFSET(blockmode)];
-
-          if (blockmode == NEWMV) {
+          const MB_PREDICTION_MODE b_mode = m->bmi[j].as_mode;
+          write_inter_mode(bc, b_mode, mv_ref_p);
+          ++cm->counts.inter_mode[mi->mode_context[ref0]][INTER_OFFSET(b_mode)];
+          if (b_mode == NEWMV) {
 #ifdef ENTROPY_STATS
             active_section = 11;
 #endif
             vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv,
-                          &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp);
+                          &mi->ref_mvs[ref0][0].as_mv, nmvc, allow_hp);
 
             if (has_second_ref(mi))
               vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv,
-                            &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp);
+                            &mi->ref_mvs[ref1][0].as_mv, nmvc, allow_hp);
           }
         }
       }
@@ -359,11 +343,11 @@
       active_section = 5;
 #endif
       vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv,
-                    &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp);
+                    &mi->ref_mvs[ref0][0].as_mv, nmvc, allow_hp);
 
       if (has_second_ref(mi))
         vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv,
-                      &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp);
+                      &mi->ref_mvs[ref1][0].as_mv, nmvc, allow_hp);
     }
   }
 }
@@ -550,16 +534,6 @@
             coef_probs[i][j][k][l][m] = get_binary_prob(
                                             coef_branch_ct[i][j][k][l][m][0],
                                             coef_branch_ct[i][j][k][l][m][1]);
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing) {
-            int t;
-            for (t = 0; t < ENTROPY_TOKENS; ++t)
-              context_counters[tx_size][i][j][k][l][t] +=
-                  coef_counts[i][j][k][l][t];
-            context_counters[tx_size][i][j][k][l][ENTROPY_TOKENS] +=
-                eob_branch_ct[i][j][k][l];
-          }
-#endif
         }
       }
     }
@@ -638,10 +612,6 @@
                 if (s > 0 && newp != *oldp)
                   u = 1;
                 vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
-                if (!cpi->dummy_packing)
-                  ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -693,10 +663,6 @@
                 updates += u;
                 if (u == 0 && updates == 0) {
                   noupdates_before_first++;
-#ifdef ENTROPY_STATS
-                  if (!cpi->dummy_packing)
-                    ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
                   continue;
                 }
                 if (u == 1 && updates == 1) {
@@ -707,10 +673,6 @@
                     vp9_write(bc, 0, upd);
                 }
                 vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
-                if (!cpi->dummy_packing)
-                  ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -732,7 +694,7 @@
   }
 }
 
-static void update_coef_probs(VP9_COMP* cpi, vp9_writer* w) {
+static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
   const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
   TX_SIZE tx_size;
@@ -855,10 +817,10 @@
           const int data_max = vp9_seg_feature_data_max(j);
 
           if (vp9_is_segfeature_signed(j)) {
-            vp9_encode_unsigned_max(wb, abs(data), data_max);
+            encode_unsigned_max(wb, abs(data), data_max);
             vp9_wb_write_bit(wb, data < 0);
           } else {
-            vp9_encode_unsigned_max(wb, data, data_max);
+            encode_unsigned_max(wb, data, data_max);
           }
         }
       }
@@ -867,9 +829,7 @@
 }
 
 
-static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
-  VP9_COMMON *const cm = &cpi->common;
-
+static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
   // Mode
   vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
   if (cm->tx_mode >= ALLOW_32X32)
@@ -1032,7 +992,7 @@
       vp9_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
         // size of this tile
-        write_be32(data_ptr + total_size, residual_bc.pos);
+        mem_put_be32(data_ptr + total_size, residual_bc.pos);
         total_size += 4;
       }
 
@@ -1187,7 +1147,7 @@
   if (xd->lossless)
     cm->tx_mode = ONLY_4X4;
   else
-    encode_txfm_probs(cpi, &header_bc);
+    encode_txfm_probs(cm, &header_bc);
 
   update_coef_probs(cpi, &header_bc);
 
@@ -1195,7 +1155,7 @@
   active_section = 2;
 #endif
 
-  vp9_update_skip_probs(cm, &header_bc);
+  update_skip_probs(cm, &header_bc);
 
   if (!frame_is_intra_only(cm)) {
     int i;
@@ -1210,7 +1170,7 @@
     vp9_zero(cm->counts.inter_mode);
 
     if (cm->interp_filter == SWITCHABLE)
-      update_switchable_interp_probs(cpi, &header_bc);
+      update_switchable_interp_probs(cm, &header_bc);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
@@ -1282,11 +1242,12 @@
     active_section = 7;
 #endif
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   first_part_size = write_compressed_header(cpi, data);
   data += first_part_size;
-  vp9_wb_write_literal(&saved_wb, first_part_size, 16);
+  // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
+  vp9_wb_write_literal(&saved_wb, (int)first_part_size, 16);
 
   data += encode_tiles(cpi, data);
 
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index 94bec8a..ddfd0ed 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -16,7 +16,11 @@
 extern "C" {
 #endif
 
-void vp9_update_skip_probs(VP9_COMMON *cm, vp9_writer *bc);
+struct VP9_COMP;
+
+void vp9_entropy_mode_init(void);
+
+void vp9_pack_bitstream(struct VP9_COMP *cpi, uint8_t *dest, size_t *size);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 7cbdfce..85f6c97 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -49,7 +49,6 @@
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   int rate;
   int distortion;
-  int64_t intra_error;
   int best_mode_index;
   int rddiv;
   int rdmult;
@@ -63,9 +62,6 @@
   // search loop
   int_mv pred_mv[MAX_REF_FRAMES];
   INTERP_FILTER pred_interp_filter;
-
-  // Bit flag for each mode whether it has high error in comparison to others.
-  unsigned int modes_with_high_error;
 } PICK_MODE_CONTEXT;
 
 struct macroblock_plane {
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index a9d168c..d523239 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -47,7 +47,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -315,7 +315,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 73c1992..46072a2 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -94,7 +94,8 @@
   128, 128, 128, 128, 128, 128, 128, 128
 };
 
-static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x,
+static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
+                                              MACROBLOCK *x,
                                               BLOCK_SIZE bs) {
   unsigned int var, sse;
   var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
@@ -102,6 +103,52 @@
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
+// Runs the variance function for |bs| on the plane-0 source block of |x|
+// against the LAST_FRAME Y buffer at (mi_row, mi_col), normalized per pixel.
+static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
+                                                   MACROBLOCK *x,
+                                                   int mi_row, int mi_col,
+                                                   BLOCK_SIZE bs) {
+  const YV12_BUFFER_CONFIG *ref = get_ref_frame_buffer(cpi, LAST_FRAME);
+  const int off = (mi_row * MI_SIZE) * ref->y_stride + (mi_col * MI_SIZE);
+  unsigned int sse;
+  const unsigned int diff_var = cpi->fn_ptr[bs].vf(
+      x->plane[0].src.buf, x->plane[0].src.stride,
+      ref->y_buffer + off, ref->y_stride, &sse);
+  return ROUND_POWER_OF_TWO(diff_var, num_pels_log2_lookup[bs]);
+}
+
+// Chooses a fixed partition size from the BLOCK_64X64 per-pixel diff
+// variance at (mi_row, mi_col): larger variance selects smaller blocks.
+static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi,
+                                                   int mi_row,
+                                                   int mi_col) {
+  const unsigned int diff_var = get_sby_perpixel_diff_variance(
+      cpi, &cpi->mb, mi_row, mi_col, BLOCK_64X64);
+  if (diff_var < 8)
+    return BLOCK_64X64;
+  if (diff_var < 128)
+    return BLOCK_32X32;
+  if (diff_var < 2048)
+    return BLOCK_16X16;
+  return BLOCK_8X8;
+}
+
+// Chooses a fixed partition size from the BLOCK_64X64 per-pixel diff
+// variance at (mi_row, mi_col); never goes below BLOCK_16X16.
+static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi,
+                                                      int mi_row,
+                                                      int mi_col) {
+  const unsigned int diff_var = get_sby_perpixel_diff_variance(
+      cpi, &cpi->mb, mi_row, mi_col, BLOCK_64X64);
+  if (diff_var < 4)
+    return BLOCK_64X64;
+  if (diff_var < 10)
+    return BLOCK_32X32;
+  return BLOCK_16X16;
+}
+
 // Original activity measure from Tim T's code.
 static unsigned int tt_activity_measure(MACROBLOCK *x) {
   unsigned int sse;
@@ -398,7 +445,6 @@
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
   MODE_INFO *mi_addr = xd->mi_8x8[0];
 
-  const int mb_mode_index = ctx->best_mode_index;
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
@@ -442,7 +488,7 @@
 
     if ((cpi->oxcf.aq_mode == VARIANCE_AQ) ||
         (cpi->oxcf.aq_mode == COMPLEXITY_AQ)) {
-    vp9_mb_init_quantizer(cpi, x);
+    vp9_init_plane_quantizers(cpi, x);
   }
 
   // FIXME(rbultje) I'm pretty sure this should go to the end of this block
@@ -470,8 +516,8 @@
       cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i];
   }
 
-  if (frame_is_intra_only(cm)) {
 #if CONFIG_INTERNAL_STATS
+  if (frame_is_intra_only(cm)) {
     static const int kf_mode_index[] = {
       THR_DC        /*DC_PRED*/,
       THR_V_PRED    /*V_PRED*/,
@@ -484,18 +530,19 @@
       THR_D63_PRED  /*D63_PRED*/,
       THR_TM        /*TM_PRED*/,
     };
-    cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]]++;
-#endif
+    ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]];
   } else {
     // Note how often each mode chosen as best
-    cpi->mode_chosen_counts[mb_mode_index]++;
-
+    ++cpi->mode_chosen_counts[ctx->best_mode_index];
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
     if (is_inter_block(mbmi)) {
       if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) {
-        int_mv best_mv[2];
+        MV best_mv[2];
         for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
-          best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
-        vp9_update_mv_count(cpi, x, best_mv);
+          best_mv[i] = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+        vp9_update_mv_count(cm, xd, best_mv);
       }
 
       if (cm->interp_filter == SWITCHABLE) {
@@ -553,8 +600,6 @@
   xd->mi_8x8 = cm->mi_grid_visible + idx_str;
   xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
 
-  // Special case: if prev_mi is NULL, the previous mode info context
-  // cannot be used.
   xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
 
   xd->mi_8x8[0] = cm->mi + idx_str;
@@ -590,7 +635,7 @@
                                                  : cm->last_frame_seg_map;
       mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
-    vp9_mb_init_quantizer(cpi, x);
+    vp9_init_plane_quantizers(cpi, x);
 
     if (seg->enabled && cpi->seg0_cnt > 0 &&
         !vp9_segfeature_active(seg, 0, SEG_LVL_REF_FRAME) &&
@@ -629,7 +674,7 @@
   int orig_rdmult = x->rdmult;
   double rdmult_ratio;
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
   rdmult_ratio = 1.0;  // avoid uninitialized warnings
 
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
@@ -678,15 +723,15 @@
     }
 
     rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
-    vp9_mb_init_quantizer(cpi, x);
+    vp9_init_plane_quantizers(cpi, x);
   }
 
   if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
     activity_masking(cpi, x);
 
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-    vp9_clear_system_state();  // __asm emms;
-    x->rdmult = round(x->rdmult * rdmult_ratio);
+    vp9_clear_system_state();
+    x->rdmult = (int)round(x->rdmult * rdmult_ratio);
   } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
     const int mi_offset = mi_row * cm->mi_cols + mi_col;
     unsigned char complexity = cpi->complexity_map[mi_offset];
@@ -715,8 +760,8 @@
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
     x->rdmult = orig_rdmult;
     if (*totalrate != INT_MAX) {
-      vp9_clear_system_state();  // __asm emms;
-      *totalrate = round(*totalrate * rdmult_ratio);
+      vp9_clear_system_state();
+      *totalrate = (int)round(*totalrate * rdmult_ratio);
     }
   }
   else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
@@ -967,9 +1012,9 @@
 // may not be allowed in which case this code attempts to choose the largest
 // allowable partition.
 static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
-                             MODE_INFO **mi_8x8, int mi_row, int mi_col) {
+                             MODE_INFO **mi_8x8, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
   const int mis = cm->mode_info_stride;
   int row8x8_remaining = tile->mi_row_end - mi_row;
   int col8x8_remaining = tile->mi_col_end - mi_col;
@@ -996,7 +1041,7 @@
       for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
         int index = block_row * mis + block_col;
         // Find a partition size that fits
-        bsize = find_partition_size(cpi->sf.always_this_block_size,
+        bsize = find_partition_size(bsize,
                                     (row8x8_remaining - block_row),
                                     (col8x8_remaining - block_col), &bh, &bw);
         mi_8x8[index] = mi_upper_left + index;
@@ -1042,38 +1087,19 @@
   }
   return 0;
 }
+
 static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
                          BLOCK_SIZE bsize, int output_enabled) {
   int i;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *const p = x->plane;
-  struct macroblockd_plane *const pd = xd->plane;
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
 
-  const int mb_mode_index = ctx->best_mode_index;
-  int max_plane;
-
-  max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
-  for (i = 0; i < max_plane; ++i) {
-    p[i].coeff = ctx->coeff_pbuf[i][1];
-    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
-    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
-    p[i].eobs = ctx->eobs_pbuf[i][1];
-  }
-
-  for (i = max_plane; i < MAX_MB_PLANE; ++i) {
-    p[i].coeff = ctx->coeff_pbuf[i][2];
-    p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
-    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
-    p[i].eobs = ctx->eobs_pbuf[i][2];
-  }
-
   x->skip = ctx->skip;
 
-  if (frame_is_intra_only(cm)) {
 #if CONFIG_INTERNAL_STATS
+  if (frame_is_intra_only(cm)) {
     static const int kf_mode_index[] = {
       THR_DC /*DC_PRED*/,
       THR_V_PRED /*V_PRED*/,
@@ -1087,16 +1113,18 @@
       THR_TM /*TM_PRED*/,
     };
     ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]];
-#endif
   } else {
     // Note how often each mode chosen as best
-    cpi->mode_chosen_counts[mb_mode_index]++;
+    ++cpi->mode_chosen_counts[ctx->best_mode_index];
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
     if (is_inter_block(mbmi)) {
       if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) {
-        int_mv best_mv[2];
+        MV best_mv[2];
         for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
-          best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
-        vp9_update_mv_count(cpi, x, best_mv);
+          best_mv[i] = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+        vp9_update_mv_count(cm, xd, best_mv);
       }
 
       if (cm->interp_filter == SWITCHABLE) {
@@ -1129,8 +1157,8 @@
 }
 
 static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
-                      TOKENEXTRA **tp, int mi_row, int mi_col,
-                      int output_enabled, BLOCK_SIZE bsize) {
+                         TOKENEXTRA **tp, int mi_row, int mi_col,
+                         int output_enabled, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
@@ -1148,7 +1176,6 @@
     ctx = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
                                  mi_row, mi_col, bsize);
     subsize = mi_8x8[0]->mbmi.sb_type;
-
   } else {
     ctx = 0;
     subsize = BLOCK_4X4;
@@ -1199,7 +1226,7 @@
                    subsize);
       *get_sb_index(x, subsize) = 3;
       encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-                subsize);
+                   subsize);
       break;
     default:
       assert("Invalid partition type.");
@@ -1231,13 +1258,14 @@
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   int last_part_rate = INT_MAX;
-  int64_t last_part_dist = INT_MAX;
-  int split_rate = INT_MAX;
-  int64_t split_dist = INT_MAX;
+  int64_t last_part_dist = INT64_MAX;
+  int64_t last_part_rd = INT64_MAX;
   int none_rate = INT_MAX;
-  int64_t none_dist = INT_MAX;
+  int64_t none_dist = INT64_MAX;
+  int64_t none_rd = INT64_MAX;
   int chosen_rate = INT_MAX;
-  int64_t chosen_dist = INT_MAX;
+  int64_t chosen_dist = INT64_MAX;
+  int64_t chosen_rd = INT64_MAX;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
   BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
@@ -1266,7 +1294,8 @@
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
   }
 
-  if (cpi->sf.adjust_partitioning_from_last_frame) {
+  if (cpi->sf.partition_search_type == SEARCH_PARTITION &&
+      cpi->sf.adjust_partitioning_from_last_frame) {
     // Check if any of the sub blocks are further split.
     if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
       sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
@@ -1292,7 +1321,11 @@
       pl = partition_plane_context(cpi->above_seg_context,
                                    cpi->left_seg_context,
                                    mi_row, mi_col, bsize);
-      none_rate += x->partition_cost[pl][PARTITION_NONE];
+
+      if (none_rate < INT_MAX) {
+        none_rate += x->partition_cost[pl][PARTITION_NONE];
+        none_rd = RDCOST(x->rdmult, x->rddiv, none_rate, none_dist);
+      }
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
       mi_8x8[0]->mbmi.sb_type = bs_type;
@@ -1320,9 +1353,9 @@
         *get_sb_index(x, subsize) = 1;
         rd_pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt,
                          subsize, get_block_context(x, subsize), INT64_MAX);
-        if (rt == INT_MAX || dt == INT_MAX) {
+        if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
-          last_part_dist = INT_MAX;
+          last_part_dist = INT64_MAX;
           break;
         }
 
@@ -1344,9 +1377,9 @@
         *get_sb_index(x, subsize) = 1;
         rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt,
                          subsize, get_block_context(x, subsize), INT64_MAX);
-        if (rt == INT_MAX || dt == INT_MAX) {
+        if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
-          last_part_dist = INT_MAX;
+          last_part_dist = INT64_MAX;
           break;
         }
         last_part_rate += rt;
@@ -1372,9 +1405,9 @@
         rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
                          i != 3);
-        if (rt == INT_MAX || dt == INT_MAX) {
+        if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
-          last_part_dist = INT_MAX;
+          last_part_dist = INT64_MAX;
           break;
         }
         last_part_rate += rt;
@@ -1387,16 +1420,19 @@
 
   pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
                                mi_row, mi_col, bsize);
-  if (last_part_rate < INT_MAX)
+  if (last_part_rate < INT_MAX) {
     last_part_rate += x->partition_cost[pl][partition];
+    last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist);
+  }
 
   if (cpi->sf.adjust_partitioning_from_last_frame
+      && cpi->sf.partition_search_type == SEARCH_PARTITION
       && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
       && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows)
       && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) {
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
-    split_rate = 0;
-    split_dist = 0;
+    chosen_rate = 0;
+    chosen_dist = 0;
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
     // Split partition.
@@ -1423,46 +1459,44 @@
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
-      if (rt == INT_MAX || dt == INT_MAX) {
-        split_rate = INT_MAX;
-        split_dist = INT_MAX;
+      if (rt == INT_MAX || dt == INT64_MAX) {
+        chosen_rate = INT_MAX;
+        chosen_dist = INT64_MAX;
         break;
       }
 
+      chosen_rate += rt;
+      chosen_dist += dt;
+
       if (i != 3)
         encode_sb(cpi, tile, tp,  mi_row + y_idx, mi_col + x_idx, 0,
                   split_subsize);
 
-      split_rate += rt;
-      split_dist += dt;
       pl = partition_plane_context(cpi->above_seg_context,
                                    cpi->left_seg_context,
                                    mi_row + y_idx, mi_col + x_idx,
                                    split_subsize);
-      split_rate += x->partition_cost[pl][PARTITION_NONE];
+      chosen_rate += x->partition_cost[pl][PARTITION_NONE];
     }
     pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
                                  mi_row, mi_col, bsize);
-    if (split_rate < INT_MAX) {
-      split_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
-      chosen_rate = split_rate;
-      chosen_dist = split_dist;
+    if (chosen_rate < INT_MAX) {
+      chosen_rate += x->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist);
     }
   }
 
   // If last_part is better set the partitioning to that...
-  if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist)
-      < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) {
+  if (last_part_rd < chosen_rd) {
     mi_8x8[0]->mbmi.sb_type = bsize;
     if (bsize >= BLOCK_8X8)
       *(get_sb_partitioning(x, bsize)) = subsize;
     chosen_rate = last_part_rate;
     chosen_dist = last_part_dist;
+    chosen_rd = last_part_rd;
   }
   // If none was better set the partitioning to that...
-  if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)
-      > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) {
+  if (none_rd < chosen_rd) {
     if (bsize >= BLOCK_8X8)
       *(get_sb_partitioning(x, bsize)) = bsize;
     chosen_rate = none_rate;
@@ -1474,7 +1508,7 @@
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
   if ( bsize == BLOCK_64X64)
-    assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
+    assert(chosen_rate < INT_MAX && chosen_dist < INT64_MAX);
 
   if (do_recon) {
     int output_enabled = (bsize == BLOCK_64X64);
@@ -1932,14 +1966,14 @@
   if (bsize == BLOCK_64X64) {
     assert(tp_orig < *tp);
     assert(best_rate < INT_MAX);
-    assert(best_dist < INT_MAX);
+    assert(best_dist < INT64_MAX);
   } else {
     assert(tp_orig == *tp);
   }
 }
 
-static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
-                          int mi_row, TOKENEXTRA **tp) {
+static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+                             int mi_row, TOKENEXTRA **tp) {
   VP9_COMMON *const cm = &cpi->common;
   int mi_col;
 
@@ -1955,28 +1989,45 @@
 
     BLOCK_SIZE i;
     MACROBLOCK *x = &cpi->mb;
-    for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) {
-      const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
-      const int num_4x4_h = num_4x4_blocks_high_lookup[i];
-      const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
-      for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index)
-        for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index)
-          for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index)
-            get_block_context(x, i)->pred_interp_filter = SWITCHABLE;
+
+    if (cpi->sf.adaptive_pred_interp_filter) {
+      for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) {
+        const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+        const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+        const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
+        for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index)
+          for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index)
+            for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index)
+              get_block_context(x, i)->pred_interp_filter = SWITCHABLE;
+      }
     }
 
     vp9_zero(cpi->mb.pred_mv);
 
-    if (cpi->sf.use_lastframe_partitioning ||
-        cpi->sf.use_one_partition_size_always ) {
+    if ((cpi->sf.partition_search_type == SEARCH_PARTITION &&
+         cpi->sf.use_lastframe_partitioning) ||
+        cpi->sf.partition_search_type == FIXED_PARTITION ||
+        cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION) {
       const int idx_str = cm->mode_info_stride * mi_row + mi_col;
       MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
       MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
 
       cpi->mb.source_variance = UINT_MAX;
-      if (cpi->sf.use_one_partition_size_always) {
+      if (cpi->sf.partition_search_type == FIXED_PARTITION) {
         set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-        set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
+        set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
+                         cpi->sf.always_this_block_size);
+        rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+                         &dummy_rate, &dummy_dist, 1);
+      } else if (cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION ||
+                 cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+        // TODO(debargha): Implement VAR_BASED_PARTITION as a separate case.
+        // Currently both VAR_BASED_FIXED_PARTITION/VAR_BASED_PARTITION
+        // map to the same thing.
+        BLOCK_SIZE bsize;
+        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+        bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
+        set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
         rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1);
       } else {
@@ -2272,78 +2323,62 @@
   return ((b32i & 1) << 2) + ((b16i & 1) << 1) + (b8i & 1);
 }
 
-static void rtc_use_partition(VP9_COMP *cpi,
-                             const TileInfo *const tile,
-                             MODE_INFO **mi_8x8,
-                             TOKENEXTRA **tp, int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, int *rate, int64_t *dist,
-                             int do_recon) {
+static void nonrd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
+                                TOKENEXTRA **tp, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize, int *rate, int64_t *dist) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const int mis = cm->mode_info_stride;
-  int mi_width = num_8x8_blocks_wide_lookup[cpi->sf.always_this_block_size];
-  int mi_height = num_8x8_blocks_high_lookup[cpi->sf.always_this_block_size];
+  int mis = cm->mode_info_stride;
+  int br, bc;
   int i, j;
   int chosen_rate = INT_MAX;
-  int64_t chosen_dist = INT_MAX;
+  int64_t chosen_dist = INT64_MAX;
   MB_PREDICTION_MODE mode = DC_PRED;
-  int row8x8_remaining = tile->mi_row_end - mi_row;
-  int col8x8_remaining = tile->mi_col_end - mi_col;
-  int b32i;
-  for (b32i = 0; b32i < 4; b32i++) {
-    int b16i;
-    for (b16i = 0; b16i < 4; b16i++) {
-      int b8i;
-      int block_row = get_block_row(b32i, b16i, 0);
-      int block_col = get_block_col(b32i, b16i, 0);
-      int index = block_row * mis + block_col;
-      int rate;
-      int64_t dist;
+  int rows = MIN(MI_BLOCK_SIZE, tile->mi_row_end - mi_row);
+  int cols = MIN(MI_BLOCK_SIZE, tile->mi_col_end - mi_col);
 
-      // Find a partition size that fits
-      bsize = find_partition_size(cpi->sf.always_this_block_size,
-                                  (row8x8_remaining - block_row),
-                                  (col8x8_remaining - block_col),
-                                  &mi_height, &mi_width);
-      mi_8x8[index] = mi_8x8[0] + index;
+  int bw = num_8x8_blocks_wide_lookup[bsize];
+  int bh = num_8x8_blocks_high_lookup[bsize];
 
-      set_mi_row_col(xd, tile, mi_row + block_row, mi_height,
-                     mi_col + block_col, mi_width, cm->mi_rows, cm->mi_cols);
+  int brate = 0;
+  int64_t bdist = 0;
+  *rate = 0;
+  *dist = 0;
 
-      xd->mi_8x8 = mi_8x8 + index;
+  // Pick a prediction mode for each block; bsize is shrunk to fit the tile.
+  for (br = 0; br < rows; br += bh) {
+    for (bc = 0; bc < cols; bc += bw) {
+      int row = mi_row + br;
+      int col = mi_col + bc;
 
-      if (cm->frame_type != KEY_FRAME) {
-        set_offsets(cpi, tile, mi_row + block_row, mi_col + block_col, bsize);
+      BLOCK_SIZE bs = find_partition_size(bsize, rows - br, cols - bc,
+                                          &bh, &bw);
+      set_offsets(cpi, tile, row, col, bs);
 
-        vp9_pick_inter_mode(cpi, x, tile,
-                            mi_row + block_row, mi_col + block_col,
-                            &rate, &dist, bsize);
-      } else {
-        set_mode_info(&mi_8x8[index]->mbmi, bsize, mode,
-                      mi_row + block_row, mi_col + block_col);
-      }
+      if (cm->frame_type != KEY_FRAME)
+        vp9_pick_inter_mode(cpi, x, tile, row, col, &brate, &bdist, bs);
+      else
+        set_mode_info(&xd->mi_8x8[0]->mbmi, bs, mode, row, col);
 
-      for (j = 0; j < mi_height; j++)
-        for (i = 0; i < mi_width; i++)
-          if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > i
-            && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > j) {
-            mi_8x8[index+ i + j * mis] = mi_8x8[index];
-          }
+      *rate += brate;
+      *dist += bdist;
 
-      for (b8i = 0; b8i < 4; b8i++) {
-      }
+      for (j = 0; j < bh; ++j)
+        for (i = 0; i < bw; ++i) {
+          xd->mi_8x8[j * mis + i] = xd->mi_8x8[0];
+        }
     }
   }
-  encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64);
 
   *rate = chosen_rate;
   *dist = chosen_dist;
+
+  encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64);
 }
 
-static void encode_rtc_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
-                              int mi_row, TOKENEXTRA **tp) {
-  VP9_COMMON * const cm = &cpi->common;
+static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+                                int mi_row, TOKENEXTRA **tp) {
   int mi_col;
 
   // Initialize the left context for the new SB row
@@ -2356,14 +2391,25 @@
     int dummy_rate;
     int64_t dummy_dist;
 
-    const int idx_str = cm->mode_info_stride * mi_row + mi_col;
-    MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
-
     cpi->mb.source_variance = UINT_MAX;
-    set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-    set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
-    rtc_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                     &dummy_rate, &dummy_dist, 1);
+
+    if (cpi->sf.partition_search_type == FIXED_PARTITION) {
+      nonrd_use_partition(cpi, tile, tp, mi_row, mi_col,
+                          cpi->sf.always_this_block_size,
+                          &dummy_rate, &dummy_dist);
+    } else if (cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION ||
+               cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+      // TODO(debargha): Implement VAR_BASED_PARTITION as a separate case.
+      // Currently both VAR_BASED_FIXED_PARTITION/VAR_BASED_PARTITION
+      // map to the same thing.
+      BLOCK_SIZE bsize = get_nonrd_var_based_fixed_partition(cpi,
+                                                             mi_row,
+                                                             mi_col);
+      nonrd_use_partition(cpi, tile, tp, mi_row, mi_col,
+                          bsize, &dummy_rate, &dummy_dist);
+    } else {
+      assert(0);
+    }
   }
 }
 // end RTC play code
@@ -2419,6 +2465,22 @@
 
   set_prev_mi(cm);
 
+  if (cpi->sf.use_nonrd_pick_mode) {
+    // Initialize internal buffer pointers for rtc coding, where non-RD
+    // mode decision is used and hence no buffer pointer swap is needed.
+    int i;
+    struct macroblock_plane *const p = x->plane;
+    struct macroblockd_plane *const pd = xd->plane;
+    PICK_MODE_CONTEXT *ctx = &cpi->mb.sb64_context;
+
+    for (i = 0; i < MAX_MB_PLANE; ++i) {
+      p[i].coeff = ctx->coeff_pbuf[i][0];
+      p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+      pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+      p[i].eobs = ctx->eobs_pbuf[i][0];
+    }
+  }
+
   {
     struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
@@ -2438,11 +2500,11 @@
           // For each row of SBs in the frame
           vp9_tile_init(&tile, cm, tile_row, tile_col);
           for (mi_row = tile.mi_row_start;
-               mi_row < tile.mi_row_end; mi_row += 8) {
-            if (cpi->sf.use_pick_mode)
-              encode_rtc_sb_row(cpi, &tile, mi_row, &tp);
+               mi_row < tile.mi_row_end; mi_row += MI_BLOCK_SIZE) {
+            if (cpi->sf.use_nonrd_pick_mode)
+              encode_nonrd_sb_row(cpi, &tile, mi_row, &tp);
             else
-              encode_sb_row(cpi, &tile, mi_row, &tp);
+              encode_rd_sb_row(cpi, &tile, mi_row, &tp);
           }
           cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -2702,9 +2764,10 @@
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
+
   x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
                    (cpi->oxcf.aq_mode != COMPLEXITY_AQ) &&
-                   !cpi->sf.use_pick_mode;
+                   !cpi->sf.use_nonrd_pick_mode;
   x->skip_optimize = ctx->is_coded;
   ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
@@ -2741,6 +2804,7 @@
       vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane);
     if (output_enabled)
       sum_intra_stats(&cm->counts, mi);
+    vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   } else {
     int ref;
     const int is_compound = has_second_ref(mbmi);
@@ -2750,19 +2814,17 @@
       setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf);
     }
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
-  }
 
-  if (!is_inter_block(mbmi)) {
-    vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
-  } else if (!x->skip) {
-    mbmi->skip = 1;
-    vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
-    vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
-  } else {
-    mbmi->skip = 1;
-    if (output_enabled)
-      cm->counts.skip[vp9_get_skip_context(xd)][1]++;
-    reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
+    if (!x->skip) {
+      mbmi->skip = 1;
+      vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
+      vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+    } else {
+      mbmi->skip = 1;
+      if (output_enabled)
+        cm->counts.skip[vp9_get_skip_context(xd)][1]++;
+      reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
+    }
   }
 
   if (output_enabled) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 8770107..513730e 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -36,22 +36,22 @@
 };
 
 void vp9_subtract_block_c(int rows, int cols,
-                          int16_t *diff_ptr, ptrdiff_t diff_stride,
-                          const uint8_t *src_ptr, ptrdiff_t src_stride,
-                          const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+                          int16_t *diff, ptrdiff_t diff_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          const uint8_t *pred, ptrdiff_t pred_stride) {
   int r, c;
 
   for (r = 0; r < rows; r++) {
     for (c = 0; c < cols; c++)
-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+      diff[c] = src[c] - pred[c];
 
-    diff_ptr += diff_stride;
-    pred_ptr += pred_stride;
-    src_ptr  += src_stride;
+    diff += diff_stride;
+    pred += pred_stride;
+    src  += src_stride;
   }
 }
 
-static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
@@ -62,22 +62,6 @@
                      pd->dst.buf, pd->dst.stride);
 }
 
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  subtract_plane(x, bsize, 0);
-}
-
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  int i;
-
-  for (i = 1; i < MAX_MB_PLANE; i++)
-    subtract_plane(x, bsize, i);
-}
-
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  vp9_subtract_sby(x, bsize);
-  vp9_subtract_sbuv(x, bsize);
-}
-
 #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
 typedef struct vp9_token_state vp9_token_state;
 
@@ -121,19 +105,18 @@
   return pt;
 }
 
-static void optimize_b(MACROBLOCK *mb,
-                       int plane, int block, BLOCK_SIZE plane_bsize,
-                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       TX_SIZE tx_size) {
+static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
+                       TX_SIZE tx_size, MACROBLOCK *mb,
+                       struct optimize_ctx *ctx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *p = &mb->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
-  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block);
-  int16_t *qcoeff_ptr;
-  int16_t *dqcoeff_ptr;
+  const int16_t *coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   int eob = p->eobs[block], final_eob, sz = 0;
   const int i0 = 0;
   int rc, x, next, i;
@@ -143,7 +126,6 @@
   PLANE_TYPE type = pd->plane_type;
   int err_mult = plane_rd_mult[type];
   const int default_eob = 16 << (tx_size << 1);
-
   const int mul = 1 + (tx_size == TX_32X32);
   uint8_t token_cache[1024];
   const int16_t *dequant_ptr = pd->dequant;
@@ -151,10 +133,13 @@
   const scan_order *so = get_scan(xd, tx_size, type, block);
   const int16_t *scan = so->scan;
   const int16_t *nb = so->neighbors;
+  ENTROPY_CONTEXT *a, *l;
+  int tx_x, tx_y;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &tx_x, &tx_y);
+  a = &ctx->ta[plane][tx_x];
+  l = &ctx->tl[plane][tx_y];
 
   assert((!type && !plane) || (type && plane));
-  dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
-  qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
   assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
@@ -172,13 +157,13 @@
   next = eob;
   for (i = 0; i < eob; i++)
     token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
-        qcoeff_ptr[scan[i]]].token];
+        qcoeff[scan[i]]].token];
 
   for (i = eob; i-- > i0;) {
     int base_bits, d2, dx;
 
     rc = scan[i];
-    x = qcoeff_ptr[rc];
+    x = qcoeff[rc];
     /* Only add a trellis state for non-zero coefficients. */
     if (x) {
       int shortcut = 0;
@@ -203,7 +188,7 @@
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
       base_bits = *(vp9_dct_value_cost_ptr + x);
-      dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]);
+      dx = mul * (dqcoeff[rc] - coeff[rc]);
       d2 = dx * dx;
       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
       tokens[i][0].error = d2 + (best ? error1 : error0);
@@ -216,8 +201,8 @@
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
 
-      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) &&
-          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul +
+      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
+          (abs(x)*dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
                                          dequant_ptr[rc != 0]))
         shortcut = 1;
       else
@@ -306,16 +291,16 @@
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
   final_eob = i0 - 1;
-  vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2)));
-  vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2)));
+  vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
+  vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
   for (i = next; i < eob; i = next) {
     x = tokens[i][best].qc;
     if (x) {
       final_eob = i;
     }
     rc = scan[i];
-    qcoeff_ptr[rc] = x;
-    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul;
+    qcoeff[rc] = x;
+    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
 
     next = tokens[i][best].next;
     best = best_index[i][best];
@@ -326,58 +311,39 @@
   *a = *l = (final_eob > 0);
 }
 
-void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
-                    TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) {
-  int x, y;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-  optimize_b(mb, plane, block, plane_bsize,
-             &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size);
+static INLINE void fdct32x32(int rd_transform,
+                             const int16_t *src, int16_t *dst, int src_stride) {
+  if (rd_transform)
+    vp9_fdct32x32_rd(src, dst, src_stride);
+  else
+    vp9_fdct32x32(src, dst, src_stride);
 }
 
-static void optimize_init_b(int plane, BLOCK_SIZE bsize,
-                            struct encode_b_args *args) {
-  const MACROBLOCKD *xd = &args->x->e_mbd;
-  const struct macroblockd_plane* const pd = &xd->plane[plane];
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-  const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
-
-  vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane],
-                           pd->above_context, pd->left_context,
-                           num_4x4_w, num_4x4_h);
-}
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *const p = &x->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const scan_order *scan_order;
-  uint16_t *eob = &p->eobs[block];
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   int i, j;
-  int16_t *src_diff;
+  const int16_t *src_diff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
   switch (tx_size) {
     case TX_32X32:
-      scan_order = &vp9_default_scan_orders[TX_32X32];
-      if (x->use_lp32x32fdct)
-        vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
-      else
-        vp9_fdct32x32(src_diff, coeff, diff_stride);
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, p->zbin_extra, eob, scan_order->scan,
                            scan_order->iscan);
       break;
     case TX_16X16:
-      scan_order = &vp9_default_scan_orders[TX_16X16];
       vp9_fdct16x16(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
@@ -385,7 +351,6 @@
                      scan_order->scan, scan_order->iscan);
       break;
     case TX_8X8:
-      scan_order = &vp9_default_scan_orders[TX_8X8];
       vp9_fdct8x8(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
@@ -393,7 +358,6 @@
                      scan_order->scan, scan_order->iscan);
       break;
     case TX_4X4:
-      scan_order = &vp9_default_scan_orders[TX_4X4];
       x->fwd_txm4x4(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
@@ -432,7 +396,7 @@
     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
   if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
-    vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
+    optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
   } else {
     ctx->ta[plane][i] = p->eobs[block] > 0;
     ctx->tl[plane][j] = p->eobs[block] > 0;
@@ -466,8 +430,7 @@
 }
 static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size, void *arg) {
-  struct encode_b_args *const args = arg;
-  MACROBLOCK *const x = args->x;
+  MACROBLOCK *const x = (MACROBLOCK *)arg;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -479,24 +442,14 @@
 
   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
-  if (p->eobs[block] == 0)
-    return;
-
-  xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+  if (p->eobs[block] > 0)
+    xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 }
 
-void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct optimize_ctx ctx;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
-
-  vp9_subtract_sby(x, bsize);
-  if (x->optimize)
-    optimize_init_b(0, bsize, &arg);
-
-  vp9_foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1,
-                                         &arg);
+void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  vp9_subtract_plane(x, bsize, 0);
+  vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+                                         encode_block_pass1, x);
 }
 
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -504,17 +457,22 @@
   struct optimize_ctx ctx;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   struct encode_b_args arg = {x, &ctx, &mbmi->skip};
+  int plane;
 
-  if (!x->skip_recode)
-    vp9_subtract_sb(x, bsize);
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    if (!x->skip_recode)
+      vp9_subtract_plane(x, bsize, plane);
 
-  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; ++i)
-      optimize_init_b(i, bsize, &arg);
+    if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+      const struct macroblockd_plane* const pd = &xd->plane[plane];
+      const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
+      vp9_get_entropy_contexts(bsize, tx_size, pd,
+                               ctx.ta[plane], ctx.tl[plane]);
+    }
+
+    vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+                                           &arg);
   }
-
-  vp9_foreach_transformed_block(xd, bsize, encode_block, &arg);
 }
 
 static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -536,14 +494,16 @@
   uint8_t *src, *dst;
   int16_t *src_diff;
   uint16_t *eob = &p->eobs[block];
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
   int i, j;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  dst = &pd->dst.buf[4 * (j * pd->dst.stride + i)];
-  src = &p->src.buf[4 * (j * p->src.stride + i)];
+  dst = &pd->dst.buf[4 * (j * dst_stride + i)];
+  src = &p->src.buf[4 * (j * src_stride + i)];
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
   // if (x->optimize)
-  // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
+  //   optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
 
   switch (tx_size) {
     case TX_32X32:
@@ -551,22 +511,19 @@
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
                               x->skip_encode ? src : dst,
-                              x->skip_encode ? p->src.stride : pd->dst.stride,
-                              dst, pd->dst.stride, i, j, plane);
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
         vp9_subtract_block(32, 32, src_diff, diff_stride,
-                           src, p->src.stride, dst, pd->dst.stride);
-        if (x->use_lp32x32fdct)
-          vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
-        else
-          vp9_fdct32x32(src_diff, coeff, diff_stride);
+                           src, src_stride, dst, dst_stride);
+        fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
         vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                              p->quant, p->quant_shift, qcoeff, dqcoeff,
                              pd->dequant, p->zbin_extra, eob, scan_order->scan,
                              scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
-        vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_16X16:
       tx_type = get_tx_type_16x16(pd->plane_type, xd);
@@ -574,11 +531,11 @@
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
                               x->skip_encode ? src : dst,
-                              x->skip_encode ? p->src.stride : pd->dst.stride,
-                              dst, pd->dst.stride, i, j, plane);
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
         vp9_subtract_block(16, 16, src_diff, diff_stride,
-                           src, p->src.stride, dst, pd->dst.stride);
+                           src, src_stride, dst, dst_stride);
         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                        p->quant, p->quant_shift, qcoeff, dqcoeff,
@@ -586,7 +543,7 @@
                        scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
-        vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_8X8:
       tx_type = get_tx_type_8x8(pd->plane_type, xd);
@@ -594,11 +551,11 @@
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
                               x->skip_encode ? src : dst,
-                              x->skip_encode ? p->src.stride : pd->dst.stride,
-                              dst, pd->dst.stride, i, j, plane);
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
         vp9_subtract_block(8, 8, src_diff, diff_stride,
-                           src, p->src.stride, dst, pd->dst.stride);
+                           src, src_stride, dst, dst_stride);
         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
                        p->quant_shift, qcoeff, dqcoeff,
@@ -606,24 +563,20 @@
                        scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
-        vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_4X4:
       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
       scan_order = &vp9_scan_orders[TX_4X4][tx_type];
-      if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
-        mode = xd->mi_8x8[0]->bmi[block].as_mode;
-      else
-        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-
+      mode = plane == 0 ? get_y_mode(xd->mi_8x8[0], block) : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
                               x->skip_encode ? src : dst,
-                              x->skip_encode ? p->src.stride : pd->dst.stride,
-                              dst, pd->dst.stride, i, j, plane);
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
 
       if (!x->skip_recode) {
         vp9_subtract_block(4, 4, src_diff, diff_stride,
-                           src, p->src.stride, dst, pd->dst.stride);
+                           src, src_stride, dst, dst_stride);
         if (tx_type != DCT_DCT)
           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
         else
@@ -639,9 +592,9 @@
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
           // which is significant (not just an optimization) for the lossless
           // case.
-          xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob);
+          xd->itxm_add(dqcoeff, dst, dst_stride, *eob);
         else
-          vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type);
+          vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
       }
       break;
     default:
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 515935f..dcf6e87 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -21,14 +21,12 @@
 #endif
 
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
 
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
 void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index af710a8..5079699 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -224,35 +224,29 @@
   }
 }
 
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
-                              const nmv_context* const mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h) {
-  vp9_clear_system_state();
-  vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree);
-  if (mvc_flag_v)
-    build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp);
-  if (mvc_flag_h)
-    build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context* ctx, int usehp) {
+  vp9_cost_tokens(mvjoint, ctx->joints, vp9_mv_joint_tree);
+  build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp);
+  build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
 }
 
-static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound,
+static void inc_mvs(const int_mv mv[2], const MV ref[2], int is_compound,
                     nmv_context_counts *counts) {
   int i;
   for (i = 0; i < 1 + is_compound; ++i) {
-    const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row,
-                      mv[i].as_mv.col - ref[i].as_mv.col };
+    const MV diff = { mv[i].as_mv.row - ref[i].row,
+                      mv[i].as_mv.col - ref[i].col };
     vp9_inc_mv(&diff, counts);
   }
 }
 
-void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) {
-  MODE_INFO *mi = x->e_mbd.mi_8x8[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd,
+                         const MV best_ref_mv[2]) {
+  const MODE_INFO *mi = xd->mi_8x8[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const int is_compound = has_second_ref(mbmi);
-  nmv_context_counts *counts = &cpi->common.counts.mv;
+  nmv_context_counts *counts = &cm->counts.mv;
 
   if (mbmi->sb_type < BLOCK_8X8) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h
index f0463bbd..f16b2c1 100644
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -25,14 +25,11 @@
 void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp);
 
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
-                              const nmv_context* const mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h);
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context* mvctx, int usehp);
 
-void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]);
+void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd,
+                         const MV best_ref_mv[2]);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index bf9dd3e..32ed969 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -65,7 +65,7 @@
 
   double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
 
-  for (i = 0; i < QINDEX_RANGE; i++) {
+  for (i = 0; i < QINDEX_RANGE; ++i) {
     if (target_q <= vp9_convert_qindex_to_q(i)) {
       ret_val = i;
       break;
@@ -106,12 +106,12 @@
 }
 
 
-// Read frame stats at an offset from the current position
+// Read frame stats at an offset from the current position.
 static int read_frame_stats(const struct twopass_rc *p,
                             FIRSTPASS_STATS *frame_stats, int offset) {
   const FIRSTPASS_STATS *fps_ptr = p->stats_in;
 
-  // Check legality of offset
+  // Check legality of offset.
   if (offset >= 0) {
     if (&fps_ptr[offset] >= p->stats_in_end)
       return EOF;
@@ -144,7 +144,6 @@
 
 // TEMP debug code
 #if OUTPUT_FPF
-
   {
     FILE *fpfile;
     fpfile = fopen("firstpass.stt", "a");
@@ -377,7 +376,6 @@
   const int src_stride = x->plane[0].src.stride;
   const uint8_t *const ref = xd->plane[0].pre[0].buf;
   const int ref_stride = xd->plane[0].pre[0].stride;
-
   unsigned int sse;
   vp9_variance_fn_t fn = get_block_variance_fn(xd->mi_8x8[0]->mbmi.sb_type);
   fn(src, src_stride, ref, ref_stride, &sse);
@@ -398,18 +396,18 @@
   int new_mv_mode_penalty = 256;
   const int quart_frm = MIN(cpi->common.width, cpi->common.height);
 
-  // refine the motion search range accroding to the frame dimension
-  // for first pass test
+  // Refine the motion search range according to the frame dimension
+  // for first pass test.
   while ((quart_frm << sr) < MAX_FULL_PEL_VAL)
-    sr++;
+    ++sr;
 
   step_param += sr;
   further_steps -= sr;
 
-  // override the default variance function to use MSE
+  // Override the default variance function to use MSE.
   v_fn_ptr.vf = get_block_variance_fn(bsize);
 
-  // Initial step/diamond search centred on best mv
+  // Center the initial step/diamond search on best mv.
   tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
                                     step_param,
                                     x->sadperbit16, &num00, &v_fn_ptr,
@@ -424,15 +422,15 @@
     best_mv->col = tmp_mv.col;
   }
 
-  // Further step/diamond searches as necessary
+  // Carry out further step/diamond searches as necessary.
   n = num00;
   num00 = 0;
 
   while (n < further_steps) {
-    n++;
+    ++n;
 
     if (num00) {
-      num00--;
+      --num00;
     } else {
       tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
                                         step_param + n, x->sadperbit16,
@@ -497,14 +495,14 @@
   struct twopass_rc *const twopass = &cpi->twopass;
   const MV zero_mv = {0, 0};
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
   setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
   setup_dst_planes(xd, new_yv12, 0, 0);
 
   xd->mi_8x8 = cm->mi_grid_visible;
-  xd->mi_8x8[0] = cm->mi;  // required for vp9_frame_init_quantizer
+  xd->mi_8x8[0] = cm->mi;
 
   vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
@@ -521,34 +519,32 @@
   vp9_init_mv_probs(cm);
   vp9_initialize_rd_consts(cpi);
 
-  // tiling is ignored in the first pass
+  // Tiling is ignored in the first pass.
   vp9_tile_init(&tile, cm, 0, 0);
 
-  // for each macroblock row in image
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
     int_mv best_ref_mv;
 
     best_ref_mv.as_int = 0;
 
-    // reset above block coeffs
+    // Reset above block coeffs.
     xd->up_available = (mb_row != 0);
     recon_yoffset = (mb_row * recon_y_stride * 16);
     recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
 
     // Set up limit values for motion vectors to prevent them extending
-    // outside the UMV borders
+    // outside the UMV borders.
     x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
     x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
                     + BORDER_MV_PIXELS_B16;
 
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
       int this_error;
       const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
       double error_weight = 1.0;
       const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
 
-      vp9_clear_system_state();  // __asm emms;
+      vp9_clear_system_state();
 
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
@@ -566,15 +562,15 @@
         error_weight = vp9_vaq_inv_q_ratio(energy);
       }
 
-      // do intra 16x16 prediction
+      // Do intra 16x16 prediction.
       this_error = vp9_encode_intra(x, use_dc_pred);
       if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-        vp9_clear_system_state();  // __asm emms;
-        this_error *= error_weight;
+        vp9_clear_system_state();
+        this_error = (int)(this_error * error_weight);
       }
 
-      // intrapenalty below deals with situations where the intra and inter
-      // error scores are very low (eg a plain black frame).
+      // Intrapenalty below deals with situations where the intra and inter
+      // error scores are very low (e.g. a plain black frame).
       // We do not have special cases in first pass for 0,0 and nearest etc so
       // all inter modes carry an overhead cost estimate for the mv.
       // When the error score is very low this causes us to pick all or lots of
@@ -582,7 +578,7 @@
       // This penalty adds a cost matching that of a 0,0 mv to the intra case.
       this_error += intrapenalty;
 
-      // Cumulative intra error total
+      // Accumulate the intra error.
       intra_error += (int64_t)this_error;
 
       // Set up limit values for motion vectors to prevent them extending
@@ -590,23 +586,23 @@
       x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
       x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
 
-      // Other than for the first frame do a motion search
+      // Other than for the first frame do a motion search.
       if (cm->current_video_frame > 0) {
         int tmp_err, motion_error;
         int_mv mv, tmp_mv;
 
         xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
         motion_error = zz_motion_search(cpi, x);
-        // Simple 0,0 motion with no mv overhead
+        // Assume 0,0 motion with no mv overhead.
         mv.as_int = tmp_mv.as_int = 0;
 
         // Test last reference frame using the previous best mv as the
-        // starting point (best reference) for the search
+        // starting point (best reference) for the search.
         first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
                                  &motion_error);
         if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-          vp9_clear_system_state();  // __asm emms;
-          motion_error *= error_weight;
+          vp9_clear_system_state();
+          motion_error = (int)(motion_error * error_weight);
         }
 
         // If the current best reference mv is not centered on 0,0 then do a 0,0
@@ -616,8 +612,8 @@
           first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
                                    &tmp_err);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-            vp9_clear_system_state();  // __asm emms;
-            tmp_err *= error_weight;
+            vp9_clear_system_state();
+            tmp_err = (int)(tmp_err * error_weight);
           }
 
           if (tmp_err < motion_error) {
@@ -626,9 +622,9 @@
           }
         }
 
-        // Experimental search in an older reference frame
+        // Search in an older reference frame.
         if (cm->current_video_frame > 1) {
-          // Simple 0,0 motion with no mv overhead
+          // Assume 0,0 motion with no mv overhead.
           int gf_motion_error;
 
           xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
@@ -637,22 +633,22 @@
           first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
                                    &gf_motion_error);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-            vp9_clear_system_state();  // __asm emms;
-            gf_motion_error *= error_weight;
+            vp9_clear_system_state();
+            gf_motion_error = (int)(gf_motion_error * error_weight);
           }
 
           if (gf_motion_error < motion_error && gf_motion_error < this_error)
-            second_ref_count++;
+            ++second_ref_count;
 
-          // Reset to last frame as reference buffer
+          // Reset to last frame as reference buffer.
           xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
           xd->plane[1].pre[0].buf = lst_yv12->u_buffer + recon_uvoffset;
           xd->plane[2].pre[0].buf = lst_yv12->v_buffer + recon_uvoffset;
 
-          // In accumulating a score for the older reference frame
-          // take the best of the motion predicted score and
-          // the intra coded error (just as will be done for)
-          // accumulation of "coded_error" for the last frame.
+          // In accumulating a score for the older reference frame take the
+          // best of the motion predicted score and the intra coded error
+          // (just as will be done for accumulation of "coded_error" for
+          // the last frame).
           if (gf_motion_error < this_error)
             sr_coded_error += gf_motion_error;
           else
@@ -660,17 +656,16 @@
         } else {
           sr_coded_error += motion_error;
         }
-        /* Intra assumed best */
+        // Start by assuming that intra mode is best.
         best_ref_mv.as_int = 0;
 
         if (motion_error <= this_error) {
-          // Keep a count of cases where the inter and intra were
-          // very close and very low. This helps with scene cut
-          // detection for example in cropped clips with black bars
-          // at the sides or top and bottom.
+          // Keep a count of cases where the inter and intra were very close
+          // and very low. This helps with scene cut detection for example in
+          // cropped clips with black bars at the sides or top and bottom.
           if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
               this_error < 2 * intrapenalty)
-            neutral_count++;
+            ++neutral_count;
 
           mv.as_mv.row *= 8;
           mv.as_mv.col *= 8;
@@ -680,50 +675,49 @@
           xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
           xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
           vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
-          vp9_encode_sby(x, bsize);
+          vp9_encode_sby_pass1(x, bsize);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
           sum_mvc_abs += abs(mv.as_mv.col);
           sum_mvrs += mv.as_mv.row * mv.as_mv.row;
           sum_mvcs += mv.as_mv.col * mv.as_mv.col;
-          intercount++;
+          ++intercount;
 
           best_ref_mv.as_int = mv.as_int;
 
-          // Was the vector non-zero
           if (mv.as_int) {
-            mvcount++;
+            ++mvcount;
 
-            // Was it different from the last non zero vector
+            // Non-zero vector: was it different from the last non-zero vector?
             if (mv.as_int != lastmv_as_int)
-              new_mv_count++;
+              ++new_mv_count;
             lastmv_as_int = mv.as_int;
 
-            // Does the Row vector point inwards or outwards
+            // Does the row vector point inwards or outwards?
             if (mb_row < cm->mb_rows / 2) {
               if (mv.as_mv.row > 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
               else if (mv.as_mv.row < 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
             } else if (mb_row > cm->mb_rows / 2) {
               if (mv.as_mv.row > 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
               else if (mv.as_mv.row < 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
             }
 
-            // Does the Row vector point inwards or outwards
+            // Does the col vector point inwards or outwards?
             if (mb_col < cm->mb_cols / 2) {
               if (mv.as_mv.col > 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
               else if (mv.as_mv.col < 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
             } else if (mb_col > cm->mb_cols / 2) {
               if (mv.as_mv.col > 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
               else if (mv.as_mv.col < 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
             }
           }
         }
@@ -732,7 +726,7 @@
       }
       coded_error += (int64_t)this_error;
 
-      // adjust to the next column of macroblocks
+      // Adjust to the next column of MBs.
       x->plane[0].src.buf += 16;
       x->plane[1].src.buf += uv_mb_height;
       x->plane[2].src.buf += uv_mb_height;
@@ -741,24 +735,24 @@
       recon_uvoffset += uv_mb_height;
     }
 
-    // adjust to the next row of mbs
+    // Adjust to the next row of MBs.
     x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
     x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride -
                            uv_mb_height * cm->mb_cols;
     x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride -
                            uv_mb_height * cm->mb_cols;
 
-    vp9_clear_system_state();  // __asm emms;
+    vp9_clear_system_state();
   }
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
   {
     FIRSTPASS_STATS fps;
 
     fps.frame = cm->current_video_frame;
-    fps.intra_error = intra_error >> 8;
-    fps.coded_error = coded_error >> 8;
-    fps.sr_coded_error = sr_coded_error >> 8;
+    fps.intra_error = (double)(intra_error >> 8);
+    fps.coded_error = (double)(coded_error >> 8);
+    fps.sr_coded_error = (double)(sr_coded_error >> 8);
     fps.ssim_weighted_pred_err = fps.coded_error * simple_weight(cpi->Source);
     fps.count = 1.0;
     fps.pcnt_inter = (double)intercount / cm->MBs;
@@ -792,14 +786,14 @@
     // cpi->source_time_stamp.
     fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
 
-    // don't want to do output stats with a stack variable!
+    // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;
     output_stats(cpi, cpi->output_pkt_list, &twopass->this_frame_stats);
     accumulate_stats(&twopass->total_stats, &fps);
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
-  // the prediction is good enough... but also dont allow it to lag too far
+  // the prediction is good enough... but also don't allow it to lag too far.
   if ((twopass->sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
        (twopass->this_frame_stats.pcnt_inter > 0.20) &&
@@ -808,9 +802,9 @@
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     twopass->sr_update_lag = 1;
   } else {
-    twopass->sr_update_lag++;
+    ++twopass->sr_update_lag;
   }
-  // swap frame pointers so last frame refers to the frame we just compressed
+  // Swap frame pointers so last frame refers to the frame we just compressed.
   swap_yv12(lst_yv12, new_yv12);
 
   vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
@@ -820,7 +814,7 @@
   if (cm->current_video_frame == 0)
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
 
-  // use this to see what the first pass reconstruction looks like
+  // Use this to see what the first pass reconstruction looks like.
   if (0) {
     char filename[512];
     FILE *recon_file;
@@ -836,15 +830,11 @@
     fclose(recon_file);
   }
 
-  cm->current_video_frame++;
+  ++cm->current_video_frame;
 }
 
-// Estimate a cost per mb attributable to overheads such as the coding of
-// modes and motion vectors.
-// Currently simplistic in its assumptions for testing.
-//
-
-
+// Estimate a cost per mb attributable to overheads such as the coding of modes
+// and motion vectors. This currently makes simplistic assumptions for testing.
 static double bitcost(double prob) {
   return -(log(prob) / log(2.0));
 }
@@ -867,18 +857,17 @@
   motion_cost = bitcost(av_pct_motion);
   intra_cost = bitcost(av_intra);
 
-  // Estimate of extra bits per mv overhead for mbs
-  // << 9 is the normalization to the (bits * 512) used in vp9_rc_bits_per_mb
+  // Estimate the number of extra bits per mv overhead for mbs. We shift (<< 9)
+  // to match the scaling of number of bits by 512.
   mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
 
-  // Crude estimate of overhead cost from modes
-  // << 9 is the normalization to (bits * 512) used in vp9_rc_bits_per_mb
+  // Produce a crude estimate of the overhead cost from modes. We shift (<< 9)
+  // to match the scaling of number of bits by 512.
   mode_cost =
     (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
            (av_pct_motion * motion_cost) +
            (av_intra * intra_cost)) * cpi->common.MBs) << 9;
 
-  // return mv_cost + mode_cost;
   // TODO(paulwilkins): Fix overhead costs for extended Q range.
 #endif
   return 0;
@@ -895,7 +884,7 @@
   const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.0125 + pt_low,
                                 pt_high);
 
-  // Calculate correction factor
+  // Calculate correction factor.
   if (power_term < 1.0)
     assert(error_term >= 0.0);
 
@@ -921,7 +910,7 @@
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
-  for (q = rc->best_quality; q < rc->worst_quality; q++) {
+  for (q = rc->best_quality; q < rc->worst_quality; ++q) {
     const double err_correction_factor = calc_correction_factor(err_per_mb,
                                              ERR_DIVISOR, 0.5, 0.90, q);
     const int bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
@@ -954,11 +943,11 @@
   twopass->total_stats = *twopass->stats_in_end;
   twopass->total_left_stats = twopass->total_stats;
 
-  // each frame can have a different duration, as the frame rate in the source
-  // isn't guaranteed to be constant.   The frame rate prior to the first frame
-  // encoded in the second pass is a guess.  However the sum duration is not.
-  // Its calculated based on the actual durations of all frames from the first
-  // pass.
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
   vp9_new_framerate(cpi, 10000000.0 * twopass->total_stats.count /
                         twopass->total_stats.duration);
 
@@ -969,18 +958,18 @@
   // Calculate a minimum intra value to be used in determining the IIratio
   // scores used in the second pass. We have this minimum to make sure
   // that clips that are static but "low complexity" in the intra domain
-  // are still boosted appropriately for KF/GF/ARF
+  // are still boosted appropriately for KF/GF/ARF.
   twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
   twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
 
-  // This variable monitors how far behind the second ref update is lagging
+  // This variable monitors how far behind the second ref update is lagging.
   twopass->sr_update_lag = 1;
 
   // Scan the first pass file and calculate an average Intra / Inter error score
   // ratio for the sequence.
   {
     double sum_iiratio = 0.0;
-    start_pos = twopass->stats_in;  // Note the starting "file" position.
+    start_pos = twopass->stats_in;
 
     while (input_stats(twopass, &this_frame) != EOF) {
       const double iiratio = this_frame.intra_error /
@@ -991,7 +980,6 @@
     twopass->avg_iiratio = sum_iiratio /
         DOUBLE_DIVIDE_CHECK((double)twopass->total_stats.count);
 
-    // Reset file position
     reset_fpf_position(twopass, start_pos);
   }
 
@@ -1001,7 +989,7 @@
     double av_error = twopass->total_stats.ssim_weighted_pred_err /
                       DOUBLE_DIVIDE_CHECK(twopass->total_stats.count);
 
-    start_pos = twopass->stats_in;  // Note starting "file" position
+    start_pos = twopass->stats_in;
 
     twopass->modified_error_total = 0.0;
     twopass->modified_error_min =
@@ -1022,8 +1010,8 @@
 void vp9_end_second_pass(VP9_COMP *cpi) {
 }
 
-// This function gives and estimate of how badly we believe
-// the prediction quality is decaying from frame to frame.
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
 static double get_prediction_decay_rate(const VP9_COMMON *cm,
                                         const FIRSTPASS_STATS *next_frame) {
   // Look at the observed drop in prediction quality between the last frame
@@ -1056,9 +1044,8 @@
     FIRSTPASS_STATS *position = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_next_frame;
 
-    // Look ahead a few frames to see if static condition
-    // persists...
-    for (j = 0; j < still_interval; j++) {
+    // Look ahead a few frames to see if static condition persists...
+    for (j = 0; j < still_interval; ++j) {
       if (EOF == input_stats(&cpi->twopass, &tmp_next_frame))
         break;
 
@@ -1068,7 +1055,7 @@
 
     reset_fpf_position(&cpi->twopass, position);
 
-    // Only if it does do we signal a transition to still
+    // Only if it does do we signal a transition to still.
     if (j == still_interval)
       trans_to_still = 1;
   }
@@ -1078,7 +1065,7 @@
 
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
-// reflect this
+// reflect this.
 static int detect_flash(const struct twopass_rc *twopass, int offset) {
   FIRSTPASS_STATS next_frame;
 
@@ -1091,7 +1078,7 @@
     // brief break in prediction (such as a flash) but subsequent frames
     // are reasonably well predicted by an earlier (pre flash) frame.
     // The recovery after a flash is indicated by a high pcnt_second_ref
-    // comapred to pcnt_inter.
+    // compared to pcnt_inter.
     if (next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
         next_frame.pcnt_second_ref >= 0.5)
       flash_detected = 1;
@@ -1100,7 +1087,7 @@
   return flash_detected;
 }
 
-// Update the motion related elements to the GF arf boost calculation
+// Update the motion related elements to the GF arf boost calculation.
 static void accumulate_frame_motion_stats(
   FIRSTPASS_STATS *this_frame,
   double *this_frame_mv_in_out,
@@ -1112,13 +1099,13 @@
   // Accumulate motion stats.
   motion_pct = this_frame->pcnt_motion;
 
-  // Accumulate Motion In/Out of frame stats
+  // Accumulate Motion In/Out of frame stats.
   *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
   *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
   *abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * motion_pct);
 
   // Accumulate a measure of how uniform (or conversely how random)
-  // the motion field is. (A ratio of absmv / mv)
+  // the motion field is (a ratio of absmv / mv).
   if (motion_pct > 0.05) {
     const double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
                            DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
@@ -1141,7 +1128,7 @@
                                double this_frame_mv_in_out) {
   double frame_boost;
 
-  // Underlying boost factor is based on inter intra error ratio
+  // Underlying boost factor is based on inter intra error ratio.
   if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
     frame_boost = (IIFACTOR * this_frame->intra_error /
                    DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
@@ -1149,13 +1136,12 @@
     frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
                    DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
 
-  // Increase boost for frames where new data coming into frame
-  // (eg zoom out). Slightly reduce boost if there is a net balance
-  // of motion out of the frame (zoom in).
-  // The range for this_frame_mv_in_out is -1.0 to +1.0
+  // Increase boost for frames where new data is coming into the frame (e.g.
+  // zoom out). Slightly reduce boost if there is a net balance of motion out
+  // of the frame (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0.
   if (this_frame_mv_in_out > 0.0)
     frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
-  // In extreme case boost is halved
+  // In the extreme case the boost is halved.
   else
     frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
 
@@ -1177,12 +1163,12 @@
   int arf_boost;
   int flash_detected = 0;
 
-  // Search forward from the proposed arf/next gf position
-  for (i = 0; i < f_frames; i++) {
+  // Search forward from the proposed arf/next gf position.
+  for (i = 0; i < f_frames; ++i) {
     if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
-    // Update the motion related elements to the boost calculation
+    // Update the motion related elements to the boost calculation.
     accumulate_frame_motion_stats(&this_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
@@ -1193,7 +1179,7 @@
     flash_detected = detect_flash(twopass, i + offset) ||
                      detect_flash(twopass, i + offset + 1);
 
-    // Cumulative effect of prediction quality decay
+    // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
       decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
@@ -1206,7 +1192,7 @@
 
   *f_boost = (int)boost_score;
 
-  // Reset for backward looking loop
+  // Reset for backward looking loop.
   boost_score = 0.0;
   mv_ratio_accumulator = 0.0;
   decay_accumulator = 1.0;
@@ -1214,12 +1200,12 @@
   mv_in_out_accumulator = 0.0;
   abs_mv_in_out_accumulator = 0.0;
 
-  // Search backward towards last gf position
-  for (i = -1; i >= -b_frames; i--) {
+  // Search backward towards last gf position.
+  for (i = -1; i >= -b_frames; --i) {
     if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
-    // Update the motion related elements to the boost calculation
+    // Update the motion related elements to the boost calculation.
     accumulate_frame_motion_stats(&this_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
@@ -1230,7 +1216,7 @@
     flash_detected = detect_flash(twopass, i + offset) ||
                      detect_flash(twopass, i + offset + 1);
 
-    // Cumulative effect of prediction quality decay
+    // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
       decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
@@ -1280,8 +1266,7 @@
     return;
   }
 
-  // ARF Group: work out the ARF schedule.
-  // Mark ARF frames as negative.
+  // ARF Group: Work out the ARF schedule and mark ARF frames as negative.
   if (end < 0) {
     // printf("start:%d end:%d\n", -end, -end);
     // ARF frame is at the end of the range.
@@ -1404,14 +1389,14 @@
   double decay_accumulator = 1.0;
   double zero_motion_accumulator = 1.0;
 
-  double loop_decay_rate = 1.00;          // Starting decay rate
+  double loop_decay_rate = 1.00;
   double last_loop_decay_rate = 1.00;
 
   double this_frame_mv_in_out = 0.0;
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
   double mv_ratio_accumulator_thresh;
-  const int max_bits = frame_max_bits(cpi);     // Max for a single frame
+  const int max_bits = frame_max_bits(cpi);  // Max bits for a single frame.
 
   unsigned int allow_alt_ref = cpi->oxcf.play_alternate &&
                                cpi->oxcf.lag_in_frames;
@@ -1424,19 +1409,19 @@
 
   twopass->gf_group_bits = 0;
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   start_pos = twopass->stats_in;
 
   // Load stats for the current frame.
   mod_frame_err = calculate_modified_err(cpi, this_frame);
 
-  // Note the error of the frame at the start of the group (this will be
-  // the GF frame error if we code a normal gf
+  // Note the error of the frame at the start of the group. This will be
+  // the GF frame error if we code a normal gf.
   gf_first_frame_err = mod_frame_err;
 
   // If this is a key frame or the overlay from a previous arf then
-  // The error score / cost of this frame has already been accounted for.
+  // the error score / cost of this frame has already been accounted for.
   if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
     gf_group_err -= gf_first_frame_err;
 
@@ -1458,9 +1443,9 @@
 
   i = 0;
   while (i < twopass->static_scene_max_gf_interval && i < rc->frames_to_key) {
-    i++;    // Increment the loop counter
+    ++i;
 
-    // Accumulate error score of frames in this gf group
+    // Accumulate error score of frames in this gf group.
     mod_frame_err = calculate_modified_err(cpi, this_frame);
     gf_group_err += mod_frame_err;
 
@@ -1471,13 +1456,13 @@
     // quality back to an earlier frame is then restored.
     flash_detected = detect_flash(twopass, 0);
 
-    // Update the motion related elements to the boost calculation
+    // Update the motion related elements to the boost calculation.
     accumulate_frame_motion_stats(&next_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
                                   &mv_ratio_accumulator);
 
-    // Cumulative effect of prediction quality decay
+    // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
       last_loop_decay_rate = loop_decay_rate;
       loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
@@ -1490,8 +1475,8 @@
                                       next_frame.pcnt_motion;
       }
 
-      // Break clause to detect very still sections after motion
-      // (for example a static image after a fade or other transition).
+      // Break clause to detect very still sections after motion. For example,
+      // a static image after a fade or other transition.
       if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
                                      last_loop_decay_rate)) {
         allow_alt_ref = 0;
@@ -1499,16 +1484,16 @@
       }
     }
 
-    // Calculate a boost number for this frame
+    // Calculate a boost number for this frame.
     boost_score += (decay_accumulator *
        calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
 
     // Break out conditions.
     if (
-      // Break at cpi->max_gf_interval unless almost totally static
+      // Break at cpi->max_gf_interval unless almost totally static.
       (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||
       (
-        // Don't break out with a very short interval
+        // Don't break out with a very short interval.
         (i > MIN_GF_INTERVAL) &&
         ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
         (!flash_detected) &&
@@ -1527,10 +1512,10 @@
 
   twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
 
-  // Don't allow a gf too near the next kf
+  // Don't allow a gf too near the next kf.
   if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
     while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
-      i++;
+      ++i;
 
       if (EOF == input_stats(twopass, this_frame))
         break;
@@ -1560,14 +1545,14 @@
   else
     rc->baseline_gf_interval = i;
 
-  // Should we use the alternate reference frame
+  // Should we use the alternate reference frame?
   if (allow_alt_ref &&
       (i < cpi->oxcf.lag_in_frames) &&
       (i >= MIN_GF_INTERVAL) &&
-      // for real scene cuts (not forced kfs) dont allow arf very near kf.
+      // For real scene cuts (not forced kfs) don't allow arf very near kf.
       (rc->next_key_frame_forced ||
       (i <= (rc->frames_to_key - MIN_GF_INTERVAL)))) {
-    // Alternative boost calculation for alt ref
+    // Calculate the boost for alt ref.
     rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
                                    &b_boost);
     rc->source_alt_ref_pending = 1;
@@ -1629,7 +1614,7 @@
 #endif
 #endif
 
-  // Calculate the bits to be allocated to the group as a whole
+  // Calculate the bits to be allocated to the group as a whole.
   if (twopass->kf_group_bits > 0 && twopass->kf_group_error_left > 0) {
     twopass->gf_group_bits = (int64_t)(cpi->twopass.kf_group_bits *
                 (gf_group_err / cpi->twopass.kf_group_error_left));
@@ -1641,11 +1626,11 @@
      twopass->kf_group_bits : twopass->gf_group_bits;
 
   // Clip cpi->twopass.gf_group_bits based on user supplied data rate
-  // variability limit (cpi->oxcf.two_pass_vbrmax_section)
+  // variability limit, cpi->oxcf.two_pass_vbrmax_section.
   if (twopass->gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
     twopass->gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
 
-  // Reset the file position
+  // Reset the file position.
   reset_fpf_position(twopass, start_pos);
 
   // Assign  bits to the arf or gf.
@@ -1657,7 +1642,7 @@
 
     int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100;
 
-    // Set max and minimum boost and hence minimum allocation
+    // Set max and minimum boost and hence minimum allocation.
     boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
 
     if (rc->source_alt_ref_pending && i == 0)
@@ -1665,7 +1650,7 @@
     else
       allocation_chunks = (rc->baseline_gf_interval * 100) + (boost - 100);
 
-    // Prevent overflow
+    // Prevent overflow.
     if (boost > 1023) {
       int divisor = boost >> 10;
       boost /= divisor;
@@ -1673,13 +1658,13 @@
     }
 
     // Calculate the number of bits to be spent on the gf or arf based on
-    // the boost number
+    // the boost number.
     gf_bits = (int)((double)boost * (twopass->gf_group_bits /
                   (double)allocation_chunks));
 
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
-    // based on the error score of the frame itself
+    // based on the error score of the frame itself.
     if (rc->baseline_gf_interval < 1 ||
         mod_frame_err < gf_group_err / (double)rc->baseline_gf_interval) {
       double alt_gf_grp_bits = (double)twopass->kf_group_bits  *
@@ -1703,7 +1688,7 @@
         gf_bits = alt_gf_bits;
     }
 
-    // Dont allow a negative value for gf_bits
+    // Don't allow a negative value for gf_bits.
     if (gf_bits < 0)
       gf_bits = 0;
 
@@ -1713,27 +1698,27 @@
     if (i == 1 ||
         (!rc->source_alt_ref_pending &&
          cpi->common.frame_type != KEY_FRAME)) {
-      // Per frame bit target for this frame
+      // Calculate the per frame bit target for this frame.
       vp9_rc_set_frame_target(cpi, gf_bits);
     }
   }
 
   {
-    // Adjust KF group bits and error remaining
+    // Adjust KF group bits and error remaining.
     twopass->kf_group_error_left -= (int64_t)gf_group_err;
     twopass->kf_group_bits -= twopass->gf_group_bits;
 
     if (twopass->kf_group_bits < 0)
       twopass->kf_group_bits = 0;
 
-    // If this is an arf update we want to remove the score for the
-    // overlay frame at the end which will usually be very cheap to code.
-    // The overlay frame has already in effect been coded so we want to spread
-    // the remaining bits amoung the other frames/
+    // If this is an arf update we want to remove the score for the overlay
+    // frame at the end which will usually be very cheap to code.
+    // The overlay frame has already, in effect, been coded so we want to spread
+    // the remaining bits among the other frames.
     // For normal GFs remove the score for the GF itself unless this is
     // also a key frame in which case it has already been accounted for.
     if (rc->source_alt_ref_pending) {
-      twopass->gf_group_error_left = (int64_t)gf_group_err - mod_frame_err;
+      twopass->gf_group_error_left = (int64_t)(gf_group_err - mod_frame_err);
     } else if (cpi->common.frame_type != KEY_FRAME) {
       twopass->gf_group_error_left = (int64_t)(gf_group_err
                                                    - gf_first_frame_err);
@@ -1747,7 +1732,7 @@
       twopass->gf_group_bits = 0;
 
     // This condition could fail if there are two kfs very close together
-    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+    // despite MIN_GF_INTERVAL and would cause a divide by 0 in the
     // calculation of alt_extra_bits.
     if (rc->baseline_gf_interval >= 3) {
       const int boost = rc->source_alt_ref_pending ? b_boost : rc->gfu_boost;
@@ -1767,7 +1752,7 @@
     zero_stats(&sectionstats);
     reset_fpf_position(twopass, start_pos);
 
-    for (i = 0; i < rc->baseline_gf_interval; i++) {
+    for (i = 0; i < rc->baseline_gf_interval; ++i) {
       input_stats(twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
@@ -1823,20 +1808,18 @@
                              const FIRSTPASS_STATS *next_frame) {
   int is_viable_kf = 0;
 
-  // Does the frame satisfy the primary criteria of a key frame
-  //      If so, then examine how well it predicts subsequent frames
+  // Does the frame satisfy the primary criteria of a key frame?
+  // If so, then examine how well it predicts subsequent frames.
   if ((this_frame->pcnt_second_ref < 0.10) &&
       (next_frame->pcnt_second_ref < 0.10) &&
       ((this_frame->pcnt_inter < 0.05) ||
-       (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
+       (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < 0.35) &&
         ((this_frame->intra_error /
           DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
         ((fabs(last_frame->coded_error - this_frame->coded_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
-          .40) ||
+              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > 0.40) ||
          (fabs(last_frame->intra_error - this_frame->intra_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
-          .40) ||
+              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > 0.40) ||
          ((next_frame->intra_error /
            DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
     int i;
@@ -1850,37 +1833,34 @@
 
     local_next_frame = *next_frame;
 
-    // Note the starting file position so we can reset to it
+    // Note the starting file position so we can reset to it.
     start_pos = cpi->twopass.stats_in;
 
-    // Examine how well the key frame predicts subsequent frames
-    for (i = 0; i < 16; i++) {
+    // Examine how well the key frame predicts subsequent frames.
+    for (i = 0; i < 16; ++i) {
       double next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
                              DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
 
       if (next_iiratio > RMAX)
         next_iiratio = RMAX;
 
-      // Cumulative effect of decay in prediction quality
+      // Cumulative effect of decay in prediction quality.
       if (local_next_frame.pcnt_inter > 0.85)
         decay_accumulator *= local_next_frame.pcnt_inter;
       else
         decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
 
-      // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
-
-      // Keep a running total
+      // Keep a running total.
       boost_score += (decay_accumulator * next_iiratio);
 
-      // Test various breakout clauses
+      // Test various breakout clauses.
       if ((local_next_frame.pcnt_inter < 0.05) ||
           (next_iiratio < 1.5) ||
           (((local_next_frame.pcnt_inter -
              local_next_frame.pcnt_neutral) < 0.20) &&
            (next_iiratio < 3.0)) ||
           ((boost_score - old_boost_score) < 3.0) ||
-          (local_next_frame.intra_error < 200)
-         ) {
+          (local_next_frame.intra_error < 200)) {
         break;
       }
 
@@ -1927,23 +1907,23 @@
 
   vp9_zero(next_frame);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   start_position = twopass->stats_in;
   cpi->common.frame_type = KEY_FRAME;
 
-  // is this a forced key frame by interval
+  // Is this a forced key frame by interval?
   rc->this_key_frame_forced = rc->next_key_frame_forced;
 
-  // Clear the alt ref active flag as this can never be active on a key frame
+  // Clear the alt ref active flag as this can never be active on a key frame.
   rc->source_alt_ref_active = 0;
 
-  // Kf is always a gf so clear frames till next gf counter
+  // KF is always a GF so clear frames till next gf counter.
   rc->frames_till_gf_update_due = 0;
 
   rc->frames_to_key = 1;
 
-  // Take a copy of the initial frame details
+  // Take a copy of the initial frame details.
   first_frame = *this_frame;
 
   twopass->kf_group_bits = 0;        // Total bits available to kf group
@@ -1951,75 +1931,74 @@
 
   kf_mod_err = calculate_modified_err(cpi, this_frame);
 
-  // find the next keyframe
+  // Find the next keyframe.
   i = 0;
   while (twopass->stats_in < twopass->stats_in_end) {
-    // Accumulate kf group error
+    // Accumulate kf group error.
     kf_group_err += calculate_modified_err(cpi, this_frame);
 
-    // load a the next frame's stats
+    // Load the next frame's stats.
     last_frame = *this_frame;
     input_stats(twopass, this_frame);
 
     // Provided that we are not at the end of the file...
     if (cpi->oxcf.auto_key &&
         lookup_next_frame_stats(twopass, &next_frame) != EOF) {
-      // Normal scene cut check
+      // Check for a scene cut.
       if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
         break;
 
-
-      // How fast is prediction quality decaying
+      // How fast is the prediction quality decaying?
       loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
 
       // We want to know something about the recent past... rather than
-      // as used elsewhere where we are concened with decay in prediction
+      // as used elsewhere where we are concerned with decay in prediction
       // quality since the last GF or KF.
       recent_loop_decay[i % 8] = loop_decay_rate;
       decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++)
+      for (j = 0; j < 8; ++j)
         decay_accumulator *= recent_loop_decay[j];
 
       // Special check for transition or high motion followed by a
-      // to a static scene.
+      // static scene.
       if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i,
                                      loop_decay_rate, decay_accumulator))
         break;
 
-      // Step on to the next frame
-      rc->frames_to_key++;
+      // Step on to the next frame.
+      ++rc->frames_to_key;
 
       // If we don't have a real key frame within the next two
-      // forcekeyframeevery intervals then break out of the loop.
+      // key_frame_frequency intervals then break out of the loop.
       if (rc->frames_to_key >= 2 * (int)cpi->key_frame_frequency)
         break;
     } else {
-      rc->frames_to_key++;
+      ++rc->frames_to_key;
     }
-    i++;
+    ++i;
   }
 
   // If there is a max kf interval set by the user we must obey it.
   // We already breakout of the loop above at 2x max.
-  // This code centers the extra kf if the actual natural
-  // interval is between 1x and 2x
+  // This code centers the extra kf if the actual natural interval
+  // is between 1x and 2x.
   if (cpi->oxcf.auto_key &&
       rc->frames_to_key > (int)cpi->key_frame_frequency) {
     FIRSTPASS_STATS tmp_frame;
 
     rc->frames_to_key /= 2;
 
-    // Copy first frame details
+    // Copy first frame details.
     tmp_frame = first_frame;
 
-    // Reset to the start of the group
+    // Reset to the start of the group.
     reset_fpf_position(twopass, start_position);
 
     kf_group_err = 0;
 
-    // Rescan to get the correct error data for the forced kf group
-    for (i = 0; i < rc->frames_to_key; i++) {
-      // Accumulate kf group errors
+    // Rescan to get the correct error data for the forced kf group.
+    for (i = 0; i < rc->frames_to_key; ++i) {
+      // Accumulate kf group errors.
       kf_group_err += calculate_modified_err(cpi, &tmp_frame);
 
       // Load the next frame's stats.
@@ -2032,22 +2011,22 @@
     rc->next_key_frame_forced = 0;
   }
 
-  // Special case for the last key frame of the file
+  // Special case for the last key frame of the file.
   if (twopass->stats_in >= twopass->stats_in_end) {
-    // Accumulate kf group error
+    // Accumulate kf group error.
     kf_group_err += calculate_modified_err(cpi, this_frame);
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
   if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
-    // Max for a single normal frame (not key frame)
+    // Maximum number of bits for a single normal frame (not key frame).
     int max_bits = frame_max_bits(cpi);
 
-    // Maximum bits for the kf group
+    // Maximum number of bits allocated to the key frame group.
     int64_t max_grp_bits;
 
     // Default allocation based on bits left and relative
-    // complexity of the section
+    // complexity of the section.
     twopass->kf_group_bits = (int64_t)(twopass->bits_left *
        (kf_group_err / twopass->modified_error_left));
 
@@ -2058,7 +2037,7 @@
   } else {
     twopass->kf_group_bits = 0;
   }
-  // Reset the first pass file position
+  // Reset the first pass file position.
   reset_fpf_position(twopass, start_position);
 
   // Determine how big to make this keyframe based on how well the subsequent
@@ -2067,7 +2046,7 @@
   boost_score = 0.0;
 
   // Scan through the kf group collating various stats.
-  for (i = 0; i < rc->frames_to_key; i++) {
+  for (i = 0; i < rc->frames_to_key; ++i) {
     double r;
 
     if (EOF == input_stats(twopass, &next_frame))
@@ -2092,7 +2071,7 @@
       if (r > RMAX)
         r = RMAX;
 
-      // How fast is prediction quality decaying
+      // How fast is prediction quality decaying.
       if (!detect_flash(twopass, 0)) {
         loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
         decay_accumulator *= loop_decay_rate;
@@ -2110,7 +2089,7 @@
     zero_stats(&sectionstats);
     reset_fpf_position(twopass, start_position);
 
-    for (i = 0; i < rc->frames_to_key; i++) {
+    for (i = 0; i < rc->frames_to_key; ++i) {
       input_stats(twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
@@ -2121,10 +2100,10 @@
         DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
   }
 
-  // Reset the first pass file position
+  // Reset the first pass file position.
   reset_fpf_position(twopass, start_position);
 
-  // Work out how many bits to allocate for the key frame itself
+  // Work out how many bits to allocate for the key frame itself.
   if (1) {
     int kf_boost = (int)boost_score;
     int allocation_chunks;
@@ -2141,25 +2120,26 @@
     rc->kf_boost = kf_boost;
     twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
 
-    // We do three calculations for kf size.
-    // The first is based on the error score for the whole kf group.
-    // The second (optionally) on the key frames own error if this is
-    // smaller than the average for the group.
-    // The final one insures that the frame receives at least the
-    // allocation it would have received based on its own error score vs
-    // the error score remaining
-    // Special case if the sequence appears almost totaly static
-    // In this case we want to spend almost all of the bits on the
-    // key frame.
-    // cpi->rc.frames_to_key-1 because key frame itself is taken
-    // care of by kf_boost.
+    // Key frame size depends on:
+    // (1) the error score for the whole key frame group,
+    // (2) the key frame's own error if this is smaller than the
+    //     average for the group (optional),
+    // (3) ensuring that the frame receives at least the allocation it would
+    //     have received based on its own error score vs the error score
+    //     remaining.
+    // Special case:
+    // If the sequence appears almost totally static we want to spend almost
+    // all of the bits on the key frame.
+    //
+    // We use (cpi->rc.frames_to_key - 1) below because the key frame itself is
+    // taken care of by kf_boost.
     if (zero_motion_accumulator >= 0.99) {
       allocation_chunks = ((rc->frames_to_key - 1) * 10) + kf_boost;
     } else {
       allocation_chunks = ((rc->frames_to_key - 1) * 100) + kf_boost;
     }
 
-    // Prevent overflow
+    // Prevent overflow.
     if (kf_boost > 1028) {
       int divisor = kf_boost >> 10;
       kf_boost /= divisor;
@@ -2169,7 +2149,7 @@
     twopass->kf_group_bits = (twopass->kf_group_bits < 0) ? 0
            : twopass->kf_group_bits;
 
-    // Calculate the number of bits to be spent on the key frame
+    // Calculate the number of bits to be spent on the key frame.
     twopass->kf_bits = (int)((double)kf_boost *
         ((double)twopass->kf_group_bits / allocation_chunks));
 
@@ -2188,9 +2168,9 @@
       if (twopass->kf_bits > alt_kf_bits)
         twopass->kf_bits = alt_kf_bits;
     } else {
-    // Else if it is much harder than other frames in the group make sure
-    // it at least receives an allocation in keeping with its relative
-    // error score
+      // Else if it is much harder than other frames in the group make sure
+      // it at least receives an allocation in keeping with its relative
+      // error score.
       alt_kf_bits = (int)((double)twopass->bits_left * (kf_mod_err /
                DOUBLE_DIVIDE_CHECK(twopass->modified_error_left)));
 
@@ -2203,7 +2183,7 @@
     vp9_rc_set_frame_target(cpi, twopass->kf_bits);
   }
 
-  // Note the total error score of the kf group minus the key frame itself
+  // Note the total error score of the kf group minus the key frame itself.
   twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
 
   // Adjust the count of total modified error left.
@@ -2221,7 +2201,7 @@
   } else {
     cm->frame_type = INTER_FRAME;
   }
-  // Do not use periodic key frames
+  // Do not use periodic key frames.
   cpi->rc.frames_to_key = INT_MAX;
 }
 
@@ -2260,13 +2240,6 @@
     twopass->active_worst_quality = tmp_q;
     rc->ni_av_qi = tmp_q;
     rc->avg_q = vp9_convert_qindex_to_q(tmp_q);
-
-    // Limit the maxq value returned subsequently.
-    // This increases the risk of overspend or underspend if the initial
-    // estimate for the clip is bad, but helps prevent excessive
-    // variation in Q, especially near the end of a clip
-    // where for example a small overspend may cause Q to crash
-    // adjust_maxq_qrange(cpi);
   }
   vp9_zero(this_frame);
   if (EOF == input_stats(twopass, &this_frame))
@@ -2275,19 +2248,19 @@
   this_frame_intra_error = this_frame.intra_error;
   this_frame_coded_error = this_frame.coded_error;
 
-  // keyframe and section processing !
+  // Keyframe and section processing.
   if (rc->frames_to_key == 0 ||
       (cm->frame_flags & FRAMEFLAGS_KEY)) {
-    // Define next KF group and assign bits to it
+    // Define next KF group and assign bits to it.
     this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
   } else {
     cm->frame_type = INTER_FRAME;
   }
 
-  // Is this a GF / ARF (Note that a KF is always also a GF)
+  // Is this frame a GF / ARF? (Note: a key frame is always also a GF).
   if (rc->frames_till_gf_update_due == 0) {
-    // Define next gf group and assign bits to it
+    // Define next gf group and assign bits to it.
     this_frame_copy = this_frame;
 
 #if CONFIG_MULTIPLE_ARF
@@ -2302,7 +2275,8 @@
 
     if (twopass->gf_zeromotion_pct > 995) {
       // As long as max_thresh for encode breakout is small enough, it is ok
-      // to enable it for show frame, i.e. set allow_encode_breakout to 2.
+      // to enable it for show frame, i.e. set allow_encode_breakout to
+      // ENCODE_BREAKOUT_LIMITED.
       if (!cm->show_frame)
         cpi->allow_encode_breakout = ENCODE_BREAKOUT_DISABLED;
       else
@@ -2312,8 +2286,8 @@
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     cpi->refresh_golden_frame = 1;
   } else {
-    // Otherwise this is an ordinary frame
-    // Assign bits from those allocated to the GF group
+    // Otherwise this is an ordinary frame.
+    // Assign bits from those allocated to the GF group.
     this_frame_copy =  this_frame;
     assign_std_frame_bits(cpi, &this_frame_copy);
   }
@@ -2335,7 +2309,7 @@
     target = vp9_rc_clamp_pframe_target_size(cpi, rc->this_frame_target);
   vp9_rc_set_frame_target(cpi, target);
 
-  // Update the total stats remaining structure
+  // Update the total stats remaining structure.
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }
 
@@ -2345,7 +2319,7 @@
 #else
   cpi->twopass.bits_left -= 8 * bytes_used;
   // Update bits left to the kf and gf groups to account for overshoot or
-  // undershoot on these frames
+  // undershoot on these frames.
   if (cm->frame_type == KEY_FRAME) {
     cpi->twopass.kf_group_bits += cpi->rc.this_frame_target -
         cpi->rc.projected_frame_size;
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 7eacda2..44c1f90 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -419,7 +419,7 @@
                                golden_ref, cpi->Source);
   }
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   separate_arf_mbs(cpi);
 }
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 62b33e4..7d6fd3b 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -855,6 +855,184 @@
                             square_num_candidates, square_candidates);
 };
 
+// Sizes of the hexagon search patterns used by vp9_fast_hex_search().
+#define FIRST_HEX_CANDIDATES 6  // Points checked by the first hex step.
+#define PRE_BEST_CANDIDATE 6    // Possible indices of the previous best point.
+#define NEXT_HEX_CANDIDATES 3   // New points checked by each later hex step.
+#define REFINE_CANDIDATES 4     // 1-away neighbors checked when refining.
+
+// Full-pel hex search from ref_mv (clamped in place): a 6-point hex step, then
+// 3-point follow-up steps, then 4-neighbor refinement. Writes the final
+// position to best_mv and returns bestsad; search_param/use_mvcost are unused.
+int vp9_fast_hex_search(const MACROBLOCK *x,
+                        MV *ref_mv,
+                        int search_param,
+                        int sad_per_bit,
+                        const vp9_variance_fn_ptr_t *vfp,
+                        int use_mvcost,
+                        const MV *center_mv,
+                        MV *best_mv) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  static const MV hex[FIRST_HEX_CANDIDATES] = {
+    { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0}
+  };
+  static const MV next_chkpts[PRE_BEST_CANDIDATE][NEXT_HEX_CANDIDATES] = {
+    {{ -2, 0}, { -1, -2}, {1, -2}},
+    {{ -1, -2}, {1, -2}, {2, 0}},
+    {{1, -2}, {2, 0}, {1, 2}},
+    {{2, 0}, {1, 2}, { -1, 2}},
+    {{1, 2}, { -1, 2}, { -2, 0}},
+    {{ -1, 2}, { -2, 0}, { -1, -2}}
+  };
+  static const MV neighbors[REFINE_CANDIDATES] = {
+    {0, -1}, { -1, 0}, {1, 0}, {0, 1}
+  };
+  int i, j;
+
+  const uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  int br, bc;
+  MV this_mv;
+  unsigned int bestsad = 0x7fffffff;
+  unsigned int thissad;
+  const uint8_t *base_offset;
+  const uint8_t *this_offset;
+  int k = -1;
+  int best_site = -1;
+  const int max_hex_search = 512;
+  const int max_dia_search = 32;
+
+  const int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+
+  // Adjust ref_mv to make sure it is within the MV range.
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  br = ref_mv->row;
+  bc = ref_mv->col;
+
+  // Check the start point: its SAD plus the MV cost is the initial best.
+  base_offset = xd->plane[0].pre[0].buf;
+  this_offset = base_offset + (br * in_what_stride) + bc;
+  this_mv.row = br;
+  this_mv.col = bc;
+  bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost,
+                             sad_per_bit);
+
+  // Initial 6-point hex search; per-point range checks only near the border.
+  if (check_bounds(x, br, bc, 2)) {
+    for (i = 0; i < FIRST_HEX_CANDIDATES; i++) {
+      this_mv.row = br + hex[i].row;
+      this_mv.col = bc + hex[i].col;
+      this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col;
+      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                         bestsad);
+      CHECK_BETTER
+    }
+  } else {
+    for (i = 0; i < FIRST_HEX_CANDIDATES; i++) {
+      this_mv.row = br + hex[i].row;
+      this_mv.col = bc + hex[i].col;
+      if (!is_mv_in(x, &this_mv))
+        continue;
+      this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col;
+      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                         bestsad);
+      CHECK_BETTER
+    }
+  }
+
+  // Continue the hex search if we found a better match in the first round.
+  if (best_site != -1) {
+    br += hex[best_site].row;
+    bc += hex[best_site].col;
+    k = best_site;
+
+    // Allow the search to cover the maximum MV range.
+    for (j = 1; j < max_hex_search; j++) {
+      best_site = -1;
+
+      if (check_bounds(x, br, bc, 2)) {
+        for (i = 0; i < NEXT_HEX_CANDIDATES; i++) {
+          this_mv.row = br + next_chkpts[k][i].row;
+          this_mv.col = bc + next_chkpts[k][i].col;
+          this_offset = base_offset + (this_mv.row * in_what_stride) +
+              this_mv.col;
+          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                             bestsad);
+          CHECK_BETTER
+        }
+      } else {
+        for (i = 0; i < NEXT_HEX_CANDIDATES; i++) {
+          this_mv.row = br + next_chkpts[k][i].row;
+          this_mv.col = bc + next_chkpts[k][i].col;
+          if (!is_mv_in(x, &this_mv))
+            continue;
+          this_offset = base_offset + (this_mv.row * in_what_stride) +
+              this_mv.col;
+          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                             bestsad);
+          CHECK_BETTER
+        }
+      }
+
+      if (best_site == -1) {
+        break;
+      } else {
+        br += next_chkpts[k][best_site].row;
+        bc += next_chkpts[k][best_site].col;
+        // next_chkpts[k][i] equals hex[(k + 5 + i) % 6], so this sets k to
+        // the index of the hex point we just moved to.
+        k = (k + 5 + best_site) % PRE_BEST_CANDIDATE;
+      }
+    }
+  }
+
+  // Refine: repeatedly check the 4 1-away neighbors until none is better.
+  for (j = 0; j < max_dia_search; j++) {
+    best_site = -1;
+
+    if (check_bounds(x, br, bc, 1)) {
+      for (i = 0; i < REFINE_CANDIDATES; i++) {
+        this_mv.row = br + neighbors[i].row;
+        this_mv.col = bc + neighbors[i].col;
+        this_offset = base_offset + (this_mv.row * in_what_stride) +
+            this_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                           bestsad);
+        CHECK_BETTER
+      }
+    } else {
+      for (i = 0; i < REFINE_CANDIDATES; i++) {
+        this_mv.row = br + neighbors[i].row;
+        this_mv.col = bc + neighbors[i].col;
+        if (!is_mv_in(x, &this_mv))
+          continue;
+        this_offset = base_offset + (this_mv.row * in_what_stride) +
+            this_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                           bestsad);
+        CHECK_BETTER
+      }
+    }
+
+    if (best_site == -1) {
+      break;
+    } else {
+      br += neighbors[best_site].row;
+      bc += neighbors[best_site].col;
+    }
+  }
+
+  best_mv->row = br;
+  best_mv->col = bc;
+
+  return bestsad;
+}
+
 #undef CHECK_BETTER
 
 int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
@@ -871,10 +1049,10 @@
 
   MV this_mv;
 
-  int bestsad = INT_MAX;
+  unsigned int bestsad = INT_MAX;
   int ref_row, ref_col;
 
-  int thissad;
+  unsigned int thissad;
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
 
   const int *mvjsadcost = x->nmvjointsadcost;
@@ -1289,62 +1467,57 @@
 
 int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
-                           int sadpb, int further_steps,
-                           int do_refine,
+                           int sadpb, int further_steps, int do_refine,
                            const vp9_variance_fn_ptr_t *fn_ptr,
-                           const MV *ref_mv, int_mv *dst_mv) {
-  int_mv temp_mv;
-  int thissme, n, num00;
-  int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv.as_mv,
-                                        step_param, sadpb, &num00,
+                           const MV *ref_mv, MV *dst_mv) {
+  MV temp_mv;
+  int thissme, n, num00 = 0;
+  int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
+                                        step_param, sadpb, &n,
                                         fn_ptr, x->nmvjointcost,
                                         x->mvcost, ref_mv);
-  dst_mv->as_int = temp_mv.as_int;
+  *dst_mv = temp_mv;
 
-  n = num00;
-  num00 = 0;
-
-  /* If there won't be more n-step search, check to see if refining search is
-   * needed. */
+  // If there won't be more n-step search, check to see if refining search is
+  // needed.
   if (n > further_steps)
     do_refine = 0;
 
   while (n < further_steps) {
-    n++;
+    ++n;
 
     if (num00) {
       num00--;
     } else {
-      thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv.as_mv,
+      thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
                                         step_param + n, sadpb, &num00,
                                         fn_ptr, x->nmvjointcost, x->mvcost,
                                         ref_mv);
 
-      /* check to see if refining search is needed. */
-      if (num00 > (further_steps - n))
+      // Check to see if refining search is needed.
+      if (num00 > further_steps - n)
         do_refine = 0;
 
       if (thissme < bestsme) {
         bestsme = thissme;
-        dst_mv->as_int = temp_mv.as_int;
+        *dst_mv = temp_mv;
       }
     }
   }
 
-  /* final 1-away diamond refining search */
-  if (do_refine == 1) {
-    int search_range = 8;
-    int_mv best_mv;
-    best_mv.as_int = dst_mv->as_int;
-    thissme = cpi->refining_search_sad(x, &best_mv.as_mv, sadpb, search_range,
+  // Final 1-away diamond refining search.
+  if (do_refine) {
+    const int search_range = 8;
+    MV best_mv = *dst_mv;
+    thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range,
                                        fn_ptr, x->nmvjointcost, x->mvcost,
                                        ref_mv);
-
     if (thissme < bestsme) {
       bestsme = thissme;
-      dst_mv->as_int = best_mv.as_int;
+      *dst_mv = best_mv;
     }
   }
+
   return bestsme;
 }
 
@@ -1352,7 +1525,7 @@
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
                           int *mvjcost, int *mvcost[2],
-                          const MV *center_mv, int block) {
+                          const MV *center_mv, MV *best_mv) {
   int r, c;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *const what = x->plane[0].src.buf;
@@ -1371,7 +1544,6 @@
   int best_sad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride,
                              0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit);
-  MV *best_mv = &xd->mi_8x8[0]->bmi[block].as_mv[0].as_mv;
   *best_mv = *ref_mv;
 
   for (r = row_min; r < row_max; ++r) {
@@ -1405,13 +1577,12 @@
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
                           int *mvjcost, int *mvcost[2],
-                          const MV *center_mv, int n) {
+                          const MV *center_mv, MV *best_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *const what = x->plane[0].src.buf;
   const int what_stride = x->plane[0].src.stride;
   const uint8_t *const in_what = xd->plane[0].pre[0].buf;
   const int in_what_stride = xd->plane[0].pre[0].stride;
-  MV *best_mv = &xd->mi_8x8[0]->bmi[n].as_mv[0].as_mv;
   MV this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1511,13 +1682,12 @@
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
                           int *mvjcost, int *mvcost[2],
-                          const MV *center_mv, int n) {
+                          const MV *center_mv, MV *best_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *const what = x->plane[0].src.buf;
   const int what_stride = x->plane[0].src.stride;
   const uint8_t *const in_what = xd->plane[0].pre[0].buf;
   const int in_what_stride = xd->plane[0].pre[0].stride;
-  MV *best_mv = &xd->mi_8x8[0]->bmi[n].as_mv[0].as_mv;
   MV this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1657,11 +1827,7 @@
   const uint8_t *const in_what = xd->plane[0].pre[0].buf;
   const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
                                              ref_mv->col];
-  unsigned int thissad;
-
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-  MV this_mv;
-
   const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
@@ -1673,15 +1839,13 @@
     int best_site = -1;
 
     for (j = 0; j < 4; j++) {
-      this_mv.row = ref_mv->row + neighbors[j].row;
-      this_mv.col = ref_mv->col + neighbors[j].col;
-
+      const MV this_mv = {ref_mv->row + neighbors[j].row,
+                          ref_mv->col + neighbors[j].col};
       if (is_mv_in(x, &this_mv)) {
         const uint8_t *check_here = &in_what[this_mv.row * in_what_stride +
                                                 this_mv.col];
-        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                              bestsad);
-
+        unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
+                                           in_what_stride, bestsad);
         if (thissad < bestsad) {
           thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
                                     mvjsadcost, mvsadcost, error_per_bit);
@@ -1703,15 +1867,15 @@
     }
   }
 
-  this_mv.row = ref_mv->row * 8;
-  this_mv.col = ref_mv->col * 8;
-
-  if (bestsad < INT_MAX)
+  if (bestsad < INT_MAX) {
+    unsigned int unused;
+    const MV mv = {ref_mv->row * 8, ref_mv->col * 8};
     return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                      (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
-  else
+                      &unused) +
+        mv_err_cost(&mv, center_mv, mvjcost, mvcost, x->errorperbit);
+  } else {
     return INT_MAX;
+  }
 }
 
 int vp9_refining_search_sadx4(const MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index e1d6abe..586a74c 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -46,7 +46,7 @@
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
                            const vp9_variance_fn_ptr_t *fn_ptr,
-                           const MV *ref_mv, int_mv *dst_mv);
+                           const MV *ref_mv, MV *dst_mv);
 
 int vp9_hex_search(const MACROBLOCK *x,
                    MV *ref_mv,
@@ -75,6 +75,14 @@
                       int use_mvcost,
                       const MV *center_mv,
                       MV *best_mv);
+int vp9_fast_hex_search(const MACROBLOCK *x,
+                        MV *ref_mv,
+                        int search_param,
+                        int sad_per_bit,
+                        const vp9_variance_fn_ptr_t *vfp,
+                        int use_mvcost,
+                        const MV *center_mv,
+                        MV *best_mv);
 
 typedef int (fractional_mv_step_fp) (
     const MACROBLOCK *x,
@@ -111,7 +119,7 @@
                                     int distance,
                                     const vp9_variance_fn_ptr_t *fn_ptr,
                                     int *mvjcost, int *mvcost[2],
-                                    const MV *center_mv, int n);
+                                    const MV *center_mv, MV *best_mv);
 
 typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x,
                                         MV *ref_mv, int sad_per_bit,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index cef7e04..3921ea8 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -14,6 +14,8 @@
 
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
+#include "vpx/internal/vpx_psnr.h"
+#include "vpx_ports/vpx_timer.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_filter.h"
@@ -25,12 +27,12 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 
+#include "vp9/encoder/vp9_bitstream.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mbgraph.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_picklpf.h"
-#include "vp9/encoder/vp9_psnr.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
@@ -38,9 +40,6 @@
 #include "vp9/encoder/vp9_vaq.h"
 #include "vp9/encoder/vp9_resize.h"
 
-#include "vpx_ports/vpx_timer.h"
-
-void vp9_entropy_mode_init();
 void vp9_coef_tree_initialize();
 
 #define DEFAULT_INTERP_FILTER SWITCHABLE
@@ -154,20 +153,22 @@
 }
 
 static void dealloc_compressor_data(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
   // Delete sementation map
   vpx_free(cpi->segmentation_map);
-  cpi->segmentation_map = 0;
-  vpx_free(cpi->common.last_frame_seg_map);
-  cpi->common.last_frame_seg_map = 0;
+  cpi->segmentation_map = NULL;
+  vpx_free(cm->last_frame_seg_map);
+  cm->last_frame_seg_map = NULL;
   vpx_free(cpi->coding_context.last_frame_seg_map_copy);
-  cpi->coding_context.last_frame_seg_map_copy = 0;
+  cpi->coding_context.last_frame_seg_map_copy = NULL;
 
   vpx_free(cpi->complexity_map);
   cpi->complexity_map = 0;
   vpx_free(cpi->active_map);
   cpi->active_map = 0;
 
-  vp9_free_frame_buffers(&cpi->common);
+  vp9_free_frame_buffers(cm);
 
   vp9_free_frame_buffer(&cpi->last_frame_uf);
   vp9_free_frame_buffer(&cpi->scaled_source);
@@ -194,19 +195,20 @@
 // to a target value
 // target q value
 int vp9_compute_qdelta(const VP9_COMP *cpi, double qstart, double qtarget) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int start_index = rc->worst_quality;
+  int target_index = rc->worst_quality;
   int i;
-  int start_index = cpi->rc.worst_quality;
-  int target_index = cpi->rc.worst_quality;
 
   // Convert the average q value to an index.
-  for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
+  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
     start_index = i;
     if (vp9_convert_qindex_to_q(i) >= qstart)
       break;
   }
 
   // Convert the q target to an index
-  for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
+  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
     target_index = i;
     if (vp9_convert_qindex_to_q(i) >= qtarget)
       break;
@@ -218,28 +220,23 @@
 // Computes a q delta (in "q index" terms) to get from a starting q value
 // to a value that should equate to thegiven rate ratio.
 
-int vp9_compute_qdelta_by_rate(VP9_COMP *cpi,
-                               double base_q_index, double rate_target_ratio) {
+static int compute_qdelta_by_rate(VP9_COMP *cpi, int base_q_index,
+                                  double rate_target_ratio) {
   int i;
-  int base_bits_per_mb;
-  int target_bits_per_mb;
   int target_index = cpi->rc.worst_quality;
 
-  // Make SURE use of floating point in this function is safe.
-  vp9_clear_system_state();
-
   // Look up the current projected bits per block for the base index
-  base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type,
-                                        base_q_index, 1.0);
+  const int base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type,
+                                            base_q_index, 1.0);
 
   // Find the target bits per mb based on the base value and given ratio.
-  target_bits_per_mb = rate_target_ratio * base_bits_per_mb;
+  const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
 
   // Convert the q target to an index
-  for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
+  for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; ++i) {
     target_index = i;
-    if (vp9_rc_bits_per_mb(cpi->common.frame_type,
-                           i, 1.0) <= target_bits_per_mb )
+    if (vp9_rc_bits_per_mb(cpi->common.frame_type, i, 1.0) <=
+            target_bits_per_mb )
       break;
   }
 
@@ -249,11 +246,8 @@
 // This function sets up a set of segments with delta Q values around
 // the baseline frame quantizer.
 static void setup_in_frame_q_adj(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  struct segmentation *seg = &cm->seg;
-  // double q_ratio;
-  int segment;
-  int qindex_delta;
+  VP9_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
 
   // Make SURE use of floating point in this function is safe.
   vp9_clear_system_state();
@@ -261,6 +255,8 @@
   if (cm->frame_type == KEY_FRAME ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    int segment;
+
     // Clear down the segment map
     vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
 
@@ -278,8 +274,7 @@
 
     // Use some of the segments for in frame Q adjustment
     for (segment = 1; segment < 2; segment++) {
-      qindex_delta =
-        vp9_compute_qdelta_by_rate(cpi, cm->base_qindex,
+      const int qindex_delta = compute_qdelta_by_rate(cpi, cm->base_qindex,
                                    in_frame_q_adj_ratio[segment]);
       vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
       vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
@@ -287,8 +282,8 @@
   }
 }
 static void configure_static_seg_features(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  struct segmentation *seg = &cm->seg;
+  VP9_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
 
   int high_q = (int)(cpi->rc.avg_q > 48.0);
   int qi_delta;
@@ -432,13 +427,13 @@
 
 static void update_reference_segmentation_map(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
+  uint8_t *cache_ptr = cm->last_frame_seg_map;
   int row, col;
-  MODE_INFO **mi_8x8, **mi_8x8_ptr = cm->mi_grid_visible;
-  uint8_t *cache_ptr = cm->last_frame_seg_map, *cache;
 
   for (row = 0; row < cm->mi_rows; row++) {
-    mi_8x8 = mi_8x8_ptr;
-    cache = cache_ptr;
+    MODE_INFO **mi_8x8 = mi_8x8_ptr;
+    uint8_t *cache = cache_ptr;
     for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
       cache[0] = mi_8x8[0]->mbmi.segment_id;
     mi_8x8_ptr += cm->mode_info_stride;
@@ -653,6 +648,7 @@
     sf->reference_masking = 1;
     sf->auto_mv_step_size = 1;
 
+    sf->disable_split_var_thresh = 32;
     sf->disable_filter_search_var_thresh = 100;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
@@ -688,6 +684,7 @@
     sf->reference_masking = 1;
     sf->auto_mv_step_size = 1;
 
+    sf->disable_split_var_thresh = 64;
     sf->disable_filter_search_var_thresh = 200;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
@@ -705,9 +702,9 @@
     sf->adaptive_rd_thresh = 4;
     sf->mode_skip_start = 6;
   }
-  if (speed == 5) {
+  if (speed >= 5) {
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-    sf->use_one_partition_size_always = 1;
+    sf->partition_search_type = FIXED_PARTITION;
     sf->always_this_block_size = BLOCK_16X16;
     sf->tx_size_search_method = frame_is_intra_only(cm) ?
       USE_FULL_RD : USE_LARGESTALL;
@@ -842,6 +839,9 @@
     sf->adaptive_rd_thresh = 5;
     sf->auto_min_max_partition_size = frame_is_intra_only(cm) ?
         RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
+    sf->adjust_partitioning_from_last_frame =
+        cm->last_frame_type == KEY_FRAME || (0 ==
+        (cm->current_video_frame + 1) % sf->last_partitioning_redo_frequency);
     sf->subpel_force_stop = 1;
     for (i = 0; i < TX_SIZES; i++) {
       sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
@@ -849,11 +849,14 @@
     }
     sf->frame_parameter_update = 0;
     sf->encode_breakout_thresh = 1000;
+    sf->search_method = FAST_HEX;
   }
   if (speed >= 6) {
-    sf->always_this_block_size = BLOCK_16X16;
-    sf->use_pick_mode = 1;
-    sf->encode_breakout_thresh = 1000;
+    sf->partition_search_type = VAR_BASED_FIXED_PARTITION;
+  }
+  if (speed >= 7) {
+    sf->partition_search_type = VAR_BASED_FIXED_PARTITION;
+    sf->use_nonrd_pick_mode = 1;
   }
 }
 
@@ -867,8 +870,10 @@
   if (speed < 0)
     speed = -speed;
 
+#if CONFIG_INTERNAL_STATS
   for (i = 0; i < MAX_MODES; ++i)
     cpi->mode_chosen_counts[i] = 0;
+#endif
 
   // best quality defaults
   sf->frame_parameter_update = 1;
@@ -889,7 +894,7 @@
   sf->adaptive_motion_search = 0;
   sf->adaptive_pred_interp_filter = 0;
   sf->reference_masking = 0;
-  sf->use_one_partition_size_always = 0;
+  sf->partition_search_type = SEARCH_PARTITION;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
   sf->auto_min_max_partition_size = NOT_IN_USE;
@@ -911,7 +916,7 @@
   sf->use_fast_lpf_pick = 0;
   sf->use_fast_coef_updates = 0;
   sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
-  sf->use_pick_mode = 0;
+  sf->use_nonrd_pick_mode = 0;
   sf->encode_breakout_thresh = 0;
 
   switch (cpi->oxcf.mode) {
@@ -960,20 +965,24 @@
   if (cpi->encode_breakout && cpi->oxcf.mode == MODE_REALTIME &&
       sf->encode_breakout_thresh > cpi->encode_breakout)
     cpi->encode_breakout = sf->encode_breakout_thresh;
+
+  if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
+    sf->adaptive_pred_interp_filter = 0;
 }
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
+  const VP9_CONFIG *oxcf = &cpi->oxcf;
 
-  cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height,
+  cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
                                       cm->subsampling_x, cm->subsampling_y,
-                                      cpi->oxcf.lag_in_frames);
+                                      oxcf->lag_in_frames);
   if (!cpi->lookahead)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
 
   if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
-                               cpi->oxcf.width, cpi->oxcf.height,
+                               oxcf->width, oxcf->height,
                                cm->subsampling_x, cm->subsampling_y,
                                VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -1100,7 +1109,7 @@
 
 void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
   VP9_COMMON *const cm = &cpi->common;
-  int64_t vbr_max_bits;
+  int vbr_max_bits;
 
   if (framerate < 0.1)
     framerate = 30;
@@ -1124,10 +1133,10 @@
   // be acheived because of a user specificed max q (e.g. when the user
   // specifies lossless encode.
   //
-  vbr_max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth *
-                  (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+  vbr_max_bits = (int)(((int64_t)cpi->rc.av_per_frame_bandwidth *
+      cpi->oxcf.two_pass_vbrmax_section) / 100);
   cpi->rc.max_frame_bandwidth =
-    MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+      MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
 
   // Set Maximum gf/arf interval
   cpi->rc.max_gf_interval = 16;
@@ -1148,7 +1157,7 @@
     cpi->rc.max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
 }
 
-static int64_t rescale(int val, int64_t num, int denom) {
+static int64_t rescale(int64_t val, int64_t num, int denom) {
   int64_t llnum = num;
   int64_t llden = denom;
   int64_t llval = val;
@@ -1201,9 +1210,12 @@
     lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] * 1000;
     bitrate_alloc = (float)lc->target_bandwidth / (float)target_bandwidth;
     // Update buffer-related quantities.
-    lc->starting_buffer_level = oxcf->starting_buffer_level * bitrate_alloc;
-    lc->optimal_buffer_level = oxcf->optimal_buffer_level * bitrate_alloc;
-    lc->maximum_buffer_size = oxcf->maximum_buffer_size * bitrate_alloc;
+    lc->starting_buffer_level =
+        (int64_t)(oxcf->starting_buffer_level * bitrate_alloc);
+    lc->optimal_buffer_level =
+        (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc);
+    lc->maximum_buffer_size =
+        (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc);
     lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
     lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size);
     // Update framerate-related quantities.
@@ -1235,8 +1247,8 @@
     int prev_layer_target_bandwidth =
         oxcf->ts_target_bitrate[temporal_layer - 1] * 1000;
     lc->avg_frame_size =
-        (int)(lc->target_bandwidth - prev_layer_target_bandwidth) /
-        (lc->framerate - prev_layer_framerate);
+        (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
+              (lc->framerate - prev_layer_framerate));
   }
 }
 
@@ -1264,7 +1276,7 @@
   int temporal_layer = cpi->svc.temporal_layer_id;
   LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer];
   lc->rc = cpi->rc;
-  lc->target_bandwidth = cpi->oxcf.target_bandwidth;
+  lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth;
   lc->starting_buffer_level = cpi->oxcf.starting_buffer_level;
   lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level;
   lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size;
@@ -1365,6 +1377,9 @@
 
   cpi->oxcf = *oxcf;
 
+  if (cpi->oxcf.cpu_used == -6)
+    cpi->oxcf.play_alternate = 0;
+
   switch (cpi->oxcf.mode) {
       // Real time and one pass deprecated in test code base
     case MODE_GOODQUALITY:
@@ -1481,18 +1496,14 @@
 
   if (cpi->svc.number_temporal_layers > 1 &&
       cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
-    update_layer_context_change_config(cpi, cpi->oxcf.target_bandwidth);
+    update_layer_context_change_config(cpi, (int)cpi->oxcf.target_bandwidth);
   }
 
-  cpi->speed = cpi->oxcf.cpu_used;
+  cpi->speed = abs(cpi->oxcf.cpu_used);
 
-  if (cpi->oxcf.lag_in_frames == 0) {
-    // Force allow_lag to 0 if lag_in_frames is 0.
-    cpi->oxcf.allow_lag = 0;
-  } else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) {
-     // Limit on lag buffers as these are not currently dynamically allocated.
+  // Limit on lag buffers as these are not currently dynamically allocated.
+  if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
     cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
-  }
 
 #if CONFIG_MULTIPLE_ARF
   vp9_zero(cpi->alt_ref_source);
@@ -1558,6 +1569,7 @@
   int num_pix = num_4x4_blk << 4;
   int i, k;
   ctx->num_4x4_blk = num_4x4_blk;
+
   CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
                   vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
   for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -1601,7 +1613,6 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x  = &cpi->mb;
 
-
   for (i = 0; i < BLOCK_SIZES; ++i) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
     const int num_4x4_h = num_4x4_blocks_high_lookup[i];
@@ -2019,10 +2030,12 @@
                   / time_encoded;
 
       if (cpi->b_calculate_psnr) {
-        const double total_psnr = vp9_mse2psnr(cpi->total_samples, 255.0,
-                                               cpi->total_sq_error);
-        const double totalp_psnr = vp9_mse2psnr(cpi->totalp_samples, 255.0,
-                                                cpi->totalp_sq_error);
+        const double total_psnr =
+            vpx_sse_to_psnr((double)cpi->total_samples, 255.0,
+                            (double)cpi->total_sq_error);
+        const double totalp_psnr =
+            vpx_sse_to_psnr((double)cpi->totalp_samples, 255.0,
+                            (double)cpi->totalp_sq_error);
         const double total_ssim = 100 * pow(cpi->summed_quality /
                                                 cpi->summed_weights, 8.0);
         const double totalp_ssim = 100 * pow(cpi->summedp_quality /
@@ -2198,12 +2211,12 @@
     const int w = widths[i];
     const int h = heights[i];
     const uint32_t samples = w * h;
-    const double sse = calc_plane_error(a_planes[i], a_strides[i],
-                                        b_planes[i], b_strides[i],
-                                        w, h);
+    const uint64_t sse = calc_plane_error(a_planes[i], a_strides[i],
+                                          b_planes[i], b_strides[i],
+                                          w, h);
     psnr->sse[1 + i] = sse;
     psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, sse);
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, 255.0, (double)sse);
 
     total_sse += sse;
     total_samples += samples;
@@ -2211,7 +2224,8 @@
 
   psnr->sse[0] = total_sse;
   psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vp9_mse2psnr(total_samples, 255.0, total_sse);
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, 255.0,
+                                  (double)total_sse);
 }
 
 static void generate_psnr_packet(VP9_COMP *cpi) {
@@ -2724,7 +2738,7 @@
   FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
   int recon_err;
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
 
@@ -2768,8 +2782,6 @@
 
     for (i = 0; i < MAX_MODES; ++i)
       fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-    for (i = 0; i < MAX_REFS; ++i)
-      fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]);
 
     fprintf(fmodes, "\n");
 
@@ -2783,7 +2795,7 @@
                                        uint8_t *dest,
                                        int q) {
   VP9_COMMON *const cm = &cpi->common;
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
   vp9_set_quantizer(cpi, q);
 
   // Set up entropy context depending on frame type. The decoder mandates
@@ -2794,7 +2806,7 @@
   if (cm->frame_type == KEY_FRAME) {
     vp9_setup_key_frame(cpi);
   } else {
-    if (!cm->intra_only && !cm->error_resilient_mode) {
+    if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) {
       cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
     }
     vp9_setup_inter_frame(cpi);
@@ -2812,7 +2824,7 @@
   // Update the skip mb flag probabilities based on the distribution
   // seen in the last encoder iteration.
   // update_base_skip_probs(cpi);
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 }
 
 static void encode_with_recode_loop(VP9_COMP *cpi,
@@ -2822,6 +2834,7 @@
                                     int bottom_index,
                                     int top_index) {
   VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
   int loop_count = 0;
   int loop = 0;
   int overshoot_seen = 0;
@@ -2831,12 +2844,12 @@
   int frame_under_shoot_limit;
 
   // Decide frame size bounds
-  vp9_rc_compute_frame_size_bounds(cpi, cpi->rc.this_frame_target,
+  vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
                                    &frame_under_shoot_limit,
                                    &frame_over_shoot_limit);
 
   do {
-    vp9_clear_system_state();  // __asm emms;
+    vp9_clear_system_state();
 
     vp9_set_quantizer(cpi, q);
 
@@ -2849,7 +2862,7 @@
       if (cm->frame_type == KEY_FRAME) {
         vp9_setup_key_frame(cpi);
       } else {
-        if (!cm->intra_only && !cm->error_resilient_mode) {
+        if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) {
           cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
         }
         vp9_setup_inter_frame(cpi);
@@ -2871,7 +2884,7 @@
     // seen in the last encoder iteration.
     // update_base_skip_probs(cpi);
 
-    vp9_clear_system_state();  // __asm emms;
+    vp9_clear_system_state();
 
     // Dummy pack of the bitstream using up to date stats to get an
     // accurate estimate of output frame size to determine if we need
@@ -2879,10 +2892,10 @@
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
       vp9_save_coding_context(cpi);
       cpi->dummy_packing = 1;
-      if (!cpi->sf.use_pick_mode)
+      if (!cpi->sf.use_nonrd_pick_mode)
         vp9_pack_bitstream(cpi, dest, size);
 
-      cpi->rc.projected_frame_size = (*size) << 3;
+      rc->projected_frame_size = (int)(*size) << 3;
       vp9_restore_coding_context(cpi);
 
       if (frame_over_shoot_limit == 0)
@@ -2893,8 +2906,8 @@
       loop = 0;
     } else {
       if ((cm->frame_type == KEY_FRAME) &&
-           cpi->rc.this_key_frame_forced &&
-           (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth)) {
+           rc->this_key_frame_forced &&
+           (rc->projected_frame_size < rc->max_frame_bandwidth)) {
         int last_q = q;
         int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
 
@@ -2907,9 +2920,9 @@
         // The key frame is not good enough or we can afford
         // to make it better without undue risk of popping.
         if ((kf_err > high_err_target &&
-             cpi->rc.projected_frame_size <= frame_over_shoot_limit) ||
+             rc->projected_frame_size <= frame_over_shoot_limit) ||
             (kf_err > low_err_target &&
-             cpi->rc.projected_frame_size <= frame_under_shoot_limit)) {
+             rc->projected_frame_size <= frame_under_shoot_limit)) {
           // Lower q_high
           q_high = q > q_low ? q - 1 : q_low;
 
@@ -2917,7 +2930,7 @@
           q = (q * high_err_target) / kf_err;
           q = MIN(q, (q_high + q_low) >> 1);
         } else if (kf_err < low_err_target &&
-                   cpi->rc.projected_frame_size >= frame_under_shoot_limit) {
+                   rc->projected_frame_size >= frame_under_shoot_limit) {
           // The key frame is much better than the previous frame
           // Raise q_low
           q_low = q < q_high ? q + 1 : q_high;
@@ -2943,10 +2956,10 @@
         // Update correction factor & compute new Q to try...
 
         // Frame is too large
-        if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) {
+        if (rc->projected_frame_size > rc->this_frame_target) {
           // Special case if the projected size is > the max allowed.
-          if (cpi->rc.projected_frame_size >= cpi->rc.max_frame_bandwidth)
-            q_high = cpi->rc.worst_quality;
+          if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+            q_high = rc->worst_quality;
 
           // Raise Qlow as to at least the current value
           q_low = q < q_high ? q + 1 : q_high;
@@ -2960,12 +2973,12 @@
             // Update rate_correction_factor unless
             vp9_rc_update_rate_correction_factors(cpi, 0);
 
-            q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+            q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                    bottom_index, MAX(q_high, top_index));
 
             while (q < q_low && retries < 10) {
               vp9_rc_update_rate_correction_factors(cpi, 0);
-              q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+              q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                      bottom_index, MAX(q_high, top_index));
               retries++;
             }
@@ -2981,7 +2994,7 @@
             q = (q_high + q_low) / 2;
           } else {
             vp9_rc_update_rate_correction_factors(cpi, 0);
-            q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+            q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                    bottom_index, top_index);
             // Special case reset for qlow for constrained quality.
             // This should only trigger where there is very substantial
@@ -2994,7 +3007,7 @@
 
             while (q > q_high && retries < 10) {
               vp9_rc_update_rate_correction_factors(cpi, 0);
-              q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+              q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                      bottom_index, top_index);
               retries++;
             }
@@ -3013,8 +3026,8 @@
     }
 
     // Special case for overlay frame.
-    if (cpi->rc.is_src_frame_alt_ref &&
-        (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth))
+    if (rc->is_src_frame_alt_ref &&
+        rc->projected_frame_size < rc->max_frame_bandwidth)
       loop = 0;
 
     if (loop) {
@@ -3048,6 +3061,9 @@
   if (cpi->gold_is_last)
     cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
 
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX)
+    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+
   if (cpi->alt_is_last)
     cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
 
@@ -3082,8 +3098,8 @@
   int top_index;
   int bottom_index;
 
-  SPEED_FEATURES *const sf = &cpi->sf;
-  unsigned int max_mv_def = MIN(cm->width, cm->height);
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const unsigned int max_mv_def = MIN(cm->width, cm->height);
   struct segmentation *const seg = &cm->seg;
 
   set_ext_overrides(cpi);
@@ -3150,7 +3166,11 @@
     cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
     cm->frame_parallel_decoding_mode =
       (cpi->oxcf.frame_parallel_decoding_mode != 0);
+
+    // By default, encoder assumes decoder can use prev_mi.
+    cm->coding_use_prev_mi = 1;
     if (cm->error_resilient_mode) {
+      cm->coding_use_prev_mi = 0;
       cm->frame_parallel_decoding_mode = 1;
       cm->reset_frame_context = 0;
       cm->refresh_frame_context = 0;
@@ -3218,7 +3238,7 @@
   if (!frame_is_intra_only(cm)) {
     cm->interp_filter = DEFAULT_INTERP_FILTER;
     /* TODO: Decide this more intelligently */
-    set_high_precision_mv(cpi, (q < HIGH_PRECISION_MV_QTHRESH));
+    set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
   }
 
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
@@ -3411,6 +3431,7 @@
 static void check_initial_width(VP9_COMP *cpi, int subsampling_x,
                                 int subsampling_y) {
   VP9_COMMON *const cm = &cpi->common;
+
   if (!cpi->initial_width) {
     cm->subsampling_x = subsampling_x;
     cm->subsampling_y = subsampling_y;
@@ -3424,12 +3445,12 @@
 int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
-  VP9_COMP              *cpi = (VP9_COMP *) ptr;
-  VP9_COMMON             *cm = &cpi->common;
-  struct vpx_usec_timer  timer;
-  int                    res = 0;
-  const int    subsampling_x = sd->uv_width  < sd->y_width;
-  const int    subsampling_y = sd->uv_height < sd->y_height;
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
+  VP9_COMMON *cm = &cpi->common;
+  struct vpx_usec_timer timer;
+  int res = 0;
+  const int subsampling_x = sd->uv_width  < sd->y_width;
+  const int subsampling_y = sd->uv_height < sd->y_height;
 
   check_initial_width(cpi, subsampling_x, subsampling_y);
   vpx_usec_timer_start(&timer);
@@ -3661,7 +3682,7 @@
   *size = 0;
 
   // Clear down mmx registers
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   /* find a free buffer for the new frame, releasing the reference previously
    * held.
@@ -3748,7 +3769,7 @@
 #if CONFIG_INTERNAL_STATS
 
   if (cpi->pass != 1) {
-    cpi->bytes += *size;
+    cpi->bytes += (int)(*size);
 
     if (cm->show_frame) {
       cpi->count++;
@@ -3823,22 +3844,23 @@
 
 int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
                               vp9_ppflags_t *flags) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
+  VP9_COMP *cpi = (VP9_COMP *)comp;
+  VP9_COMMON *cm = &cpi->common;
 
-  if (!cpi->common.show_frame) {
+  if (!cm->show_frame) {
     return -1;
   } else {
     int ret;
 #if CONFIG_VP9_POSTPROC
-    ret = vp9_post_proc_frame(&cpi->common, dest, flags);
+    ret = vp9_post_proc_frame(cm, dest, flags);
 #else
 
-    if (cpi->common.frame_to_show) {
-      *dest = *cpi->common.frame_to_show;
-      dest->y_width = cpi->common.width;
-      dest->y_height = cpi->common.height;
-      dest->uv_width = cpi->common.width >> cpi->common.subsampling_x;
-      dest->uv_height = cpi->common.height >> cpi->common.subsampling_y;
+    if (cm->frame_to_show) {
+      *dest = *cm->frame_to_show;
+      dest->y_width = cm->width;
+      dest->y_height = cm->height;
+      dest->uv_width = cm->width >> cm->subsampling_x;
+      dest->uv_height = cm->height >> cm->subsampling_y;
       ret = 0;
     } else {
       ret = -1;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 1ab1814..c4b018a 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -136,7 +136,8 @@
   NSTEP = 1,
   HEX = 2,
   BIGDIA = 3,
-  SQUARE = 4
+  SQUARE = 4,
+  FAST_HEX = 5
 } SEARCH_METHODS;
 
 typedef enum {
@@ -217,6 +218,22 @@
   ENCODE_BREAKOUT_LIMITED = 2
 } ENCODE_BREAKOUT_TYPE;
 
+typedef enum {
+  // Search partitions using RD/NONRD criterion
+  SEARCH_PARTITION = 0,
+
+  // Always use a fixed size partition
+  FIXED_PARTITION = 1,
+
+  // Use a fixed size partition in every 64X64 SB, where the size is
+  // determined based on source variance
+  VAR_BASED_FIXED_PARTITION = 2,
+
+  // Use an arbitrary partitioning scheme based on source variance within
+  // a 64X64 SB
+  VAR_BASED_PARTITION
+} PARTITION_SEARCH_TYPE;
+
 typedef struct {
   // Frame level coding parameter update
   int frame_parameter_update;
@@ -303,16 +320,6 @@
 
   // TODO(JBB): remove this as its no longer used.
 
-  // If set partition size will always be always_this_block_size.
-  int use_one_partition_size_always;
-
-  // Skip rectangular partition test when partition type none gives better
-  // rd than partition type split.
-  int less_rectangular_check;
-
-  // Disable testing non square partitions. (eg 16x32)
-  int use_square_partition_only;
-
   // After looking at the first set of modes (set by index here), skip
   // checking modes for reference frames that don't match the reference frame
   // of the best so far.
@@ -321,9 +328,18 @@
   // TODO(JBB): Remove this.
   int reference_masking;
 
-  // Used in conjunction with use_one_partition_size_always.
+  PARTITION_SEARCH_TYPE partition_search_type;
+
+  // Used if partition_search_type = FIXED_PARTITION
   BLOCK_SIZE always_this_block_size;
 
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split.
+  int less_rectangular_check;
+
+  // Disable testing non square partitions. (eg 16x32)
+  int use_square_partition_only;
+
   // Sets min and max partition sizes for this 64x64 region based on the
   // same 64x64 in last encoded frame, and the left and above neighbor.
   AUTO_MIN_MAX_MODE auto_min_max_partition_size;
@@ -395,7 +411,7 @@
   int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
 
   // This flag controls the use of non-RD mode decision.
-  int use_pick_mode;
+  int use_nonrd_pick_mode;
 
   // This variable sets the encode_breakout threshold. Currently, it is only
   // enabled in real time mode.
@@ -445,7 +461,7 @@
   YV12_BUFFER_CONFIG *un_scaled_source;
   YV12_BUFFER_CONFIG scaled_source;
 
-  unsigned int key_frame_frequency;
+  int key_frame_frequency;
 
   int gold_is_last;  // gold same as last frame ( short circuit gold searches)
   int alt_is_last;  // Alt same as last ( short circuit altref search)
@@ -486,12 +502,6 @@
   // Ambient reconstruction err target for force key frames
   int ambient_err;
 
-  unsigned int mode_chosen_counts[MAX_MODES];
-  unsigned int sub8x8_mode_chosen_counts[MAX_REFS];
-  int64_t mode_skip_mask;
-  int ref_frame_mask;
-  int set_ref_frame_mask;
-
   int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
   int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
   int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS];
@@ -589,6 +599,8 @@
   int fixed_divide[512];
 
 #if CONFIG_INTERNAL_STATS
+  unsigned int mode_chosen_counts[MAX_MODES];
+
   int    count;
   double total_y;
   double total_u;
@@ -702,8 +714,6 @@
 
 void vp9_encode_frame(VP9_COMP *cpi);
 
-void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
-
 void vp9_set_speed_features(VP9_COMP *cpi);
 
 int vp9_calc_ss_err(const YV12_BUFFER_CONFIG *source,
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 6fed3d5..87f20fa 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -98,8 +98,14 @@
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1,
-                         &cpi->fn_ptr[bsize], &ref_mv.as_mv, tmp_mv);
+  if (cpi->sf.search_method == FAST_HEX) {
+    vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, &cpi->fn_ptr[bsize],
+                        1, &ref_mv.as_mv, &tmp_mv->as_mv);
+  } else {
+    vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps,
+                           1, &cpi->fn_ptr[bsize], &ref_mv.as_mv,
+                           &tmp_mv->as_mv);
+  }
 
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
@@ -128,11 +134,52 @@
   // calculate the bit cost on motion vector
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-
-
   return bestsme;
 }
 
+static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    const TileInfo *const tile,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    int_mv *tmp_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+  int ref = mbmi->ref_frame[0];
+  int_mv ref_mv = mbmi->ref_mvs[ref][0];
+  int dis;
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
+                                                                        ref);
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+
+    setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  tmp_mv->as_mv.col >>= 3;
+  tmp_mv->as_mv.row >>= 3;
+
+  cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+                               cpi->common.allow_high_precision_mv,
+                               x->errorperbit,
+                               &cpi->fn_ptr[bsize],
+                               cpi->sf.subpel_force_stop,
+                               cpi->sf.subpel_iters_per_step,
+                               x->nmvjointcost, x->mvcost,
+                               &dis, &x->pred_sse[ref]);
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+}
+
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
 int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -143,16 +190,21 @@
                             BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
-  MB_PREDICTION_MODE this_mode;
-  MV_REFERENCE_FRAME ref_frame;
+  MB_PREDICTION_MODE this_mode, best_mode = ZEROMV;
+  MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
   int64_t this_rd;
-  int64_t cost[4]= { 0, 100, 150,  205 };
+  static const int cost[4]= { 0, 50, 75, 100 };
+
+  const int64_t inter_mode_thresh = 300;
+  const int64_t intra_mode_cost = 50;
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -162,6 +214,7 @@
 
   // initialize mode decisions
   *returnrate = INT_MAX;
+  *returndistortion = INT64_MAX;
   vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
   mbmi->sb_type = bsize;
   mbmi->ref_frame[0] = NONE;
@@ -196,12 +249,14 @@
     clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
     clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
 
+    mbmi->ref_frame[0] = ref_frame;
+
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
       int rate = cost[INTER_OFFSET(this_mode)];
       int64_t dist;
 
       if (this_mode == NEWMV) {
-        if (this_rd < 300)
+        if (this_rd < 500)
           continue;
 
         x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
@@ -210,26 +265,55 @@
 
         if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
           continue;
+
+        sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+                                &frame_mv[NEWMV][ref_frame]);
       }
 
-      dist = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)];
+      mbmi->mode = this_mode;
+      mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+
+      dist = cpi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride,
+                                    pd->dst.buf, pd->dst.stride, INT_MAX);
       this_rd = rate + dist;
 
       if (this_rd < best_rd) {
         best_rd = this_rd;
-        mbmi->mode = this_mode;
-        mbmi->ref_frame[0] = ref_frame;
-        mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
-        xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
-        mbmi->uv_mode = this_mode;
+        best_mode = this_mode;
+        best_ref_frame = ref_frame;
       }
     }
   }
 
-  // TODO(jingning) sub-pixel motion search, if NEWMV is chosen
+  mbmi->mode = best_mode;
+  mbmi->ref_frame[0] = best_ref_frame;
+  mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
+  xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
 
-  // TODO(jingning) intra prediction search, if the best SAD is above a certain
+  // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
+  if (best_rd > inter_mode_thresh) {
+    for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
+      vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
+                              mbmi->tx_size, this_mode,
+                              &p->src.buf[0], p->src.stride,
+                              &pd->dst.buf[0], pd->dst.stride, 0, 0, 0);
+
+      this_rd = cpi->fn_ptr[bsize].sdf(p->src.buf,
+                                       p->src.stride,
+                                       pd->dst.buf,
+                                       pd->dst.stride, INT_MAX);
+
+      if (this_rd + intra_mode_cost < best_rd) {
+        best_rd = this_rd;
+        mbmi->mode = this_mode;
+        mbmi->ref_frame[0] = INTRA_FRAME;
+        mbmi->uv_mode = this_mode;
+        mbmi->mv[0].as_int = INVALID_MV;
+      }
+    }
+  }
 
   return INT64_MAX;
 }
diff --git a/vp9/encoder/vp9_psnr.c b/vp9/encoder/vp9_psnr.c
deleted file mode 100644
index 58294e1..0000000
--- a/vp9/encoder/vp9_psnr.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-
-#include "vpx_scale/yv12config.h"
-
-#define MAX_PSNR 100
-
-double vp9_mse2psnr(double samples, double peak, double mse) {
-  double psnr;
-
-  if (mse > 0.0)
-    psnr = 10.0 * log10(peak * peak * samples / mse);
-  else
-    psnr = MAX_PSNR;  // Limit to prevent / 0
-
-  if (psnr > MAX_PSNR)
-    psnr = MAX_PSNR;
-
-  return psnr;
-}
diff --git a/vp9/encoder/vp9_psnr.h b/vp9/encoder/vp9_psnr.h
deleted file mode 100644
index ffe00ed..0000000
--- a/vp9/encoder/vp9_psnr.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_VP9_PSNR_H_
-#define VP9_ENCODER_VP9_PSNR_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-double vp9_mse2psnr(double samples, double peak, double mse);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_ENCODER_VP9_PSNR_H_
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 862573f..f68aba4 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -9,13 +9,14 @@
  */
 
 #include <math.h>
+
 #include "vpx_mem/vpx_mem.h"
 
 #include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_quant_common.h"
+#include "vp9/encoder/vp9_rdopt.h"
 
+#include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
 
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
@@ -26,7 +27,7 @@
                       const int16_t *dequant_ptr,
                       int zbin_oq_value, uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan) {
-  int i, non_zero_count = count, eob = -1;
+  int i, non_zero_count = (int)count, eob = -1;
   const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
                          zbin_ptr[1] + zbin_oq_value };
   const int nzbins[2] = { zbins[0] * -1,
@@ -37,7 +38,7 @@
 
   if (!skip_block) {
     // Pre-scan pass
-    for (i = count - 1; i >= 0; i--) {
+    for (i = (int)count - 1; i >= 0; i--) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
 
@@ -151,44 +152,40 @@
 }
 
 void vp9_init_quantizer(VP9_COMP *cpi) {
-  int i, q;
   VP9_COMMON *const cm = &cpi->common;
+  int i, q, quant;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
     const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80);
     const int qrounding_factor = q == 0 ? 64 : 48;
 
-    // y
     for (i = 0; i < 2; ++i) {
-      const int quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
-                               : vp9_ac_quant(q, 0);
+      // y
+      quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
+                     : vp9_ac_quant(q, 0);
       invert_quant(&cpi->y_quant[q][i], &cpi->y_quant_shift[q][i], quant);
       cpi->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       cpi->y_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->y_dequant[q][i] = quant;
-    }
 
-    // uv
-    for (i = 0; i < 2; ++i) {
-      const int quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q)
-                               : vp9_ac_quant(q, cm->uv_ac_delta_q);
+      // uv
+      quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q)
+                     : vp9_ac_quant(q, cm->uv_ac_delta_q);
       invert_quant(&cpi->uv_quant[q][i], &cpi->uv_quant_shift[q][i], quant);
       cpi->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       cpi->uv_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->uv_dequant[q][i] = quant;
-    }
 
 #if CONFIG_ALPHA
-    // alpha
-    for (i = 0; i < 2; ++i) {
-      const int quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q)
-                               : vp9_ac_quant(q, cm->a_ac_delta_q);
+      // alpha
+      quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q)
+                     : vp9_ac_quant(q, cm->a_ac_delta_q);
       invert_quant(&cpi->a_quant[q][i], &cpi->a_quant_shift[q][i], quant);
       cpi->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       cpi->a_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->a_dequant[q][i] = quant;
-    }
 #endif
+    }
 
     for (i = 2; i < 8; i++) {
       cpi->y_quant[q][i] = cpi->y_quant[q][1];
@@ -214,7 +211,7 @@
   }
 }
 
-void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
+void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   const VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
@@ -246,7 +243,7 @@
   x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
   x->plane[3].zbin = cpi->a_zbin[qindex];
   x->plane[3].round = cpi->a_round[qindex];
-  x->plane[3].zbin_extra = (int16_t)zbin_extra;
+  x->plane[3].zbin_extra = (int16_t)((cm->a_dequant[qindex][1] * zbin) >> 7);
   xd->plane[3].dequant = cm->a_dequant[qindex];
 #endif
 
@@ -272,26 +269,17 @@
 }
 
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
-  // Clear Zbin mode boost for default case
   cpi->zbin_mode_boost = 0;
-
-  // MB level quantizer setup
-  vp9_mb_init_quantizer(cpi, &cpi->mb);
+  vp9_init_plane_quantizers(cpi, &cpi->mb);
 }
 
 void vp9_set_quantizer(struct VP9_COMP *cpi, int q) {
-  VP9_COMMON *cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
 
+  // quantizer has to be reinitialized with vp9_init_quantizer() if any
+  // delta_q changes.
   cm->base_qindex = q;
-
-  // if any of the delta_q values are changing update flag will
-  // have to be set.
   cm->y_dc_delta_q = 0;
   cm->uv_dc_delta_q = 0;
   cm->uv_ac_delta_q = 0;
-
-  // quantizer has to be reinitialized if any delta_q changes.
-  // As there are not any here for now this is inactive code.
-  // if(update)
-  //    vp9_init_quantizer(cpi);
 }
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 680cf4a..f356b12 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -28,7 +28,7 @@
 
 void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
 
-void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
+void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
 
 void vp9_init_quantizer(struct VP9_COMP *cpi);
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 2427dbe..89aa821 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -8,23 +8,24 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
 #include <assert.h>
+#include <limits.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_common.h"
-#include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/common/vp9_entropymode.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ratectrl.h"
 
 #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
 
@@ -354,7 +355,7 @@
   int projected_size_based_on_q = 0;
 
   // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
@@ -499,12 +500,10 @@
   // (at buffer = critical level).
   const VP9_CONFIG *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
-  // int active_worst_quality = rc->active_worst_quality;
-  // Maximum limit for down adjustment, ~20%.
   // Buffer level below which we push active_worst to worst_quality.
-  int critical_level = oxcf->optimal_buffer_level >> 2;
+  int64_t critical_level = oxcf->optimal_buffer_level >> 2;
+  int64_t buff_lvl_step = 0;
   int adjustment = 0;
-  int buff_lvl_step = 0;
   int active_worst_quality;
   if (cpi->common.frame_type == KEY_FRAME)
     return rc->worst_quality;
@@ -516,10 +515,11 @@
                                rc->avg_frame_qindex[KEY_FRAME] * 3 / 2);
   if (rc->buffer_level > oxcf->optimal_buffer_level) {
     // Adjust down.
+    // Maximum limit for down adjustment: ~33% (active_worst_quality / 3).
     int max_adjustment_down = active_worst_quality / 3;
     if (max_adjustment_down) {
-      buff_lvl_step = (int)((oxcf->maximum_buffer_size -
-          oxcf->optimal_buffer_level) / max_adjustment_down);
+      buff_lvl_step = ((oxcf->maximum_buffer_size -
+                        oxcf->optimal_buffer_level) / max_adjustment_down);
       if (buff_lvl_step)
         adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
                             buff_lvl_step);
@@ -530,9 +530,10 @@
     if (critical_level) {
       buff_lvl_step = (oxcf->optimal_buffer_level - critical_level);
       if (buff_lvl_step) {
-        adjustment = (rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
-                         (oxcf->optimal_buffer_level - rc->buffer_level) /
-                             buff_lvl_step;
+        adjustment =
+            (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
+                  (oxcf->optimal_buffer_level - rc->buffer_level) /
+                  buff_lvl_step);
       }
       active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
     }
@@ -958,17 +959,10 @@
   }
 
   // Clip the active best and worst quality values to limits.
-  if (active_worst_quality > rc->worst_quality)
-    active_worst_quality = rc->worst_quality;
-
-  if (active_best_quality < rc->best_quality)
-    active_best_quality = rc->best_quality;
-
-  if (active_best_quality > rc->worst_quality)
-    active_best_quality = rc->worst_quality;
-
-  if (active_worst_quality < active_best_quality)
-    active_worst_quality = active_best_quality;
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
 
   *top_index = active_worst_quality;
   *bottom_index = active_best_quality;
@@ -1041,7 +1035,7 @@
   // JBB : This is realtime mode.  In real time mode the first frame
   // should be larger. Q of 0 is disabled because we force tx size to be
   // 16x16...
-  if (cpi->sf.use_pick_mode) {
+  if (cpi->sf.use_nonrd_pick_mode) {
     if (cpi->common.current_video_frame == 0)
       q /= 3;
     if (q == 0)
@@ -1151,7 +1145,7 @@
 
   cm->last_frame_type = cm->frame_type;
   // Update rate control heuristics
-  rc->projected_frame_size = (bytes_used << 3);
+  rc->projected_frame_size = (int)(bytes_used << 3);
 
   // Post encode loop adjustment of Q prediction.
   vp9_rc_update_rate_correction_factors(
@@ -1310,11 +1304,12 @@
   const VP9_CONFIG *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
   const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level;
-  const int one_pct_bits = 1 + oxcf->optimal_buffer_level / 100;
+  const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100;
   int min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
                              FRAME_OVERHEAD_BITS);
   int target = rc->av_per_frame_bandwidth;
-  if (cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+  if (cpi->svc.number_temporal_layers > 1 &&
+      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
     // Note that for layers, av_per_frame_bandwidth is the cumulative
     // per-frame-bandwidth. For the target size of this frame, use the
     // layer average frame size (i.e., non-cumulative per-frame-bw).
@@ -1325,11 +1320,11 @@
   }
   if (diff > 0) {
     // Lower the target bandwidth for this frame.
-    const int pct_low = MIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    const int pct_low = (int)MIN(diff / one_pct_bits, oxcf->under_shoot_pct);
     target -= (target * pct_low) / 200;
   } else if (diff < 0) {
     // Increase the target bandwidth for this frame.
-    const int pct_high = MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
     target += (target * pct_high) / 200;
   }
   return MAX(min_frame_target, target);
@@ -1337,9 +1332,11 @@
 
 static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   const RATE_CONTROL *rc = &cpi->rc;
+  int target;
 
   if (cpi->common.current_video_frame == 0) {
-    return cpi->oxcf.starting_buffer_level / 2;
+    target = ((cpi->oxcf.starting_buffer_level / 2) > INT_MAX)
+      ? INT_MAX : (int)(cpi->oxcf.starting_buffer_level / 2);
   } else {
     const int initial_boost = 32;
     int kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
@@ -1347,8 +1344,9 @@
       kf_boost = (int)(kf_boost * rc->frames_since_key /
                        (cpi->output_framerate / 2));
     }
-    return ((16 + kf_boost) * rc->av_per_frame_bandwidth) >> 4;
+    target = ((16 + kf_boost) * rc->av_per_frame_bandwidth) >> 4;
   }
+  return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
 void vp9_rc_get_svc_params(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 551b6c3..5dbc7d1 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -34,17 +34,17 @@
   double key_frame_rate_correction_factor;
   double gf_rate_correction_factor;
 
-  unsigned int frames_since_golden;
-  unsigned int frames_till_gf_update_due;  // Count down till next GF
-  unsigned int max_gf_interval;
-  unsigned int baseline_gf_interval;
-  unsigned int frames_to_key;
-  unsigned int frames_since_key;
-  unsigned int this_key_frame_forced;
-  unsigned int next_key_frame_forced;
-  unsigned int source_alt_ref_pending;
-  unsigned int source_alt_ref_active;
-  unsigned int is_src_frame_alt_ref;
+  int frames_since_golden;
+  int frames_till_gf_update_due;
+  int max_gf_interval;
+  int baseline_gf_interval;
+  int frames_to_key;
+  int frames_since_key;
+  int this_key_frame_forced;
+  int next_key_frame_forced;
+  int source_alt_ref_pending;
+  int source_alt_ref_active;
+  int is_src_frame_alt_ref;
 
   int av_per_frame_bandwidth;     // Average frame size target for clip
   int min_frame_bandwidth;        // Minimum allocation used for any frame
@@ -57,8 +57,8 @@
   double tot_q;
   double avg_q;
 
-  int buffer_level;
-  int bits_off_target;
+  int64_t buffer_level;
+  int64_t bits_off_target;
 
   int decimation_factor;
   int decimation_count;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index aed4de5..b57b948 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -36,6 +36,7 @@
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
 
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
@@ -272,18 +273,12 @@
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCK *x = &cpi->mb;
-  int qindex, i;
+  int i;
 
-  vp9_clear_system_state();  // __asm emms;
-
-  // Further tests required to see if optimum is different
-  // for key frames, golden frames and arf frames.
-  // if (cpi->common.refresh_golden_frame ||
-  //     cpi->common.refresh_alt_ref_frame)
-  qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ);
+  vp9_clear_system_state();
 
   cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
-  cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
+  cpi->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
 
   x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
   x->errorperbit += (x->errorperbit == 0);
@@ -295,21 +290,22 @@
 
   set_block_thresholds(cpi);
 
-  fill_token_costs(x->token_costs, cm->fc.coef_probs);
+  if (!cpi->sf.use_nonrd_pick_mode) {
+    fill_token_costs(x->token_costs, cm->fc.coef_probs);
 
-  if (!cpi->sf.use_pick_mode) {
     for (i = 0; i < PARTITION_CONTEXTS; i++)
       vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
                       vp9_partition_tree);
+  }
 
+  if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1) {
     fill_mode_costs(cpi);
 
     if (!frame_is_intra_only(cm)) {
       vp9_build_nmv_cost_table(x->nmvjointcost,
                                cm->allow_high_precision_mv ? x->nmvcost_hp
                                                            : x->nmvcost,
-                               &cm->fc.nmvc,
-                               cm->allow_high_precision_mv, 1, 1);
+                               &cm->fc.nmvc, cm->allow_high_precision_mv);
 
       for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
         vp9_cost_tokens((int *)x->inter_mode_cost[i],
@@ -415,9 +411,10 @@
     *dist = 0;
   } else {
     int d_q10, r_q10;
-    uint64_t xsq_q10_64 =
+    const uint64_t xsq_q10_64 =
         ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
-    int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ? MAX_XSQ_Q10 : xsq_q10_64;
+    const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ?
+                        MAX_XSQ_Q10 : (int)xsq_q10_64;
     model_rd_norm(xsq_q10, &r_q10, &d_q10);
     *rate = (n * r_q10 + 2) >> 2;
     *dist = (var * (int64_t)d_q10 + 512) >> 10;
@@ -430,7 +427,9 @@
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
-  int i, rate_sum = 0, dist_sum = 0;
+  int i;
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
   int ref = xd->mi_8x8[0]->mbmi.ref_frame[0];
   unsigned int sse;
 
@@ -444,20 +443,33 @@
 
     if (i == 0)
       x->pred_sse[ref] = sse;
-    if (cpi->sf.use_pick_mode) {
-      dist_sum += (int)sse;
+
+    // Fast approximation of the modelling function.
+    if (cpi->speed > 4) {
+      int64_t rate;
+      int64_t dist;
+      int64_t square_error = sse;
+      int quantizer = (pd->dequant[1] >> 3);
+
+      if (quantizer < 120)
+        rate = (square_error * (280 - quantizer)) >> 8;
+      else
+        rate = 0;
+      dist = (square_error * quantizer) >> 8;
+      rate_sum += rate;
+      dist_sum += dist;
     } else {
       int rate;
       int64_t dist;
       model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
                                pd->dequant[1] >> 3, &rate, &dist);
       rate_sum += rate;
-      dist_sum += (int)dist;
+      dist_sum += dist;
     }
   }
 
-  *out_rate_sum = rate_sum;
-  *out_dist_sum = (int64_t)dist_sum << 4;
+  *out_rate_sum = (int)rate_sum;
+  *out_dist_sum = dist_sum << 4;
 }
 
 static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
@@ -508,15 +520,15 @@
   *out_dist_sum = dist_sum << 4;
 }
 
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
   int64_t error = 0, sqcoeff = 0;
 
   for (i = 0; i < block_size; i++) {
-    int this_diff = coeff[i] - dqcoeff[i];
-    error += (unsigned)this_diff * this_diff;
-    sqcoeff += (unsigned) coeff[i] * coeff[i];
+    const int diff = coeff[i] - dqcoeff[i];
+    error +=  diff * diff;
+    sqcoeff += coeff[i] * coeff[i];
   }
 
   *ssz = sqcoeff;
@@ -547,18 +559,16 @@
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *band_count = &band_counts[tx_size][1];
   const int eob = p->eobs[block];
-  const int16_t *const qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
-  const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
+  const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-                   x->token_costs[tx_size][type][ref];
-  const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
+                   x->token_costs[tx_size][type][is_inter_block(mbmi)];
   uint8_t *p_tok = x->token_cache;
-  int pt = combine_entropy_contexts(above_ec, left_ec);
+  int pt = combine_entropy_contexts(*A, *L);
   int c, cost;
 
   // Check for consistency of tx_size with mode info
   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
-                                      : get_uv_tx_size(mbmi) == tx_size);
+                              : get_uv_tx_size(mbmi) == tx_size);
 
   if (eob == 0) {
     // single eob token
@@ -568,7 +578,7 @@
     int band_left = *band_count++;
 
     // dc token
-    int v = qcoeff_ptr[0];
+    int v = qcoeff[0];
     int prev_t = vp9_dct_value_tokens_ptr[v].token;
     cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
     p_tok[0] = vp9_pt_energy_class[prev_t];
@@ -579,7 +589,7 @@
       const int rc = scan[c];
       int t;
 
-      v = qcoeff_ptr[rc];
+      v = qcoeff[rc];
       t = vp9_dct_value_tokens_ptr[v].token;
       pt = get_coef_context(nb, p_tok, c);
       cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
@@ -676,10 +686,16 @@
   }
 }
 
-void vp9_get_entropy_contexts(TX_SIZE tx_size,
-    ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
-    const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
-    int num_4x4_w, int num_4x4_h) {
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]) {
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+  const ENTROPY_CONTEXT *const above = pd->above_context;
+  const ENTROPY_CONTEXT *const left = pd->left_context;
+
   int i;
   switch (tx_size) {
     case TX_4X4:
@@ -716,9 +732,6 @@
                              BLOCK_SIZE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-  const int num_4x4_w = num_4x4_blocks_wide_lookup[bs];
-  const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
   struct rdcost_block_args args = { 0 };
   args.x = x;
   args.best_rd = ref_best_rd;
@@ -726,9 +739,7 @@
   if (plane == 0)
     xd->mi_8x8[0]->mbmi.tx_size = tx_size;
 
-  vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left,
-                           pd->above_context, pd->left_context,
-                           num_4x4_w, num_4x4_h);
+  vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 
   args.so = get_scan(xd, tx_size, pd->plane_type, 0);
 
@@ -844,6 +855,11 @@
   }
 }
 
+static int64_t scaled_rd_cost(int rdmult, int rddiv,
+                              int rate, int64_t dist, double scale) {
+  return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale);
+}
+
 static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
                                           int (*r)[2], int *rate,
                                           int64_t *d, int64_t *distortion,
@@ -881,10 +897,13 @@
         r[n][1] += vp9_cost_one(tx_probs[m]);
     }
     if (s[n]) {
-      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]) * scale;
+      rd[n][0] = rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, s1, d[n],
+                                           scale);
     } else {
-      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]) * scale;
-      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]) * scale;
+      rd[n][0] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][0] + s0, d[n],
+                                scale);
+      rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][1] + s0, d[n],
+                                scale);
     }
     if (rd[n][1] < best_rd) {
       best_rd = rd[n][1];
@@ -911,27 +930,23 @@
   }
 }
 
-static void super_block_yrd(VP9_COMP *cpi,
-                            MACROBLOCK *x, int *rate, int64_t *distortion,
-                            int *skip, int64_t *psse, BLOCK_SIZE bs,
-                            int64_t txfm_cache[TX_MODES],
-                            int64_t ref_best_rd) {
+static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                  int64_t *distortion, int *skip,
+                                  int64_t *psse, BLOCK_SIZE bs,
+                                  int64_t txfm_cache[TX_MODES],
+                                  int64_t ref_best_rd) {
   int r[TX_SIZES][2], s[TX_SIZES];
   int64_t d[TX_SIZES], sse[TX_SIZES];
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-  const int b_inter_mode = is_inter_block(mbmi);
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   TX_SIZE tx_size;
 
-
   assert(bs == mbmi->sb_type);
-  if (b_inter_mode)
-    vp9_subtract_sby(x, bs);
 
-  if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
-      (cpi->sf.tx_size_search_method != USE_FULL_RD &&
-       !b_inter_mode)) {
+  vp9_subtract_plane(x, bs, 0);
+
+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
                              ref_best_rd, bs);
@@ -940,8 +955,7 @@
     return;
   }
 
-  if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
-      b_inter_mode) {
+  if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) {
     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
       model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
                            &r[tx_size][0], &d[tx_size], &s[tx_size]);
@@ -959,6 +973,36 @@
     *psse = sse[mbmi->tx_size];
 }
 
+static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                  int64_t *distortion, int *skip,
+                                  int64_t *psse, BLOCK_SIZE bs,
+                                  int64_t txfm_cache[TX_MODES],
+                                  int64_t ref_best_rd) {
+  int64_t sse[TX_SIZES];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+
+  assert(bs == mbmi->sb_type);
+  if (cpi->sf.tx_size_search_method != USE_FULL_RD) {
+    vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
+    choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
+                             ref_best_rd, bs);
+  } else {
+    int r[TX_SIZES][2], s[TX_SIZES];
+    int64_t d[TX_SIZES];
+    TX_SIZE tx_size;
+    for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size)
+      txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
+                       &s[tx_size], &sse[tx_size],
+                       ref_best_rd, 0, bs, tx_size);
+    choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
+                             skip, txfm_cache, bs);
+  }
+  if (psse)
+    *psse = sse[mbmi->tx_size];
+}
+
+
 static int conditional_skipintra(MB_PREDICTION_MODE mode,
                                  MB_PREDICTION_MODE best_intra_mode) {
   if (mode == D117_PRED &&
@@ -988,10 +1032,9 @@
                                      int64_t *bestdistortion,
                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
   MB_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int64_t best_rd = rd_thresh;
-  int rate = 0;
-  int64_t distortion;
+
   struct macroblock_plane *p = &x->plane[0];
   struct macroblockd_plane *pd = &xd->plane[0];
   const int src_stride = p->src.stride;
@@ -1000,8 +1043,6 @@
                                                             src_stride)];
   uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
                                                        dst_stride)];
-  int16_t *src_diff, *coeff;
-
   ENTROPY_CONTEXT ta[2], tempa[2];
   ENTROPY_CONTEXT tl[2], templ[2];
 
@@ -1019,6 +1060,8 @@
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
     int ratey = 0;
+    int64_t distortion = 0;
+    int rate = bmode_costs[mode];
 
     if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
       continue;
@@ -1030,55 +1073,50 @@
           continue;
     }
 
-    rate = bmode_costs[mode];
-    distortion = 0;
-
     vpx_memcpy(tempa, ta, sizeof(ta));
     vpx_memcpy(templ, tl, sizeof(tl));
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-        int64_t ssz;
-        const scan_order *so;
-        const uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
-        uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
         const int block = ib + idy * 2 + idx;
-        TX_TYPE tx_type;
+        const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+        uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+        int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
+                                                            p->src_diff);
+        int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         xd->mi_8x8[0]->bmi[block].as_mode = mode;
-        src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
-        coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         vp9_predict_intra_block(xd, block, 1,
                                 TX_4X4, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, idx, idy, 0);
-        vp9_subtract_block(4, 4, src_diff, 8,
-                           src, src_stride,
-                           dst, dst_stride);
+        vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 
-        tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
-        so = &vp9_scan_orders[TX_4X4][tx_type];
-
-        if (tx_type != DCT_DCT)
+        if (xd->lossless) {
+          const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+          vp9_fwht4x4(src_diff, coeff, 8);
+          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                               so->scan, so->neighbors);
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
+                          p->eobs[block]);
+        } else {
+          int64_t unused;
+          const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+          const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
           vp9_fht4x4(src_diff, coeff, 8, tx_type);
-        else
-          x->fwd_txm4x4(src_diff, coeff, 8);
-
-        vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-
-        ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
-                             so->scan, so->neighbors);
-        distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
-                                      16, &ssz) >> 2;
-        if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
-          goto next;
-
-        if (tx_type != DCT_DCT)
-          vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
-                               dst, pd->dst.stride, tx_type);
-        else
-          xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
-                       16);
+          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                               so->scan, so->neighbors);
+          distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+                                        16, &unused) >> 2;
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+                         dst, dst_stride, p->eobs[block]);
+        }
       }
     }
 
@@ -1219,8 +1257,8 @@
     }
     mic->mbmi.mode = mode;
 
-    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
-                    bsize, local_tx_cache, best_rd);
+    intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+        &s, NULL, bsize, local_tx_cache, best_rd);
 
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -1255,7 +1293,7 @@
   return best_rd;
 }
 
-static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
+static void super_block_uvrd(MACROBLOCK *x,
                              int *rate, int64_t *distortion, int *skippable,
                              int64_t *sse, BLOCK_SIZE bsize,
                              int64_t ref_best_rd) {
@@ -1269,8 +1307,11 @@
   if (ref_best_rd < 0)
     goto term;
 
-  if (is_inter_block(mbmi))
-    vp9_subtract_sbuv(x, bsize);
+  if (is_inter_block(mbmi)) {
+    int plane;
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+      vp9_subtract_plane(x, bsize, plane);
+  }
 
   *rate = 0;
   *distortion = 0;
@@ -1302,6 +1343,7 @@
                                        int *rate, int *rate_tokenonly,
                                        int64_t *distortion, int *skippable,
                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE mode_selected = DC_PRED;
   int64_t best_rd = INT64_MAX, this_rd;
@@ -1312,9 +1354,9 @@
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
 
-    x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
+    xd->mi_8x8[0]->mbmi.uv_mode = mode;
 
-    super_block_uvrd(cpi, x, &this_rate_tokenonly,
+    super_block_uvrd(x, &this_rate_tokenonly,
                      &this_distortion, &s, &this_sse, bsize, best_rd);
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -1332,7 +1374,7 @@
       if (!x->select_txfm_size) {
         int i;
         struct macroblock_plane *const p = x->plane;
-        struct macroblockd_plane *const pd = x->e_mbd.plane;
+        struct macroblockd_plane *const pd = xd->plane;
         for (i = 1; i < MAX_MB_PLANE; ++i) {
           p[i].coeff    = ctx->coeff_pbuf[i][2];
           p[i].qcoeff   = ctx->qcoeff_pbuf[i][2];
@@ -1353,25 +1395,21 @@
     }
   }
 
-  x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
+  xd->mi_8x8[0]->mbmi.uv_mode = mode_selected;
   return best_rd;
 }
 
-static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
+static int64_t rd_sbuv_dcpred(const VP9_COMMON *cm, MACROBLOCK *x,
                               int *rate, int *rate_tokenonly,
                               int64_t *distortion, int *skippable,
                               BLOCK_SIZE bsize) {
-  int64_t this_rd;
-  int64_t this_sse;
+  int64_t unused;
 
   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
-  super_block_uvrd(cpi, x, rate_tokenonly, distortion,
-                   skippable, &this_sse, bsize, INT64_MAX);
-  *rate = *rate_tokenonly +
-          x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
-  this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-
-  return this_rd;
+  super_block_uvrd(x, rate_tokenonly, distortion,
+                   skippable, &unused, bsize, INT64_MAX);
+  *rate = *rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED];
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
 static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
@@ -1384,8 +1422,8 @@
   // Use an estimated rd for uv_intra based on DC_PRED if the
   // appropriate speed flag is set.
   if (cpi->sf.use_uv_intra_rd_estimate) {
-    rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
-                   bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+    rd_sbuv_dcpred(&cpi->common, x, rate_uv, rate_uv_tokenonly, dist_uv,
+                   skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   // Else do a proper rd search for each possible transform size that may
   // be considered in the main rd loop.
   } else {
@@ -1399,8 +1437,7 @@
 static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
                        int mode_context) {
   MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
+  const int segment_id = x->e_mbd.mi_8x8[0]->mbmi.segment_id;
 
   // Don't account for mode here if segment skip is enabled.
   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
@@ -1425,7 +1462,7 @@
                                 int *rate_mv);
 
 static int labels2mode(MACROBLOCK *x, int i,
-                       MB_PREDICTION_MODE this_mode,
+                       MB_PREDICTION_MODE mode,
                        int_mv *this_mv, int_mv *this_second_mv,
                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
                        int_mv seg_mvs[MAX_REF_FRAMES],
@@ -1435,23 +1472,18 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mi_8x8[0];
   MB_MODE_INFO *mbmi = &mic->mbmi;
-  int cost = 0, thismvcost = 0;
+  int thismvcost = 0;
   int idx, idy;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   const int has_second_rf = has_second_ref(mbmi);
 
-  /* We have to be careful retrieving previously-encoded motion vectors.
-   Ones from this macroblock have to be pulled from the BLOCKD array
-   as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  MB_PREDICTION_MODE m;
-
   // the only time we should do costing for new motion vector or mode
   // is when we are on a new label  (jbb May 08, 2007)
-  switch (m = this_mode) {
+  switch (mode) {
     case NEWMV:
       this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
-      thismvcost  = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
+      thismvcost += vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
       if (has_second_rf) {
         this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
@@ -1463,14 +1495,12 @@
     case NEARESTMV:
       this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
       if (has_second_rf)
-        this_second_mv->as_int =
-            frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
+        this_second_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
       break;
     case NEARMV:
       this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
       if (has_second_rf)
-        this_second_mv->as_int =
-            frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
+        this_second_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
       break;
     case ZEROMV:
       this_mv->as_int = 0;
@@ -1481,22 +1511,19 @@
       break;
   }
 
-  cost = cost_mv_ref(cpi, this_mode,
-                     mbmi->mode_context[mbmi->ref_frame[0]]);
-
   mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
   if (has_second_rf)
     mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
 
-  mic->bmi[i].as_mode = m;
+  mic->bmi[i].as_mode = mode;
 
   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
                  &mic->bmi[i], sizeof(mic->bmi[i]));
 
-  cost += thismvcost;
-  return cost;
+  return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
+            thismvcost;
 }
 
 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
@@ -1819,28 +1846,28 @@
             bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                              sadpb, further_steps, 0, v_fn_ptr,
                                              &bsi->ref_mv->as_mv,
-                                             new_mv);
+                                             &new_mv->as_mv);
           }
 
           // Should we do a full search (best quality only)
           if (cpi->oxcf.mode == MODE_BESTQUALITY ||
               cpi->oxcf.mode == MODE_SECONDPASS_BEST) {
+            int_mv *const best_mv = &mi->bmi[i].as_mv[0];
             /* Check if mvp_full is within the range. */
             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
                      x->mv_row_min, x->mv_row_max);
-
             thissme = cpi->full_search_sad(x, &mvp_full,
                                            sadpb, 16, v_fn_ptr,
                                            x->nmvjointcost, x->mvcost,
-                                           &bsi->ref_mv->as_mv, i);
-
+                                           &bsi->ref_mv->as_mv,
+                                           &best_mv->as_mv);
             if (thissme < bestsme) {
               bestsme = thissme;
-              new_mv->as_int = mi->bmi[i].as_mv[0].as_int;
+              new_mv->as_int = best_mv->as_int;
             } else {
-              /* The full search result is actually worse so re-instate the
-               * previous best vector */
-              mi->bmi[i].as_mv[0].as_int = new_mv->as_int;
+              // The full search result is actually worse, so reinstate the
+              // previous best vector.
+              best_mv->as_int = new_mv->as_int;
             }
           }
 
@@ -2345,7 +2372,7 @@
   int sadpb = x->sadperbit16;
   MV mvp_full;
   int ref = mbmi->ref_frame[0];
-  int_mv ref_mv = mbmi->ref_mvs[ref][0];
+  MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
 
   int tmp_col_min = x->mv_col_min;
   int tmp_col_max = x->mv_col_max;
@@ -2355,10 +2382,10 @@
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
 
-  int_mv pred_mv[3];
-  pred_mv[0] = mbmi->ref_mvs[ref][0];
-  pred_mv[1] = mbmi->ref_mvs[ref][1];
-  pred_mv[2] = x->pred_mv[ref];
+  MV pred_mv[3];
+  pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
+  pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
+  pred_mv[2] = x->pred_mv[ref].as_mv;
 
   if (scaled_ref_frame) {
     int i;
@@ -2371,7 +2398,7 @@
     setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   }
 
-  vp9_set_mv_search_range(x, &ref_mv.as_mv);
+  vp9_set_mv_search_range(x, &ref_mv);
 
   // Work out the size of the first step in the mv step search.
   // 0 here is maximum length first step. 1 is MAX >> 1 etc.
@@ -2416,7 +2443,7 @@
     }
   }
 
-  mvp_full = pred_mv[x->mv_best_ref_index[ref]].as_mv;
+  mvp_full = pred_mv[x->mv_best_ref_index[ref]];
 
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
@@ -2424,23 +2451,27 @@
   // Further step/diamond searches as necessary
   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
-  if (cpi->sf.search_method == HEX) {
+  if (cpi->sf.search_method == FAST_HEX) {
+    bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb,
+                                  &cpi->fn_ptr[bsize], 1,
+                                  &ref_mv, &tmp_mv->as_mv);
+  } else if (cpi->sf.search_method == HEX) {
     bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
                              &cpi->fn_ptr[bsize], 1,
-                             &ref_mv.as_mv, &tmp_mv->as_mv);
+                             &ref_mv, &tmp_mv->as_mv);
   } else if (cpi->sf.search_method == SQUARE) {
     bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1,
                                 &cpi->fn_ptr[bsize], 1,
-                                &ref_mv.as_mv, &tmp_mv->as_mv);
+                                &ref_mv, &tmp_mv->as_mv);
   } else if (cpi->sf.search_method == BIGDIA) {
     bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1,
                                 &cpi->fn_ptr[bsize], 1,
-                                &ref_mv.as_mv, &tmp_mv->as_mv);
+                                &ref_mv, &tmp_mv->as_mv);
   } else {
     bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                      sadpb, further_steps, 1,
                                      &cpi->fn_ptr[bsize],
-                                     &ref_mv.as_mv, tmp_mv);
+                                     &ref_mv, &tmp_mv->as_mv);
   }
 
   x->mv_col_min = tmp_col_min;
@@ -2450,7 +2481,7 @@
 
   if (bestsme < INT_MAX) {
     int dis;  /* TODO: use dis in distortion calculation later. */
-    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
                                  cm->allow_high_precision_mv,
                                  x->errorperbit,
                                  &cpi->fn_ptr[bsize],
@@ -2459,7 +2490,7 @@
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref]);
   }
-  *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
+  *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
   if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
@@ -2686,6 +2717,8 @@
       int_mv tmp_mv;
       single_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
                            &tmp_mv, &rate_mv);
+      if (tmp_mv.as_int == INVALID_MV)
+        return INT64_MAX;
       *rate2 += rate_mv;
       frame_mv[refs[0]].as_int =
           xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
@@ -2698,7 +2731,7 @@
       frame_mv[refs[0]].as_int == 0 &&
       !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
       (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) {
-    int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
+    int rfc = mbmi->mode_context[refs[0]];
     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
@@ -2713,17 +2746,17 @@
       assert(this_mode == ZEROMV);
       if (num_refs == 1) {
         if ((c3 >= c2 &&
-             mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
+             mode_mv[NEARESTMV][refs[0]].as_int == 0) ||
             (c3 >= c1 &&
-             mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
+             mode_mv[NEARMV][refs[0]].as_int == 0))
           return INT64_MAX;
       } else {
         if ((c3 >= c2 &&
-             mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
-             mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
+             mode_mv[NEARESTMV][refs[0]].as_int == 0 &&
+             mode_mv[NEARESTMV][refs[1]].as_int == 0) ||
             (c3 >= c1 &&
-             mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
-             mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
+             mode_mv[NEARMV][refs[0]].as_int == 0 &&
+             mode_mv[NEARMV][refs[1]].as_int == 0))
           return INT64_MAX;
       }
     }
@@ -2754,8 +2787,7 @@
    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
    * words if you present them in that order, the second one is always known
    * if the first is known */
-  *rate2 += cost_mv_ref(cpi, this_mode,
-                        mbmi->mode_context[mbmi->ref_frame[0]]);
+  *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
 
   if (!(*mode_excluded))
     *mode_excluded = is_comp_pred ? cm->reference_mode == SINGLE_REFERENCE
@@ -2905,8 +2937,8 @@
       const unsigned int max_thresh = (cpi->allow_encode_breakout ==
                                       ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
       // The encode_breakout input
-      const unsigned int min_thresh = ((x->encode_breakout << 4) > max_thresh) ?
-                                      max_thresh : (x->encode_breakout << 4);
+      const unsigned int min_thresh =
+          MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
 
       // Calculate threshold according to dequant value.
       thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
@@ -2973,8 +3005,8 @@
     int64_t rdcosty = INT64_MAX;
 
     // Y cost and distortion
-    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
-                    bsize, txfm_cache, ref_best_rd);
+    inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
+                          bsize, txfm_cache, ref_best_rd);
 
     if (*rate_y == INT_MAX) {
       *rate2 = INT_MAX;
@@ -2989,7 +3021,7 @@
     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
 
-    super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
+    super_block_uvrd(x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
                      bsize, ref_best_rd - rdcosty);
     if (*rate_uv == INT_MAX) {
       *rate2 = INT_MAX;
@@ -3097,10 +3129,10 @@
                                   BLOCK_SIZE bsize,
                                   PICK_MODE_CONTEXT *ctx,
                                   int64_t best_rd_so_far) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  const struct segmentation *seg = &cm->seg;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  const struct segmentation *const seg = &cm->seg;
   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   MB_PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
@@ -3136,12 +3168,14 @@
   const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
   int best_skip2 = 0;
+  int mode_skip_mask = 0;
+  const int mode_skip_start = cpi->sf.mode_skip_start + 1;
+  const int *const rd_threshes = cpi->rd_threshes[segment_id][bsize];
+  const int *const rd_thresh_freq_fact = cpi->rd_thresh_freq_fact[bsize];
+  const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
-  // Everywhere the flag is set the error is much higher than its neighbors.
-  ctx->modes_with_high_error = 0;
-
   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
 
@@ -3169,16 +3203,72 @@
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  cpi->ref_frame_mask = 0;
-  for (ref_frame = LAST_FRAME;
-       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
-    int i;
-    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-      if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
-        cpi->ref_frame_mask |= (1 << ref_frame);
-        break;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    // All modes from vp9_mode_order that use this frame as any ref
+    static const int ref_frame_mask_all[] = {
+        0x0, 0x123291, 0x25c444, 0x39b722
+    };
+    // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
+    // this frame as their primary ref
+    static const int ref_frame_mask_fixedmv[] = {
+        0x0, 0x121281, 0x24c404, 0x080102
+    };
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
+      // Skip modes for missing references
+      mode_skip_mask |= ref_frame_mask_all[ref_frame];
+    } else if (cpi->sf.reference_masking) {
+      for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+        // Skip fixed mv modes for poor references
+        if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+          mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
+          break;
+        }
       }
     }
+    // If the segment reference frame feature is enabled, skip every mode
+    // that uses a reference frame other than the one the segment specifies.
+    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      mode_skip_mask |= ref_frame_mask_all[ref_frame];
+    }
+  }
+
+  // If the segment skip feature is enabled, mask off every inter mode
+  // that is not a ZEROMV mode.
+  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+    const int inter_non_zero_mode_mask = 0x1F7F7;
+    mode_skip_mask |= inter_non_zero_mode_mask;
+  }
+
+  // Disable this drop out case if the ref frame
+  // segment level feature is enabled for this segment. This is to
+  // prevent the possibility that we end up unable to pick any mode.
+  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+    // unless ARNR filtering is enabled in which case we want
+    // an unfiltered alternative. We allow near/nearest as well
+    // because they may result in zero-zero MVs but be cheaper.
+    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+      const int altref_zero_mask =
+          ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
+      mode_skip_mask |= altref_zero_mask;
+      if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask |= (1 << THR_NEARA);
+      if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask |= (1 << THR_NEARESTA);
+    }
+  }
+
+  // TODO(JBB): This is to make up for the fact that we don't have sad
+  // functions that work when the block size reads outside the umv. We
+  // should fix this by first making the motion search just work on a
+  // representative block in the boundary, and then implementing a
+  // function that does sads when inside the border.
+  if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
+    const int new_modes_mask =
+        (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
+        (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
+    mode_skip_mask |= new_modes_mask;
   }
 
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
@@ -3192,103 +3282,95 @@
     int64_t tx_cache[TX_MODES];
     int i;
     int this_skip2 = 0;
-    int64_t total_sse = INT_MAX;
+    int64_t total_sse = INT64_MAX;
     int early_term = 0;
 
-    for (i = 0; i < TX_MODES; ++i)
-      tx_cache[i] = INT64_MAX;
+    // Look at the reference frame of the best mode so far and set the
+    // skip mask to look at a subset of the remaining modes.
+    if (mode_index == mode_skip_start) {
+      switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
+        case INTRA_FRAME:
+          break;
+        case LAST_FRAME:
+          mode_skip_mask |= LAST_FRAME_MODE_MASK;
+          break;
+        case GOLDEN_FRAME:
+          mode_skip_mask |= GOLDEN_FRAME_MODE_MASK;
+          break;
+        case ALTREF_FRAME:
+          mode_skip_mask |= ALT_REF_MODE_MASK;
+          break;
+        case NONE:
+        case MAX_REF_FRAMES:
+          assert(0 && "Invalid Reference frame");
+      }
+    }
+    if (mode_skip_mask & (1 << mode_index))
+      continue;
 
-    x->skip = 0;
+    // Test best rd so far against threshold for trying this mode.
+    if (best_rd < ((int64_t)rd_threshes[mode_index] *
+                  rd_thresh_freq_fact[mode_index] >> 5) ||
+        rd_threshes[mode_index] == INT_MAX)
+     continue;
+
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
 
-    // Look at the reference frame of the best mode so far and set the
-    // skip mask to look at a subset of the remaining modes.
-    if (mode_index > cpi->sf.mode_skip_start) {
-      if (mode_index == (cpi->sf.mode_skip_start + 1)) {
-        switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
-          case INTRA_FRAME:
-            cpi->mode_skip_mask = 0;
-            break;
-          case LAST_FRAME:
-            cpi->mode_skip_mask = LAST_FRAME_MODE_MASK;
-            break;
-          case GOLDEN_FRAME:
-            cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK;
-            break;
-          case ALTREF_FRAME:
-            cpi->mode_skip_mask = ALT_REF_MODE_MASK;
-            break;
-          case NONE:
-          case MAX_REF_FRAMES:
-            assert(0 && "Invalid Reference frame");
-        }
-      }
-      if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
-        continue;
-    }
-
-    // Skip if the current reference frame has been masked off
-    if (cpi->ref_frame_mask & (1 << ref_frame) && this_mode != NEWMV)
-      continue;
-
-    // Test best rd so far against threshold for trying this mode.
-    if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] *
-                     cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) ||
-        cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX)
-      continue;
-
-    // Do not allow compound prediction if the segment level reference
-    // frame feature is in use as in this case there can only be one reference.
-    if ((second_ref_frame > INTRA_FRAME) &&
-         vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-      continue;
-
-    mbmi->ref_frame[0] = ref_frame;
-    mbmi->ref_frame[1] = second_ref_frame;
-
-    if (!(ref_frame == INTRA_FRAME
-        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
-      continue;
-    }
-    if (!(second_ref_frame == NONE
-        || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
-      continue;
-    }
-
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
-      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
-        if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
-          continue;
-      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
-        if (ref_frame != best_inter_ref_frame &&
-            second_ref_frame != best_inter_ref_frame)
-          continue;
-    }
-
-    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
-    mbmi->uv_mode = DC_PRED;
-
-    // Evaluate all sub-pel filters irrespective of whether we can use
-    // them for this frame.
-    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
-                                                          : cm->interp_filter;
-    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
-
-    if (comp_pred) {
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
+      if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
         continue;
-
-      mode_excluded = mode_excluded ? mode_excluded
-                                    : cm->reference_mode == SINGLE_REFERENCE;
+      if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
+          ref_frame != best_inter_ref_frame &&
+          second_ref_frame != best_inter_ref_frame)
+        continue;
+      mode_excluded = mode_excluded ?
+            mode_excluded : cm->reference_mode == SINGLE_REFERENCE;
     } else {
       if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME)
         mode_excluded = mode_excluded ?
             mode_excluded : cm->reference_mode == COMPOUND_REFERENCE;
     }
 
+    if (ref_frame == INTRA_FRAME) {
+      // Disable intra modes other than DC_PRED for blocks with low variance
+      // Threshold for intra skipping based on source variance
+      // TODO(debargha): Specialize the threshold for super block sizes
+      static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = {
+        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+      };
+      if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+          this_mode != DC_PRED &&
+          x->source_variance < skip_intra_var_thresh[bsize])
+        continue;
+      // Only search the oblique modes if the best so far is
+      // one of the neighboring directional modes
+      if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+          (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+        if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
+          continue;
+      }
+      if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+        if (conditional_skipintra(this_mode, best_intra_mode))
+            continue;
+      }
+    }
+
+    mbmi->mode = this_mode;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ref_frame[0] = ref_frame;
+    mbmi->ref_frame[1] = second_ref_frame;
+    // Evaluate all sub-pel filters irrespective of whether we can use
+    // them for this frame.
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                          : cm->interp_filter;
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+    xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
+
     // Select prediction reference frames.
     for (i = 0; i < MAX_MB_PLANE; i++) {
       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
@@ -3296,46 +3378,8 @@
         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
     }
 
-    // If the segment reference frame feature is enabled....
-    // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
-            (int)ref_frame) {
-      continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
-      continue;
-    // Disable this drop out case if the ref frame
-    // segment level feature is enabled for this segment. This is to
-    // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(seg, segment_id,
-                                      SEG_LVL_REF_FRAME)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative. We allow near/nearest as well
-      // because they may result in zero-zero MVs but be cheaper.
-      if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if ((this_mode != ZEROMV &&
-             !(this_mode == NEARMV &&
-               frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
-             !(this_mode == NEARESTMV &&
-               frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
-            ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-    // TODO(JBB): This is to make up for the fact that we don't have sad
-    // functions that work when the block size reads outside the umv.  We
-    // should fix this either by making the motion search just work on
-    // a representative block in the boundary ( first ) and then implement a
-    // function that does sads when inside the border..
-    if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
-        this_mode == NEWMV) {
-      continue;
-    }
+    for (i = 0; i < TX_MODES; ++i)
+      tx_cache[i] = INT64_MAX;
 
 #ifdef MODE_TEST_HIT_STATS
     // TEST/DEBUG CODE
@@ -3343,34 +3387,10 @@
     cpi->mode_test_hits[bsize]++;
 #endif
 
-
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
-      // Disable intra modes other than DC_PRED for blocks with low variance
-      // Threshold for intra skipping based on source variance
-      // TODO(debargha): Specialize the threshold for super block sizes
-      static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = {
-        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-      };
-      if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
-          this_mode != DC_PRED &&
-          x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
-        continue;
-      // Only search the oblique modes if the best so far is
-      // one of the neighboring directional modes
-      if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-          (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
-        if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
-          continue;
-      }
-      mbmi->mode = this_mode;
-      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
-        if (conditional_skipintra(mbmi->mode, best_intra_mode))
-            continue;
-      }
-
-      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
-                      bsize, tx_cache, best_rd);
+      intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
+                            bsize, tx_cache, best_rd);
 
       if (rate_y == INT_MAX)
         continue;
@@ -3392,8 +3412,6 @@
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
     } else {
-      mbmi->mode = this_mode;
-      compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
       this_rd = handle_inter_mode(cpi, x, tile, bsize,
                                   tx_cache,
                                   &rate2, &distortion2, &skippable,
@@ -3405,14 +3423,16 @@
                                   single_newmv, &total_sse, best_rd);
       if (this_rd == INT64_MAX)
         continue;
-    }
 
-    if (cm->reference_mode == REFERENCE_MODE_SELECT)
-      rate2 += compmode_cost;
+      compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
+
+      if (cm->reference_mode == REFERENCE_MODE_SELECT)
+        rate2 += compmode_cost;
+    }
 
     // Estimate the reference frame signaling cost and add it
     // to the rolling cost variable.
-    if (second_ref_frame > INTRA_FRAME) {
+    if (comp_pred) {
       rate2 += ref_costs_comp[ref_frame];
     } else {
       rate2 += ref_costs_single[ref_frame];
@@ -3520,7 +3540,7 @@
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
-        if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+        if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
             (mode_index > MIN_EARLY_TERM_INDEX)) {
           const int qstep = xd->plane[0].dequant[1];
           // TODO(debargha): Enhance this by specializing for each mode_index
@@ -3630,17 +3650,6 @@
     }
   }
 
-  // Flag all modes that have a distortion thats > 2x the best we found at
-  // this level.
-  for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
-    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
-      continue;
-
-    if (mode_distortions[mode_index] > 2 * *returndistortion) {
-      ctx->modes_with_high_error |= (1 << mode_index);
-    }
-  }
-
   assert((cm->interp_filter == SWITCHABLE) ||
          (cm->interp_filter == best_mbmode.interp_filter) ||
          !is_inter_block(&best_mbmode));
@@ -3755,6 +3764,8 @@
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
+  int ref_frame_mask = 0;
+  int mode_skip_mask = 0;
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
@@ -3790,13 +3801,12 @@
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  cpi->ref_frame_mask = 0;
   for (ref_frame = LAST_FRAME;
        ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
     int i;
     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
       if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
-        cpi->ref_frame_mask |= (1 << ref_frame);
+        ref_frame_mask |= (1 << ref_frame);
         break;
       }
     }
@@ -3829,23 +3839,23 @@
       if (mode_index == 3) {
         switch (vp9_ref_order[best_mode_index].ref_frame[0]) {
           case INTRA_FRAME:
-            cpi->mode_skip_mask = 0;
+            mode_skip_mask = 0;
             break;
           case LAST_FRAME:
-            cpi->mode_skip_mask = 0x0010;
+            mode_skip_mask = 0x0010;
             break;
           case GOLDEN_FRAME:
-            cpi->mode_skip_mask = 0x0008;
+            mode_skip_mask = 0x0008;
             break;
           case ALTREF_FRAME:
-            cpi->mode_skip_mask = 0x0000;
+            mode_skip_mask = 0x0000;
             break;
           case NONE:
           case MAX_REF_FRAMES:
             assert(0 && "Invalid Reference frame");
         }
       }
-      if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
+      if (mode_skip_mask & (1 << mode_index))
         continue;
     }
 
@@ -4136,7 +4146,7 @@
         // then dont bother looking at UV
         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                         BLOCK_8X8);
-        super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+        super_block_uvrd(x, &rate_uv, &distortion_uv, &uv_skippable,
                          &uv_sse, BLOCK_8X8, tmp_best_rdu);
         if (rate_uv == INT_MAX)
           continue;
@@ -4355,7 +4365,7 @@
 
   if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
     *returnrate = INT_MAX;
-    *returndistortion = INT_MAX;
+    *returndistortion = INT64_MAX;
     return best_rd;
   }
 
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 96cea42..6b85d67 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -80,10 +80,10 @@
 void vp9_set_mbmode_and_mvs(MACROBLOCKD *xd, MB_PREDICTION_MODE mode,
                             const MV *mv);
 
-void vp9_get_entropy_contexts(TX_SIZE tx_size,
-    ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
-    const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
-    int num_4x4_w, int num_4x4_h);
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index 0766b51..4e6efae 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -14,6 +14,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
 #include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_resize.h"
 
@@ -24,9 +25,6 @@
 #define SUBPEL_MASK               ((1 << SUBPEL_BITS) - 1)
 #define INTERP_PRECISION_BITS     32
 
-#define ROUND_POWER_OF_TWO(value, n) \
-    (((value) + (1 << ((n) - 1))) >> (n))
-
 typedef int16_t interp_kernel[INTERP_TAPS];
 
 // Filters for interpolation (0.5-band) - note this also filters integer pels.
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 0040477..502e4b6 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -20,7 +20,6 @@
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_psnr.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_segmentation.h"
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 510ef78..e8179f3 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -23,8 +23,8 @@
 
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
-static int dct_value_cost[DCT_MAX_VALUE * 2];
-const int *vp9_dct_value_cost_ptr;
+static int16_t dct_value_cost[DCT_MAX_VALUE * 2];
+const int16_t *vp9_dct_value_cost_ptr;
 
 // Array indices are identical to previously-existing CONTEXT_NODE indices
 const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
@@ -199,6 +199,12 @@
   ++counts[token];
 }
 
+static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
+                             TX_SIZE tx_size) {
+  const int eob_max = 16 << (tx_size << 1);
+  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}
+
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
@@ -214,7 +220,7 @@
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
-  const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
+  const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;
   const scan_order *so;
@@ -241,7 +247,7 @@
   while (c < eob) {
     int v = 0;
     int skip_eob = 0;
-    v = qcoeff_ptr[scan[c]];
+    v = qcoeff[scan[c]];
 
     while (!v) {
       add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob,
@@ -252,12 +258,13 @@
       token_cache[scan[c]] = 0;
       ++c;
       pt = get_coef_context(nb, token_cache, c);
-      v = qcoeff_ptr[scan[c]];
+      v = qcoeff[scan[c]];
     }
 
     add_token(&t, coef_probs[band[c]][pt],
               vp9_dct_value_tokens_ptr[v].extra,
-              vp9_dct_value_tokens_ptr[v].token, skip_eob,
+              (uint8_t)vp9_dct_value_tokens_ptr[v].token,
+              (uint8_t)skip_eob,
               counts[band[c]][pt]);
     eob_branch[band[c]][pt] += !skip_eob;
 
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index ea86240..063c0ba 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -47,7 +47,7 @@
 void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
                      BLOCK_SIZE bsize);
 
-extern const int *vp9_dct_value_cost_ptr;
+extern const int16_t *vp9_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
  *  improve cache locality, since it's needed for costing when the rest of the
  *  fields are not.
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c
index acd7c41..600029b 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_vaq.c
@@ -44,7 +44,7 @@
 double vp9_vaq_rdmult_ratio(int energy) {
   ENERGY_IN_BOUNDS(energy);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   return RDMULT_RATIO(energy);
 }
@@ -52,7 +52,7 @@
 double vp9_vaq_inv_q_ratio(int energy) {
   ENERGY_IN_BOUNDS(energy);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   return Q_RATIO(-energy);
 }
@@ -63,7 +63,7 @@
 
   assert(ENERGY_SPAN <= MAX_SEGMENTS);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   base_ratio = 1.5;
 
@@ -75,9 +75,9 @@
 void vp9_vaq_frame_setup(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   struct segmentation *seg = &cm->seg;
-  int base_q = vp9_convert_qindex_to_q(cm->base_qindex);
-  int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
-                                        cm->y_dc_delta_q);
+  const double base_q = vp9_convert_qindex_to_q(cm->base_qindex);
+  const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
+                                              cm->y_dc_delta_q);
   int i;
 
   if (cm->frame_type == KEY_FRAME ||
@@ -88,7 +88,7 @@
 
     seg->abs_delta = SEGMENT_DELTADATA;
 
-    vp9_clear_system_state();  // __asm emms;
+    vp9_clear_system_state();
 
     for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
       int qindex_delta, segment_rdmult;
@@ -141,11 +141,8 @@
   double energy;
   unsigned int var = block_variance(cpi, x, bs);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
-  // if (var <= 1000)
-  //   return 0;
-
-  energy = 0.9*(logf(var + 1) - 10.0);
-  return clamp(round(energy), ENERGY_MIN, ENERGY_MAX);
+  energy = 0.9 * (log(var + 1.0) - 10.0);
+  return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
 }
diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vp9/encoder/vp9_write_bit_buffer.h
index 5958b48..1795e05 100644
--- a/vp9/encoder/vp9_write_bit_buffer.h
+++ b/vp9/encoder/vp9_write_bit_buffer.h
@@ -29,7 +29,7 @@
 }
 
 static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
-  const int off = wb->bit_offset;
+  const int off = (int)wb->bit_offset;
   const int p = off / CHAR_BIT;
   const int q = CHAR_BIT - 1 - off % CHAR_BIT;
   if (q == CHAR_BIT -1) {
diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
index 2b82d97..b5269ed 100644
--- a/vp9/encoder/x86/vp9_dct_avx2.c
+++ b/vp9/encoder/x86/vp9_dct_avx2.c
@@ -16,7 +16,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -46,7 +46,7 @@
     in3 = _mm_slli_epi16(in3, 4);
     // if (i == 0 && input[0]) input[0] += 1;
     {
-      // The mask will only contain wether the first value is zero, all
+      // The mask will only contain whether the first value is zero, all
       // other comparison will fail as something shifted by 4 (above << 4)
       // can never be equal to one. To increment in the non-zero case, we
       // add the mask and one for the first element:
@@ -59,7 +59,7 @@
   }
   // Do the two transform/transpose passes
   for (pass = 0; pass < 2; ++pass) {
-    // Transform 1/2: Add/substract
+    // Transform 1/2: Add/subtract
     const __m128i r0 = _mm_add_epi16(in0, in3);
     const __m128i r1 = _mm_add_epi16(in1, in2);
     const __m128i r2 = _mm_sub_epi16(in1, in2);
@@ -317,7 +317,7 @@
   for (pass = 0; pass < 2; pass++) {
     // To store results of each pass before the transpose.
     __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/substract
+    // Add/subtract
     const __m128i q0 = _mm_add_epi16(in0, in7);
     const __m128i q1 = _mm_add_epi16(in1, in6);
     const __m128i q2 = _mm_add_epi16(in2, in5);
@@ -328,7 +328,7 @@
     const __m128i q7 = _mm_sub_epi16(in0, in7);
     // Work on first four results
     {
-      // Add/substract
+      // Add/subtract
       const __m128i r0 = _mm_add_epi16(q0, q3);
       const __m128i r1 = _mm_add_epi16(q1, q2);
       const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -390,7 +390,7 @@
       // Combine
       const __m128i r0 = _mm_packs_epi32(s0, s1);
       const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/substract
+      // Add/subtract
       const __m128i x0 = _mm_add_epi16(q4, r0);
       const __m128i x1 = _mm_sub_epi16(q4, r0);
       const __m128i x2 = _mm_sub_epi16(q7, r1);
@@ -1071,7 +1071,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -1228,7 +1228,7 @@
       }
       // Work on the first eight values; fdct8(input, even_results);
       {
-        // Add/substract
+        // Add/subtract
         const __m128i q0 = _mm_add_epi16(input0, input7);
         const __m128i q1 = _mm_add_epi16(input1, input6);
         const __m128i q2 = _mm_add_epi16(input2, input5);
@@ -1239,7 +1239,7 @@
         const __m128i q7 = _mm_sub_epi16(input0, input7);
         // Work on first four results
         {
-          // Add/substract
+          // Add/subtract
           const __m128i r0 = _mm_add_epi16(q0, q3);
           const __m128i r1 = _mm_add_epi16(q1, q2);
           const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -1303,7 +1303,7 @@
           // Combine
           const __m128i r0 = _mm_packs_epi32(s0, s1);
           const __m128i r1 = _mm_packs_epi32(s2, s3);
-          // Add/substract
+          // Add/subtract
           const __m128i x0 = _mm_add_epi16(q4, r0);
           const __m128i x1 = _mm_sub_epi16(q4, r0);
           const __m128i x2 = _mm_sub_epi16(q7, r1);
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 852cf86..f3735eb 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -16,7 +16,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -47,7 +47,7 @@
     in1 = _mm_slli_epi16(in1, 4);
     // if (i == 0 && input[0]) input[0] += 1;
     {
-      // The mask will only contain wether the first value is zero, all
+      // The mask will only contain whether the first value is zero, all
       // other comparison will fail as something shifted by 4 (above << 4)
       // can never be equal to one. To increment in the non-zero case, we
       // add the mask and one for the first element:
@@ -60,7 +60,7 @@
   }
   // Do the two transform/transpose passes
   for (pass = 0; pass < 2; ++pass) {
-    // Transform 1/2: Add/substract
+    // Transform 1/2: Add/subtract
     const __m128i r0 = _mm_add_epi16(in0, in1);
     const __m128i r1 = _mm_sub_epi16(in0, in1);
     const __m128i r2 = _mm_unpacklo_epi64(r0, r1);
@@ -315,7 +315,7 @@
   for (pass = 0; pass < 2; pass++) {
     // To store results of each pass before the transpose.
     __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/substract
+    // Add/subtract
     const __m128i q0 = _mm_add_epi16(in0, in7);
     const __m128i q1 = _mm_add_epi16(in1, in6);
     const __m128i q2 = _mm_add_epi16(in2, in5);
@@ -326,7 +326,7 @@
     const __m128i q7 = _mm_sub_epi16(in0, in7);
     // Work on first four results
     {
-      // Add/substract
+      // Add/subtract
       const __m128i r0 = _mm_add_epi16(q0, q3);
       const __m128i r1 = _mm_add_epi16(q1, q2);
       const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -388,7 +388,7 @@
       // Combine
       const __m128i r0 = _mm_packs_epi32(s0, s1);
       const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/substract
+      // Add/subtract
       const __m128i x0 = _mm_add_epi16(q4, r0);
       const __m128i x1 = _mm_sub_epi16(q4, r0);
       const __m128i x2 = _mm_sub_epi16(q7, r1);
@@ -1069,7 +1069,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -1226,7 +1226,7 @@
       }
       // Work on the first eight values; fdct8(input, even_results);
       {
-        // Add/substract
+        // Add/subtract
         const __m128i q0 = _mm_add_epi16(input0, input7);
         const __m128i q1 = _mm_add_epi16(input1, input6);
         const __m128i q2 = _mm_add_epi16(input2, input5);
@@ -1237,7 +1237,7 @@
         const __m128i q7 = _mm_sub_epi16(input0, input7);
         // Work on first four results
         {
-          // Add/substract
+          // Add/subtract
           const __m128i r0 = _mm_add_epi16(q0, q3);
           const __m128i r1 = _mm_add_epi16(q1, q2);
           const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -1301,7 +1301,7 @@
           // Combine
           const __m128i r0 = _mm_packs_epi32(s0, s1);
           const __m128i r1 = _mm_packs_epi32(s2, s3);
-          // Add/substract
+          // Add/subtract
           const __m128i x0 = _mm_add_epi16(q4, r0);
           const __m128i x1 = _mm_sub_epi16(q4, r0);
           const __m128i x2 = _mm_sub_epi16(q7, r1);
diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
new file mode 100644
index 0000000..b8bfa89
--- /dev/null
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
@@ -0,0 +1,641 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vpx_ports/mem.h"
+#include "vp9/encoder/vp9_variance.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
+  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
+  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
+  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
+  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
+  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
+  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
+  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
+  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
+};
+
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset,
+                                             int y_offset,
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             int height,
+                                             unsigned int *sse) {
+  __m256i src_reg,  dst_reg,  exp_src_lo,  exp_src_hi,  exp_dst_lo,  exp_dst_hi;
+  __m256i sse_reg,  sum_reg,  sse_reg_hi,  res_cmp,  sum_reg_lo,  sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
+
+  if (x_offset == 0) {
+    // x_offset = 0 and y_offset = 0
+    if (y_offset == 0) {
+      for (i = 0; i < height ; i++) {
+        // load source and destination
+        src_reg = _mm256_loadu_si256((__m256i const *) (src));
+        dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+        // expand each byte to 2 bytes
+        exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+        exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+        exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+        exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+        // source - dest
+        exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+        exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+        // calculate sum
+        sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+        exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+        sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+        exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+        // calculate sse
+        sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+        sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = 8
+    } else if (y_offset == 8) {
+        __m256i src_next_reg;
+        for (i = 0; i < height ; i++) {
+          // load source + next source + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *)
+                                         (src + src_stride));
+          dst_reg = _mm256_load_si256((__m256i const *) (dst));
+          // average between current and next stride source
+          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+          // expand each byte to 2 bytes
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+        __m256i filter, pw8, src_next_reg;
+#if (ARCH_X86_64)
+        int64_t y_offset64;
+        y_offset64 = y_offset;
+        y_offset64 <<= 5;
+        filter = _mm256_load_si256(
+            (__m256i const *)(bilinear_filters_avx2 + y_offset64));
+#else
+        y_offset <<= 5;
+        filter = _mm256_load_si256(
+            (__m256i const *)(bilinear_filters_avx2 + y_offset));
+#endif
+        pw8 = _mm256_set1_epi16(8);
+        for (i = 0; i < height ; i++) {
+          // load current and next source + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *)
+                          (src + src_stride));
+          dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+          // merge current and next source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+          // add 8 to the source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // expand each byte to 2 bytes in the destination
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+    }
+  // x_offset = 8  and y_offset = 0
+  } else if (x_offset == 8) {
+      if (y_offset == 0) {
+        __m256i src_next_reg;
+        for (i = 0; i < height ; i++) {
+          // load source and another source starting from the next
+          // following byte + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+          // average between source and the next byte following source
+          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+          // expand each byte to 2 bytes
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+      // x_offset = 8  and y_offset = 8
+      } else if (y_offset == 8) {
+          __m256i src_next_reg, src_avg;
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+
+          // average between source and the next byte following source
+          src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_load_si256((__m256i const *) (dst));
+            // average between source and the next byte following source
+            src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            // average the previous average with the current average
+            src_avg = _mm256_avg_epu8(src_avg, src_reg);
+            // expand each byte to 2 bytes
+            exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg);
+
+            // save current source average
+            src_avg = src_reg;
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            dst+= dst_stride;
+          }
+      // x_offset = 8  and y_offset = bilin interpolation
+      } else {
+          __m256i filter, pw8, src_next_reg, src_avg;
+#if (ARCH_X86_64)
+          int64_t y_offset64;
+          y_offset64 = y_offset;
+          y_offset64 <<= 5;
+          filter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + y_offset64));
+#else
+          y_offset <<= 5;
+          filter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + y_offset));
+#endif
+          pw8 = _mm256_set1_epi16(8);
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          // average between source and the next byte following source
+          src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_load_si256((__m256i const *) (dst));
+            // average between source and the next byte following source
+            src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+            // merge previous average and current average
+            exp_src_lo = _mm256_unpacklo_epi8(src_avg, src_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_avg, src_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+            // add 8 to the source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide the source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            // save current source average
+            src_avg = src_reg;
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            dst+= dst_stride;
+          }
+      }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+      if (y_offset == 0) {
+        __m256i filter, pw8, src_next_reg;
+#if (ARCH_X86_64)
+        int64_t x_offset64;
+        x_offset64 = x_offset;
+        x_offset64 <<= 5;
+        filter = _mm256_load_si256(
+            (__m256i const *)(bilinear_filters_avx2 + x_offset64));
+#else
+        x_offset <<= 5;
+        filter = _mm256_load_si256(
+            (__m256i const *)(bilinear_filters_avx2 + x_offset));
+#endif
+        pw8 = _mm256_set1_epi16(8);
+        for (i = 0; i < height ; i++) {
+          // load source and another source starting from the next
+          // following byte + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+          // merge current and next source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+          // add 8 to source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide the source by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // expand each byte to 2 bytes
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+      // x_offset = bilin interpolation and y_offset = 8
+      } else if (y_offset == 8) {
+          __m256i filter, pw8, src_next_reg, src_pack;
+#if (ARCH_X86_64)
+          int64_t x_offset64;
+          x_offset64 = x_offset;
+          x_offset64 <<= 5;
+          filter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + x_offset64));
+#else
+          x_offset <<= 5;
+          filter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + x_offset));
+#endif
+          pw8 = _mm256_set1_epi16(8);
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+
+          // merge current and next stride source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+          // add 8 to source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide source by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // convert each 16 bit to 8 bit to each low and high lane source
+          src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+            // merge current and next stride source
+            exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+            // add 8 to source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // convert each 16 bit to 8 bit to each low and high lane source
+            src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+            // average between previous pack to the current
+            src_pack = _mm256_avg_epu8(src_pack, src_reg);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg);
+
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            // save previous pack
+            src_pack = src_reg;
+            dst+= dst_stride;
+          }
+      // x_offset = bilin interpolation and y_offset = bilin interpolation
+      } else {
+          __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+#if (ARCH_X86_64)
+          int64_t x_offset64, y_offset64;
+          x_offset64 = x_offset;
+          x_offset64 <<= 5;
+          y_offset64 = y_offset;
+          y_offset64 <<= 5;
+          xfilter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + x_offset64));
+          yfilter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + y_offset64));
+#else
+          x_offset <<= 5;
+          xfilter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + x_offset));
+          y_offset <<= 5;
+          yfilter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + y_offset));
+#endif
+          pw8 = _mm256_set1_epi16(8);
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          // merge current and next stride source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter);
+
+          // add 8 to the source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide the source by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // convert each 16 bit to 8 bit to each low and high lane source
+          src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+            // merge current and next stride source
+            exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter);
+
+            // add 8 to source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // convert each 16 bit to 8 bit to each low and high lane source
+            src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+            // merge previous pack to current pack source
+            exp_src_lo = _mm256_unpacklo_epi8(src_pack, src_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_pack, src_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, yfilter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, yfilter);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            // add 8 to source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            src_pack = src_reg;
+            dst+= dst_stride;
+          }
+      }
+  }
+  // sum < 0
+  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);
+  // save the next 8 bytes of each lane of sse
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);
+  // merge the result of sum < 0  with sum to add sign to the next 16 bits
+  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);
+  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);
+  // add each 8 bytes from every lane of sse and sum
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);
+  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);
+
+  // save the next 4 bytes of each lane sse
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);
+  // save the next 8 bytes of each lane of sum
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);
+
+  // add the first 4 bytes to the next 4 bytes sse
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);
+  // add the first 8 bytes to the next 8 bytes
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);
+  // extract the low lane and the high lane and add the results
+  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +
+                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1));
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);
+  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+  return sum;
+}
diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index c9b90d5..02007a3 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c
@@ -42,6 +42,18 @@
   int *Sum
 );
 
+unsigned int vp9_sub_pixel_variance32xh_avx2
+(
+  const uint8_t *src,
+  int src_stride,
+  int x_offset,
+  int y_offset,
+  const uint8_t *dst,
+  int dst_stride,
+  int height,
+  unsigned int *sse
+);
+
 static void variance_avx2(const unsigned char *src_ptr, int  source_stride,
                         const unsigned char *ref_ptr, int  recon_stride,
                         int  w, int  h, unsigned int *sse, int *sum,
@@ -155,3 +167,43 @@
   *sse = var;
   return (var - (((int64_t)avg * avg) >> 11));
 }
+
+unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse_ptr) {
+  // first 32 columns, 64 rows: the helper returns the sum and fills sse
+  unsigned int sse;
+  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                           y_offset, dst, dst_stride,
+                                           64, &sse);
+  // next 32 columns (src + 32, dst + 32), same 64 rows
+  unsigned int sse2;
+  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+                                            x_offset, y_offset,
+                                            dst + 32, dst_stride,
+                                            64, &sse2);
+  se += se2;  // combine the sums of both 32-wide halves
+  sse += sse2;  // combine the squared-error totals of both halves
+  *sse_ptr = sse;  // report the combined sse to the caller
+  return sse - (((int64_t)se * se) >> 12);  // >> 12 divides by 64 * 64
+}
+
+unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse_ptr) {
+  // all 32 columns in one pass, 32 rows: helper returns sum and fills sse
+  unsigned int sse;
+  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                           y_offset, dst, dst_stride,
+                                           32, &sse);
+  *sse_ptr = sse;  // report the sse to the caller
+  return sse - (((int64_t)se * se) >> 10);  // >> 10 divides by 32 * 32
+}
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 48d6a7c..9fb6115 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -25,7 +25,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_filter.h
 VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c
 VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h
-VP9_COMMON_SRCS-yes += common/generic/vp9_systemdependent.c
 VP9_COMMON_SRCS-yes += common/vp9_idct.c
 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h
 VP9_COMMON_SRCS-yes += common/vp9_blockd.h
@@ -80,6 +79,7 @@
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index f5d5b24..d7713fd 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -178,7 +178,7 @@
 
   RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
   if (cfg->ts_number_layers > 1) {
-    int i;
+    unsigned int i;
     for (i = 1; i < cfg->ts_number_layers; ++i) {
       if (cfg->ts_target_bitrate[i] < cfg->ts_target_bitrate[i-1]) {
         ERROR("ts_target_bitrate entries are not increasing");
@@ -264,7 +264,7 @@
 
 static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
                                        vpx_codec_enc_cfg_t cfg,
-                                       struct vp9_extracfg vp8_cfg) {
+                                       struct vp9_extracfg vp9_cfg) {
   oxcf->version = cfg.g_profile;
   oxcf->width   = cfg.g_w;
   oxcf->height  = cfg.g_h;
@@ -289,10 +289,8 @@
   }
 
   if (cfg.g_pass == VPX_RC_FIRST_PASS) {
-    oxcf->allow_lag     = 0;
     oxcf->lag_in_frames = 0;
   } else {
-    oxcf->allow_lag     = (cfg.g_lag_in_frames) > 0;
     oxcf->lag_in_frames = cfg.g_lag_in_frames;
   }
 
@@ -305,11 +303,11 @@
     oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
 
   oxcf->target_bandwidth         = cfg.rc_target_bitrate;
-  oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
+  oxcf->rc_max_intra_bitrate_pct = vp9_cfg.rc_max_intra_bitrate_pct;
 
   oxcf->best_allowed_q          = cfg.rc_min_quantizer;
   oxcf->worst_allowed_q         = cfg.rc_max_quantizer;
-  oxcf->cq_level                = vp8_cfg.cq_level;
+  oxcf->cq_level                = vp9_cfg.cq_level;
   oxcf->fixed_q = -1;
 
   oxcf->under_shoot_pct         = cfg.rc_undershoot_pct;
@@ -330,33 +328,40 @@
   // oxcf->kf_min_dist         = cfg.kf_min_dis;
   oxcf->key_freq               = cfg.kf_max_dist;
 
-  oxcf->cpu_used               =  vp8_cfg.cpu_used;
-  oxcf->encode_breakout        =  vp8_cfg.static_thresh;
-  oxcf->play_alternate         =  vp8_cfg.enable_auto_alt_ref;
-  oxcf->noise_sensitivity      =  vp8_cfg.noise_sensitivity;
-  oxcf->sharpness              =  vp8_cfg.sharpness;
+  oxcf->cpu_used               =  vp9_cfg.cpu_used;
+  oxcf->encode_breakout        =  vp9_cfg.static_thresh;
+  oxcf->play_alternate         =  vp9_cfg.enable_auto_alt_ref;
+  oxcf->noise_sensitivity      =  vp9_cfg.noise_sensitivity;
+  oxcf->sharpness              =  vp9_cfg.sharpness;
 
   oxcf->two_pass_stats_in      =  cfg.rc_twopass_stats_in;
-  oxcf->output_pkt_list        =  vp8_cfg.pkt_list;
+  oxcf->output_pkt_list        =  vp9_cfg.pkt_list;
 
-  oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
-  oxcf->arnr_strength   = vp8_cfg.arnr_strength;
-  oxcf->arnr_type       = vp8_cfg.arnr_type;
+  oxcf->arnr_max_frames = vp9_cfg.arnr_max_frames;
+  oxcf->arnr_strength   = vp9_cfg.arnr_strength;
+  oxcf->arnr_type       = vp9_cfg.arnr_type;
 
-  oxcf->tuning = vp8_cfg.tuning;
+  oxcf->tuning = vp9_cfg.tuning;
 
-  oxcf->tile_columns = vp8_cfg.tile_columns;
-  oxcf->tile_rows    = vp8_cfg.tile_rows;
+  oxcf->tile_columns = vp9_cfg.tile_columns;
+  oxcf->tile_rows    = vp9_cfg.tile_rows;
 
-  oxcf->lossless = vp8_cfg.lossless;
+  oxcf->lossless = vp9_cfg.lossless;
 
   oxcf->error_resilient_mode         = cfg.g_error_resilient;
-  oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode;
+  oxcf->frame_parallel_decoding_mode = vp9_cfg.frame_parallel_decoding_mode;
 
-  oxcf->aq_mode = vp8_cfg.aq_mode;
+  oxcf->aq_mode = vp9_cfg.aq_mode;
 
   oxcf->ss_number_layers = cfg.ss_number_layers;
 
+  if (oxcf->ss_number_layers > 1) {
+    memcpy(oxcf->ss_target_bitrate, cfg.ss_target_bitrate,
+           sizeof(cfg.ss_target_bitrate));
+  } else if (oxcf->ss_number_layers == 1) {
+    oxcf->ss_target_bitrate[0] = oxcf->target_bandwidth;
+  }
+
   oxcf->ts_number_layers = cfg.ts_number_layers;
 
   if (oxcf->ts_number_layers > 1) {
@@ -365,7 +370,7 @@
     memcpy(oxcf->ts_rate_decimator, cfg.ts_rate_decimator,
            sizeof(cfg.ts_rate_decimator));
   } else if (oxcf->ts_number_layers == 1) {
-    oxcf->ts_target_bitrate[0] = oxcf->target_bandwidth;
+    oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth;
     oxcf->ts_rate_decimator[0] = 1;
   }
 
@@ -390,7 +395,6 @@
   printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
   printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
   printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
-  printf("allow_lag: %d\n", oxcf->allow_lag);
   printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
   printf("play_alternate: %d\n", oxcf->play_alternate);
   printf("Version: %d\n", oxcf->Version);
@@ -639,7 +643,7 @@
 
     *x++ = marker;
     for (i = 0; i < ctx->pending_frame_count; i++) {
-      int this_sz = ctx->pending_frame_sizes[i];
+      unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i];
 
       for (j = 0; j <= mag; j++) {
         *x++ = this_sz & 0xff;
@@ -1045,11 +1049,11 @@
   cpi->svc.temporal_layer_id = data->temporal_layer_id;
   // Checks on valid layer_id input.
   if (cpi->svc.temporal_layer_id < 0 ||
-      cpi->svc.temporal_layer_id >= ctx->cfg.ts_number_layers) {
+      cpi->svc.temporal_layer_id >= (int)ctx->cfg.ts_number_layers) {
     return VPX_CODEC_INVALID_PARAM;
   }
   if (cpi->svc.spatial_layer_id < 0 ||
-      cpi->svc.spatial_layer_id >= ctx->cfg.ss_number_layers) {
+      cpi->svc.spatial_layer_id >= (int)ctx->cfg.ss_number_layers) {
     return VPX_CODEC_INVALID_PARAM;
   }
   return VPX_CODEC_OK;
@@ -1163,6 +1167,7 @@
       9999,               /* kf_max_dist */
 
       VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
+      {0},                /* ss_target_bitrate */
       1,                  /* ts_number_layers */
       {0},                /* ts_target_bitrate */
       {0},                /* ts_rate_decimator */
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 41750de..b85e172 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -60,6 +60,11 @@
   int                     img_setup;
   int                     img_avail;
   int                     invert_tile_order;
+
+  // External frame buffer info to save for VP9 common.
+  void *ext_priv;  // Private data associated with the external frame buffers.
+  vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
 };
 
 static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si,
@@ -154,11 +159,7 @@
 
     if (frame_marker != VP9_FRAME_MARKER)
       return VPX_CODEC_UNSUP_BITSTREAM;
-#if CONFIG_NON420
     if (version > 1) return VPX_CODEC_UNSUP_BITSTREAM;
-#else
-    if (version != 0) return VPX_CODEC_UNSUP_BITSTREAM;
-#endif
 
     if (vp9_rb_read_bit(&rb)) {  // show an existing frame
       return VPX_CODEC_OK;
@@ -209,7 +210,7 @@
                        ? sizeof(vp9_stream_info_t)
                        : sizeof(vpx_codec_stream_info_t);
   memcpy(si, &ctx->si, sz);
-  si->sz = sz;
+  si->sz = (unsigned int)sz;
 
   return VPX_CODEC_OK;
 }
@@ -300,16 +301,22 @@
         VP9D_COMP *const pbi = (VP9D_COMP*)optr;
         VP9_COMMON *const cm = &pbi->common;
 
-        cm->get_fb_cb = vp9_get_frame_buffer;
-        cm->release_fb_cb = vp9_release_frame_buffer;
-
         // Set index to not initialized.
         cm->new_fb_idx = -1;
 
-        if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
-          vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                             "Failed to initialize internal frame buffers");
-        cm->cb_priv = &cm->int_frame_buffers;
+        if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+          cm->get_fb_cb = ctx->get_ext_fb_cb;
+          cm->release_fb_cb = ctx->release_ext_fb_cb;
+          cm->cb_priv = ctx->ext_priv;
+        } else {
+          cm->get_fb_cb = vp9_get_frame_buffer;
+          cm->release_fb_cb = vp9_release_frame_buffer;
+
+          if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to initialize internal frame buffers");
+          cm->cb_priv = &cm->int_frame_buffers;
+        }
 
         ctx->pbi = optr;
       }
@@ -350,7 +357,11 @@
 
     if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp,
                                        &time_end_stamp, &flags)) {
+      VP9D_COMP *const pbi = (VP9D_COMP*)ctx->pbi;
+      VP9_COMMON *const cm = &pbi->common;
       yuvconfig2image(&ctx->img, &sd, user_priv);
+
+      ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
       ctx->img_avail = 1;
     }
   }
@@ -447,7 +458,7 @@
     while (data_start < data_end && *data_start == 0)
       data_start++;
 
-    data_sz = data_end - data_start;
+    data_sz = (unsigned int)(data_end - data_start);
   } while (data_start < data_end);
   return res;
 }
@@ -470,6 +481,24 @@
   return img;
 }
 
+static vpx_codec_err_t vp9_set_fb_fn(
+    vpx_codec_alg_priv_t *ctx,
+    vpx_get_frame_buffer_cb_fn_t cb_get,
+    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  if (cb_get == NULL || cb_release == NULL) {
+    return VPX_CODEC_INVALID_PARAM;  // both callbacks are required
+  } else if (ctx->pbi == NULL) {
+    // Callbacks are only accepted while ctx->pbi is still NULL; once it is
+    // set, this branch is skipped and VPX_CODEC_ERROR is returned below.
+    ctx->get_ext_fb_cb = cb_get;
+    ctx->release_ext_fb_cb = cb_release;
+    ctx->ext_priv = cb_priv;
+    return VPX_CODEC_OK;
+  }
+
+  return VPX_CODEC_ERROR;  // ctx->pbi already set: callbacks left unchanged
+}
+
 static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
                                         vpx_codec_mmap_t *mmap,
                                         vpx_codec_iter_t *iter) {
@@ -703,7 +732,8 @@
 CODEC_INTERFACE(vpx_codec_vp9_dx) = {
   "WebM Project VP9 Decoder" VERSION_STRING,
   VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC,
+  VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC |
+      VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER,
   /* vpx_codec_caps_t          caps; */
   vp9_init,         /* vpx_codec_init_fn_t       init; */
   vp9_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
@@ -715,6 +745,7 @@
     vp9_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
     vp9_decode,       /* vpx_codec_decode_fn_t     decode; */
     vp9_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
+    vp9_set_fb_fn,    /* vpx_codec_set_fb_fn_t     set_fb_fn; */
   },
   { // NOLINT
     /* encoder functions */
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index c0d973b..6679f89 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -38,7 +38,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
 VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
 VP9_CX_SRCS-yes += encoder/vp9_onyx_int.h
-VP9_CX_SRCS-yes += encoder/vp9_psnr.h
 VP9_CX_SRCS-yes += encoder/vp9_quantize.h
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.h
@@ -50,7 +49,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_onyx_if.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.h
-VP9_CX_SRCS-yes += encoder/vp9_psnr.c
 VP9_CX_SRCS-yes += encoder/vp9_quantize.c
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
@@ -86,6 +84,7 @@
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 
diff --git a/vpx/exports_dec b/vpx/exports_dec
index ed121f7..3ce1499 100644
--- a/vpx/exports_dec
+++ b/vpx/exports_dec
@@ -6,4 +6,5 @@
 text vpx_codec_peek_stream_info
 text vpx_codec_register_put_frame_cb
 text vpx_codec_register_put_slice_cb
+text vpx_codec_set_frame_buffer_functions
 text vpx_codec_set_mem_map
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 0f42a1d..51ca65e 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -59,7 +59,7 @@
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define VPX_CODEC_INTERNAL_ABI_VERSION (4) /**<\hideinitializer*/
+#define VPX_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/
 
 typedef struct vpx_codec_alg_priv  vpx_codec_alg_priv_t;
 typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t;
@@ -218,6 +218,36 @@
 typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx,
                                                  vpx_codec_iter_t     *iter);
 
+/*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers functions to be called when libvpx needs a frame buffer
+ * to decode the current frame and a function to be called when libvpx does
+ * not internally reference the frame buffer. This set function must
+ * be called before the first call to decode or libvpx will assume the
+ * default behavior of allocating frame buffers internally.
+ *
+ * \param[in] ctx          Pointer to this instance's context
+ * \param[in] cb_get       Pointer to the get callback function
+ * \param[in] cb_release   Pointer to the release callback function
+ * \param[in] cb_priv      Callback's private data
+ *
+ * \retval #VPX_CODEC_OK
+ *     External frame buffers will be used by libvpx.
+ * \retval #VPX_CODEC_INVALID_PARAM
+ *     One or more of the callbacks were NULL.
+ * \retval #VPX_CODEC_ERROR
+ *     Decoder context not initialized, or algorithm not capable of
+ *     using external frame buffers.
+ *
+ * \note
+ * When decoding VP9, the application may be required to pass in at least
+ * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame
+ * buffers.
+ */
+typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)(
+    vpx_codec_alg_priv_t *ctx,
+    vpx_get_frame_buffer_cb_fn_t cb_get,
+    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
 
 /*\brief eXternal Memory Allocation memory map get iterator
  *
@@ -308,6 +338,7 @@
     vpx_codec_get_si_fn_t     get_si;      /**< \copydoc ::vpx_codec_get_si_fn_t */
     vpx_codec_decode_fn_t     decode;      /**< \copydoc ::vpx_codec_decode_fn_t */
     vpx_codec_get_frame_fn_t  get_frame;   /**< \copydoc ::vpx_codec_get_frame_fn_t */
+    vpx_codec_set_fb_fn_t     set_fb_fn;   /**< \copydoc ::vpx_codec_set_fb_fn_t */
   } dec;
   struct vpx_codec_enc_iface {
     vpx_codec_enc_cfg_map_t           *cfg_maps;      /**< \copydoc ::vpx_codec_enc_cfg_map_t */
diff --git a/vpx/internal/vpx_psnr.h b/vpx/internal/vpx_psnr.h
new file mode 100644
index 0000000..07d81bb
--- /dev/null
+++ b/vpx/internal/vpx_psnr.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_INTERNAL_VPX_PSNR_H_
+#define VPX_INTERNAL_VPX_PSNR_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
+
+/*!\brief Converts SSE to PSNR
+ *
+ * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in]    samples       Number of samples
+ * \param[in]    peak          Max sample value
+ * \param[in]    sse           Sum of squared errors
+ */
+double vpx_sse_to_psnr(double samples, double peak, double sse);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_INTERNAL_VPX_PSNR_H_
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 4f5ba6f..5537fb5 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -13,6 +13,7 @@
  * VP9 SVC encoding support via libvpx
  */
 
+#include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -23,11 +24,13 @@
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 
-#if defined(__MINGW32__) && !defined(MINGW_HAS_SECURE_API)
+#ifdef __MINGW32__
 #define strtok_r strtok_s
+#ifndef MINGW_HAS_SECURE_API
 // proto from /usr/x86_64-w64-mingw32/include/sec_api/string_s.h
 _CRTIMP char *__cdecl strtok_s(char *str, const char *delim, char **context);
-#endif
+#endif  /* MINGW_HAS_SECURE_API */
+#endif  /* __MINGW32__ */
 
 #ifdef _MSC_VER
 #define strdup _strdup
@@ -38,6 +41,7 @@
 #define SUPERFRAME_SLOTS (8)
 #define SUPERFRAME_BUFFER_SIZE (SUPERFRAME_SLOTS * sizeof(uint32_t) + 2)
 #define OPTION_BUFFER_SIZE 256
+#define COMPONENTS 4  // psnr & sse statistics maintained for total, y, u, v
 
 static const char *DEFAULT_QUANTIZER_VALUES = "60,53,39,33,27";
 static const char *DEFAULT_SCALE_FACTORS = "4/16,5/16,7/16,11/16,16/16";
@@ -45,16 +49,20 @@
 typedef struct SvcInternal {
   char options[OPTION_BUFFER_SIZE];        // set by vpx_svc_set_options
   char quantizers[OPTION_BUFFER_SIZE];     // set by vpx_svc_set_quantizers
+  char quantizers_keyframe[OPTION_BUFFER_SIZE];  // set by
+                                                 // vpx_svc_set_quantizers
   char scale_factors[OPTION_BUFFER_SIZE];  // set by vpx_svc_set_scale_factors
 
   // values extracted from option, quantizers
   int scaling_factor_num[VPX_SS_MAX_LAYERS];
   int scaling_factor_den[VPX_SS_MAX_LAYERS];
+  int quantizer_keyframe[VPX_SS_MAX_LAYERS];
   int quantizer[VPX_SS_MAX_LAYERS];
 
   // accumulated statistics
-  double psnr_in_layer[VPX_SS_MAX_LAYERS];
-  uint32_t bytes_in_layer[VPX_SS_MAX_LAYERS];
+  double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS];   // total/Y/U/V
+  uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS];
+  uint32_t bytes_sum[VPX_SS_MAX_LAYERS];
 
   // codec encoding values
   int width;    // width of highest layer
@@ -266,7 +274,8 @@
 }
 
 static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
-                                              const char *quantizer_values) {
+                                              const char *quantizer_values,
+                                              const int is_keyframe) {
   char *input_string;
   char *token;
   const char *delim = ",";
@@ -277,6 +286,11 @@
   SvcInternal *const si = get_svc_internal(svc_ctx);
 
   if (quantizer_values == NULL || strlen(quantizer_values) == 0) {
+    if (is_keyframe) {
+      // If there are no settings for key frame, we will apply settings from
+      // non key frame. So just simply return here.
+      return VPX_CODEC_INVALID_PARAM;
+    }
     input_string = strdup(DEFAULT_QUANTIZER_VALUES);
   } else {
     input_string = strdup(quantizer_values);
@@ -297,7 +311,12 @@
     } else {
       q = 0;
     }
-    si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
+    if (is_keyframe) {
+      si->quantizer_keyframe[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers]
+      = q;
+    } else {
+      si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
+    }
   }
   if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
     svc_log(svc_ctx, SVC_LOG_ERROR,
@@ -382,6 +401,7 @@
   char *option_name;
   char *option_value;
   char *input_ptr;
+  int is_keyframe_qaunt_set = 0;
   vpx_codec_err_t res = VPX_CODEC_OK;
 
   if (options == NULL) return VPX_CODEC_OK;
@@ -407,8 +427,17 @@
       res = parse_scale_factors(svc_ctx, option_value);
       if (res != VPX_CODEC_OK) break;
     } else if (strcmp("quantizers", option_name) == 0) {
-      res = parse_quantizer_values(svc_ctx, option_value);
+      res = parse_quantizer_values(svc_ctx, option_value, 0);
       if (res != VPX_CODEC_OK) break;
+      if (!is_keyframe_qaunt_set) {
+        SvcInternal *const si = get_svc_internal(svc_ctx);
+        memcpy(get_svc_internal(svc_ctx)->quantizer_keyframe, si->quantizer,
+               sizeof(si->quantizer));
+      }
+    } else if (strcmp("quantizers-keyframe", option_name) == 0) {
+      res = parse_quantizer_values(svc_ctx, option_value, 1);
+      if (res != VPX_CODEC_OK) break;
+      is_keyframe_qaunt_set = 1;
     } else {
       svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
       res = VPX_CODEC_INVALID_PARAM;
@@ -431,13 +460,19 @@
 }
 
 vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
-                                       const char *quantizers) {
+                                       const char *quantizers,
+                                       const int is_for_keyframe) {
   SvcInternal *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || quantizers == NULL || si == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
-  si->quantizers[sizeof(si->quantizers) - 1] = '\0';
+  {
+    char *const dst =
+        is_for_keyframe ? si->quantizers_keyframe : si->quantizers;
+    // Both buffers are OPTION_BUFFER_SIZE bytes; always NUL-terminate.
+    strncpy(dst, quantizers, OPTION_BUFFER_SIZE);
+    dst[OPTION_BUFFER_SIZE - 1] = '\0';
+  }
   return VPX_CODEC_OK;
 }
 
@@ -488,9 +523,13 @@
   // for first frame
   si->layers = svc_ctx->spatial_layers;
 
-  res = parse_quantizer_values(svc_ctx, si->quantizers);
+  res = parse_quantizer_values(svc_ctx, si->quantizers, 0);
   if (res != VPX_CODEC_OK) return res;
 
+  res = parse_quantizer_values(svc_ctx, si->quantizers_keyframe, 1);
+  if (res != VPX_CODEC_OK)
+    memcpy(si->quantizer_keyframe, si->quantizer, sizeof(si->quantizer));
+
   res = parse_scale_factors(svc_ctx, si->scale_factors);
   if (res != VPX_CODEC_OK) return res;
 
@@ -498,6 +537,29 @@
   res = parse_options(svc_ctx, si->options);
   if (res != VPX_CODEC_OK) return res;
 
+  // Assign target bitrate for each layer. We calculate the ratio
+  // from the resolution for now.
+  // TODO(Minghai): Optimize the mechanism of allocating bits after
+  // implementing svc two pass rate control.
+  if (si->layers > 1) {
+    int i;
+    float total = 0;
+    float alloc_ratio[VPX_SS_MAX_LAYERS] = {0};
+
+    for (i = 0; i < si->layers; ++i) {
+      int pos = i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers;
+      alloc_ratio[i] = si->scaling_factor_num[pos] * 1.0 /
+                       si->scaling_factor_den[pos];
+      alloc_ratio[i] *= alloc_ratio[i];
+      total += alloc_ratio[i];
+    }
+
+    for (i = 0; i < si->layers; ++i) {
+      enc_cfg->ss_target_bitrate[i] = enc_cfg->rc_target_bitrate *
+          alloc_ratio[i] / total;
+    }
+  }
+
   // modify encoder configuration
   enc_cfg->ss_number_layers = si->layers;
   enc_cfg->ts_number_layers = 1;  // Temporal layers not used in this encoder.
@@ -711,8 +773,15 @@
     svc_log(svc_ctx, SVC_LOG_ERROR, "vpx_svc_get_layer_resolution failed\n");
   }
   layer_index = layer + VPX_SS_MAX_LAYERS - si->layers;
-  svc_params.min_quantizer = si->quantizer[layer_index];
-  svc_params.max_quantizer = si->quantizer[layer_index];
+
+  if (vpx_svc_is_keyframe(svc_ctx)) {
+    svc_params.min_quantizer = si->quantizer_keyframe[layer_index];
+    svc_params.max_quantizer = si->quantizer_keyframe[layer_index];
+  } else {
+    svc_params.min_quantizer = si->quantizer[layer_index];
+    svc_params.max_quantizer = si->quantizer[layer_index];
+  }
+
   svc_params.distance_from_i_frame = si->frame_within_gop;
 
   // Use buffer i for layer i LST
@@ -812,7 +881,7 @@
       switch (cx_pkt->kind) {
         case VPX_CODEC_CX_FRAME_PKT: {
           const uint32_t frame_pkt_size = (uint32_t)(cx_pkt->data.frame.sz);
-          si->bytes_in_layer[si->layer] += frame_pkt_size;
+          si->bytes_sum[si->layer] += frame_pkt_size;
           svc_log(svc_ctx, SVC_LOG_DEBUG,
                   "SVC frame: %d, layer: %d, size: %u\n",
                   si->encode_frame_count, si->layer, frame_pkt_size);
@@ -830,13 +899,23 @@
           break;
         }
         case VPX_CODEC_PSNR_PKT: {
+          int i;
           svc_log(svc_ctx, SVC_LOG_DEBUG,
                   "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
                   "%2.3f  %2.3f  %2.3f  %2.3f \n",
                   si->encode_frame_count, si->layer,
                   cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
                   cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
-          si->psnr_in_layer[si->layer] += cx_pkt->data.psnr.psnr[0];
+          svc_log(svc_ctx, SVC_LOG_DEBUG,
+                  "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
+                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                  si->encode_frame_count, si->layer,
+                  cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1],
+                  cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]);
+          for (i = 0; i < COMPONENTS; i++) {
+            si->psnr_sum[si->layer][i] += cx_pkt->data.psnr.psnr[i];
+            si->sse_sum[si->layer][i] += cx_pkt->data.psnr.sse[i];
+          }
           break;
         }
         default: {
@@ -914,11 +993,21 @@
   si->frame_within_gop = 0;
 }
 
+static double calc_psnr(double d) {
+  if (d == 0) return 100;
+  return -10.0 * log(d) / log(10.0);
+}
+
 // dump accumulated statistics and reset accumulated values
 const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
   int number_of_frames, number_of_keyframes, encode_frame_count;
-  int i;
+  int i, j;
   uint32_t bytes_total = 0;
+  double scale[COMPONENTS];
+  double psnr[COMPONENTS];
+  double mse[COMPONENTS];
+  double y_scale;
+
   SvcInternal *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || si == NULL) return NULL;
 
@@ -936,12 +1025,36 @@
         (i == 1 || i == 3)) {
       number_of_frames -= number_of_keyframes;
     }
-    svc_log(svc_ctx, SVC_LOG_INFO, "Layer %d PSNR=[%2.3f], Bytes=[%u]\n", i,
-            (double)si->psnr_in_layer[i] / number_of_frames,
-            si->bytes_in_layer[i]);
-    bytes_total += si->bytes_in_layer[i];
-    si->psnr_in_layer[i] = 0;
-    si->bytes_in_layer[i] = 0;
+    svc_log(svc_ctx, SVC_LOG_INFO,
+            "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n",
+            i, (double)si->psnr_sum[i][0] / number_of_frames,
+            (double)si->psnr_sum[i][1] / number_of_frames,
+            (double)si->psnr_sum[i][2] / number_of_frames,
+            (double)si->psnr_sum[i][3] / number_of_frames, si->bytes_sum[i]);
+    // the following psnr calculation is deduced from ffmpeg.c#print_report
+    y_scale = si->width * si->height * 255.0 * 255.0 * number_of_frames;
+    scale[1] = y_scale;
+    scale[2] = scale[3] = y_scale / 4;  // U or V
+    scale[0] = y_scale * 1.5;           // total
+
+    for (j = 0; j < COMPONENTS; j++) {
+      psnr[j] = calc_psnr(si->sse_sum[i][j] / scale[j]);
+      mse[j] = si->sse_sum[i][j] * 255.0 * 255.0 / scale[j];
+    }
+    svc_log(svc_ctx, SVC_LOG_INFO,
+            "Layer %d Overall PSNR=[%2.3f, %2.3f, %2.3f, %2.3f]\n", i, psnr[0],
+            psnr[1], psnr[2], psnr[3]);
+    svc_log(svc_ctx, SVC_LOG_INFO,
+            "Layer %d Overall MSE=[%2.3f, %2.3f, %2.3f, %2.3f]\n", i, mse[0],
+            mse[1], mse[2], mse[3]);
+
+    bytes_total += si->bytes_sum[i];
+    // clear sums for next time
+    si->bytes_sum[i] = 0;
+    for (j = 0; j < COMPONENTS; ++j) {
+      si->psnr_sum[i][j] = 0;
+      si->sse_sum[i][j] = 0;
+    }
   }
 
   // only display statistics once
diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c
index a99e48f..63fdaf3 100644
--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c
@@ -226,3 +226,21 @@
 
   return SAVE_STATUS(ctx, res);
 }
+
+vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
+    vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get,
+    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  vpx_codec_err_t res;
+
+  if (!ctx || !cb_get || !cb_release) {
+    res = VPX_CODEC_INVALID_PARAM;
+  } else if (!ctx->iface || !ctx->priv ||
+             !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+    res = VPX_CODEC_ERROR;
+  } else {
+    res = ctx->iface->dec.set_fb_fn(ctx->priv->alg_priv, cb_get, cb_release,
+                                    cb_priv);
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
diff --git a/vpx/src/vpx_psnr.c b/vpx/src/vpx_psnr.c
new file mode 100644
index 0000000..05843ac
--- /dev/null
+++ b/vpx/src/vpx_psnr.c
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vpx/internal/vpx_psnr.h"
+
+#define MAX_PSNR 100.0
+
+double vpx_sse_to_psnr(double samples, double peak, double sse) {
+  if (sse > 0.0) {
+    const double psnr = 10.0 * log10(samples * peak * peak / sse);
+    return psnr > MAX_PSNR ? MAX_PSNR : psnr;
+  } else {
+    return MAX_PSNR;
+  }
+}
diff --git a/vpx/svc_context.h b/vpx/svc_context.h
index f675fb6..98474ca 100644
--- a/vpx/svc_context.h
+++ b/vpx/svc_context.h
@@ -64,7 +64,8 @@
  * e.g., "60,53,39,33,27"
  */
 vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
-                                       const char *quantizer_values);
+                                       const char *quantizer_values,
+                                       const int is_for_keyframe);
 
 /**
  * Set SVC scale factors
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index d0ac1af..0b637d4 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -195,6 +195,11 @@
 
   VP9E_SET_SVC,
   VP9E_SET_SVC_PARAMETERS,
+  /*!\brief control function to set svc layer for spatial and temporal.
+   * \note Valid ranges: 0..#vpx_codec_enc_cfg::ss_number_layers for spatial
+   *                     layer and 0..#vpx_codec_enc_cfg::ts_number_layers for
+   *                     temporal layer.
+   */
   VP9E_SET_SVC_LAYER_ID
 };
 
@@ -297,9 +302,16 @@
   int alt_fb_idx;             /**< alt reference frame frame buffer index */
 } vpx_svc_parameters_t;
 
+/*!\brief  vp9 svc layer parameters
+ *
+ * This defines the spatial and temporal layer id numbers for svc encoding.
+ * This is used with the #VP9E_SET_SVC_LAYER_ID control to set the spatial and
+ * temporal layer id for the current frame.
+ *
+ */
 typedef struct vpx_svc_layer_id {
-  int spatial_layer_id;
-  int temporal_layer_id;
+  int spatial_layer_id;       /**< Spatial layer id number. */
+  int temporal_layer_id;      /**< Temporal layer id number. */
 } vpx_svc_layer_id_t;
 
 /*!\brief VP8 encoder control function parameter type
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index 111c87e..98d1d56 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -34,8 +34,10 @@
 API_SRCS-yes                += src/vpx_encoder.c
 API_SRCS-yes                += vpx_encoder.h
 API_SRCS-yes                += internal/vpx_codec_internal.h
+API_SRCS-yes                += internal/vpx_psnr.h
 API_SRCS-yes                += src/vpx_codec.c
 API_SRCS-yes                += src/vpx_image.c
+API_SRCS-yes                += src/vpx_psnr.c
 API_SRCS-yes                += vpx_codec.h
 API_SRCS-yes                += vpx_codec.mk
 API_SRCS-yes                += vpx_frame_buffer.h
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index 7356bae..ba18328 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -30,6 +30,7 @@
 #endif
 
 #include "./vpx_codec.h"
+#include "./vpx_frame_buffer.h"
 
   /*!\brief Current ABI version number
    *
@@ -39,7 +40,7 @@
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_DECODER_ABI_VERSION (2 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_DECODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
 
   /*! \brief Decoder capabilities bitfield
    *
@@ -66,6 +67,8 @@
    */
 #define VPX_CODEC_CAP_FRAME_THREADING   0x200000 /**< Can support frame-based
                                                       multi-threading */
+#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 /**< Can support external
+                                                          frame buffers */
 
 #define VPX_CODEC_USE_POSTPROC   0x10000 /**< Postprocess decoded frame */
 #define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 /**< Conceal errors in decoded
@@ -326,6 +329,51 @@
 
   /*!@} - end defgroup cap_put_slice*/
 
+  /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
+   *
+   * The following section is required to be implemented for all decoders
+   * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
+   * Calling this function for codecs that don't advertise this capability
+   * will result in an error code being returned, usually VPX_CODEC_ERROR.
+   *
+   * \note
+   * Currently this only works with VP9.
+   * @{
+   */
+
+  /*!\brief Pass in external frame buffers for the decoder to use.
+   *
+   * Registers functions to be called when libvpx needs a frame buffer
+   * to decode the current frame and a function to be called when libvpx does
+   * not internally reference the frame buffer. This set function must
+   * be called before the first call to decode or libvpx will assume the
+   * default behavior of allocating frame buffers internally.
+   *
+   * \param[in] ctx          Pointer to this instance's context
+   * \param[in] cb_get       Pointer to the get callback function
+   * \param[in] cb_release   Pointer to the release callback function
+   * \param[in] cb_priv      Callback's private data
+   *
+   * \retval #VPX_CODEC_OK
+   *     External frame buffers will be used by libvpx.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     One or more of the callbacks were NULL.
+   * \retval #VPX_CODEC_ERROR
+   *     Decoder context not initialized, or algorithm not capable of
+   *     using external frame buffers.
+   *
+   * \note
+   * When decoding VP9, the application may be required to pass in at least
+   * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame
+   * buffers.
+   */
+  vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
+      vpx_codec_ctx_t *ctx,
+      vpx_get_frame_buffer_cb_fn_t cb_get,
+      vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+
+  /*!@} - end defgroup cap_external_frame_buffer */
+
   /*!@} - end defgroup decoder*/
 #ifdef __cplusplus
 }
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 1d9f0c9..851ff1a 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -610,6 +610,13 @@
      */
     unsigned int           ss_number_layers;
 
+    /*!\brief Target bitrate for each spatial layer.
+     *
+     * These values specify the target coding bitrate to be used for each
+     * spatial layer.
+     */
+    unsigned int           ss_target_bitrate[VPX_SS_MAX_LAYERS];
+
     /*!\brief Number of temporal coding layers.
      *
      * This value specifies the number of temporal layers to be used.
diff --git a/vpx/vpx_frame_buffer.h b/vpx/vpx_frame_buffer.h
index b5489b4..e69df4b 100644
--- a/vpx/vpx_frame_buffer.h
+++ b/vpx/vpx_frame_buffer.h
@@ -11,6 +11,10 @@
 #ifndef VPX_VPX_FRAME_BUFFER_H_
 #define VPX_VPX_FRAME_BUFFER_H_
 
+/*!\file
+ * \brief Describes the decoder external frame buffer interface.
+ */
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -45,8 +49,9 @@
  * decoder needs a frame buffer to decode a compressed image into. This
  * function may be called more than once for every call to vpx_codec_decode.
  * The application may set fb->priv to some data which will be passed
- * back in the ximage and the release function call. On success the callback
- * must return 0. Any failure the callback must return a value less than 0.
+ * back in the ximage and the release function call. |fb| is guaranteed to
+ * not be NULL. On success the callback must return 0. On any failure the
+ * callback must return a value less than 0.
  *
  * \param[in] priv         Callback's private data
  * \param[in] new_size     Size in bytes needed by the buffer
@@ -58,8 +63,9 @@
 /*!\brief release frame buffer callback prototype
  *
  * This callback is invoked by the decoder when the frame buffer is not
- * referenced by any other buffers. On success the callback must return 0.
- * Any failure the callback must return a value less than 0.
+ * referenced by any other buffers. |fb| is guaranteed to not be NULL. On
+ * success the callback must return 0. On any failure the callback must return
+ * a value less than 0.
  *
  * \param[in] priv         Callback's private data
  * \param[in] fb           Pointer to vpx_codec_frame_buffer_t
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index d27325c..8d0f4ec 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -28,7 +28,7 @@
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_IMAGE_ABI_VERSION (1) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/
 
 
 #define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format */
@@ -139,6 +139,8 @@
     unsigned char *img_data;       /**< private */
     int      img_data_owner; /**< private */
     int      self_allocd;    /**< private */
+
+    void    *fb_priv; /**< Frame buffer data associated with the image. */
   } vpx_image_t; /**< alias for struct vpx_image */
 
   /**\brief Representation of a rectangle on a surface */
diff --git a/vpx_ports/mem_ops_aligned.h b/vpx_ports/mem_ops_aligned.h
index da7c65d..24743c8 100644
--- a/vpx_ports/mem_ops_aligned.h
+++ b/vpx_ports/mem_ops_aligned.h
@@ -11,6 +11,8 @@
 #ifndef VPX_PORTS_MEM_OPS_ALIGNED_H_
 #define VPX_PORTS_MEM_OPS_ALIGNED_H_
 
+#include "vpx/vpx_integer.h"
+
 /* \file
  * \brief Provides portable memory access primitives for operating on aligned
  *        data
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index ab0a30a..5e95d31 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -60,7 +60,7 @@
     const int frame_size = yplane_size + 2 * uvplane_size;
 
     if (!ybf->buffer_alloc) {
-      ybf->buffer_alloc = vpx_memalign(32, frame_size);
+      ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
       ybf->buffer_alloc_sz = frame_size;
     }
 
@@ -180,12 +180,12 @@
       // removed if border is totally removed.
       vpx_memset(fb->data, 0, fb->size);
 
-      ybf->buffer_alloc = yv12_align_addr(fb->data, 32);
+      ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32);
     } else if (frame_size > ybf->buffer_alloc_sz) {
       // Allocation to hold larger frame, or first allocation.
       if (ybf->buffer_alloc)
         vpx_free(ybf->buffer_alloc);
-      ybf->buffer_alloc = vpx_memalign(32, frame_size);
+      ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
       if (!ybf->buffer_alloc)
         return -1;
 
diff --git a/vpx_scale/vpx_scale_rtcd.sh b/vpx_scale/vpx_scale_rtcd.sh
index a5faf11..1d02b69 100644
--- a/vpx_scale/vpx_scale_rtcd.sh
+++ b/vpx_scale/vpx_scale_rtcd.sh
@@ -6,7 +6,7 @@
 forward_decls vpx_scale_forward_decls
 
 # Scaler functions
-if [ "CONFIG_SPATIAL_RESAMPLING" != "yes" ]; then
+if [ "$CONFIG_SPATIAL_RESAMPLING" = "yes" ]; then
     prototype void vp8_horizontal_line_5_4_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
     prototype void vp8_vertical_band_5_4_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
     prototype void vp8_horizontal_line_5_3_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index 525f3a0..cdde75c 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -24,62 +24,60 @@
 #define VP9_ENC_BORDER_IN_PIXELS    160
 #define VP9_DEC_BORDER_IN_PIXELS    32
 
-  typedef struct yv12_buffer_config {
-    int   y_width;
-    int   y_height;
-    int   y_crop_width;
-    int   y_crop_height;
-    int   y_stride;
-    /*    int   yinternal_width; */
+typedef struct yv12_buffer_config {
+  int   y_width;
+  int   y_height;
+  int   y_crop_width;
+  int   y_crop_height;
+  int   y_stride;
 
-    int   uv_width;
-    int   uv_height;
-    int   uv_crop_width;
-    int   uv_crop_height;
-    int   uv_stride;
-    /*    int   uvinternal_width; */
+  int   uv_width;
+  int   uv_height;
+  int   uv_crop_width;
+  int   uv_crop_height;
+  int   uv_stride;
 
-    int   alpha_width;
-    int   alpha_height;
-    int   alpha_stride;
+  int   alpha_width;
+  int   alpha_height;
+  int   alpha_stride;
 
-    uint8_t *y_buffer;
-    uint8_t *u_buffer;
-    uint8_t *v_buffer;
-    uint8_t *alpha_buffer;
+  uint8_t *y_buffer;
+  uint8_t *u_buffer;
+  uint8_t *v_buffer;
+  uint8_t *alpha_buffer;
 
-    uint8_t *buffer_alloc;
-    int buffer_alloc_sz;
-    int border;
-    int frame_size;
+  uint8_t *buffer_alloc;
+  int buffer_alloc_sz;
+  int border;
+  int frame_size;
 
-    int corrupted;
-    int flags;
-  } YV12_BUFFER_CONFIG;
+  int corrupted;
+  int flags;
+} YV12_BUFFER_CONFIG;
 
-  int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                int width, int height, int border);
+int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                   int width, int height, int border);
-  int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
-                                    int width, int height, int border);
-  int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
-  int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                           int width, int height, int ss_x, int ss_y,
+                           int border);
+
+// Updates the yv12 buffer config with the frame buffer. If cb is not
+// NULL, then libvpx is using the frame buffer callbacks to handle memory.
+// If cb is not NULL, libvpx will call cb with minimum size in bytes needed
+// to decode the current frame. If cb is NULL, libvpx will allocate memory
+// internally to decode the current frame. Returns 0 on success. Returns < 0
+// on failure.
+int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                              int width, int height, int ss_x, int ss_y,
-                             int border);
-
-  // Updates the yv12 buffer config with the frame buffer. If cb is not
-  // NULL, then libvpx is using the frame buffer callbacks to handle memory.
-  // If cb is not NULL, libvpx will call cb with minimum size in bytes needed
-  // to decode the current frame. If cb is NULL, libvpx will allocate memory
-  // internally to decode the current frame. Returns 0 on success. Returns < 0
-  // on failure.
-  int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
-                               int width, int height, int ss_x, int ss_y,
-                               int border,
-                               vpx_codec_frame_buffer_t *fb,
-                               vpx_get_frame_buffer_cb_fn_t cb,
-                               void *cb_priv);
-  int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+                             int border,
+                             vpx_codec_frame_buffer_t *fb,
+                             vpx_get_frame_buffer_cb_fn_t cb,
+                             void *cb_priv);
+int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
 #ifdef __cplusplus
 }
diff --git a/vpxdec.c b/vpxdec.c
index e85c4fa..b69e55e 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -23,6 +23,7 @@
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
+#include "vpx_ports/mem_ops.h"
 #include "vpx_ports/vpx_timer.h"
 
 #if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
@@ -75,6 +76,8 @@
 static const arg_def_t scalearg = ARG_DEF("S", "scale", 0,
                                             "Scale output frames uniformly");
 
+static const arg_def_t fb_arg =
+    ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use");
 
 static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
                                         "Compute the MD5 sum of the decoded frame");
@@ -82,7 +85,7 @@
 static const arg_def_t *all_args[] = {
   &codecarg, &use_yv12, &use_i420, &flipuvarg, &noblitarg,
   &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
-  &threadsarg, &verbosearg, &scalearg,
+  &threadsarg, &verbosearg, &scalearg, &fb_arg,
   &md5arg,
   &error_concealment,
   NULL
@@ -296,10 +299,73 @@
   return is_raw;
 }
 
-void show_progress(int frame_in, int frame_out, unsigned long dx_time) {
-  fprintf(stderr, "%d decoded frames/%d showed frames in %lu us (%.2f fps)\r",
+void show_progress(int frame_in, int frame_out, uint64_t dx_time) {
+  fprintf(stderr,  // The trailing '\r' keeps the status on one terminal line.
+          "%d decoded frames/%d showed frames in %" PRIu64 " us (%.2f fps)\r",
           frame_in, frame_out, dx_time,
-          (float)frame_out * 1000000.0 / (float)dx_time);
+          (double)frame_out * 1000000.0 / (double)dx_time);  // frames/second
+}
+
+struct ExternalFrameBuffer {
+  uint8_t* data;  // Heap memory for the buffer, set by get_vp9_frame_buffer().
+  size_t size;    // Number of bytes allocated at |data|.
+  int in_use;     // Set to 1 when handed out, back to 0 when released.
+};
+
+struct ExternalFrameBufferList {
+  int num_external_frame_buffers;      // Number of entries in |ext_fb|.
+  struct ExternalFrameBuffer *ext_fb;  // Array allocated by main_loop().
+};
+
+// Callback used by libvpx to request an external frame buffer. |cb_priv| is
+// the private data passed into the set function, |min_size| is the minimum
+// size in bytes needed to decode the next frame and |fb| receives the buffer.
+// Returns 0 on success, -1 if no buffer is free or allocation fails.
+int get_vp9_frame_buffer(void *cb_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb) {
+  int i;
+  struct ExternalFrameBuffer *ext_fb;
+  struct ExternalFrameBufferList *const ext_fb_list =
+      (struct ExternalFrameBufferList *)cb_priv;
+  if (ext_fb_list == NULL || ext_fb_list->ext_fb == NULL || fb == NULL)
+    return -1;
+
+  // Find a free frame buffer.
+  for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) {
+    if (!ext_fb_list->ext_fb[i].in_use)
+      break;
+  }
+  if (i == ext_fb_list->num_external_frame_buffers)
+    return -1;
+  ext_fb = &ext_fb_list->ext_fb[i];
+
+  // Grow the buffer if it is too small; its old contents are not preserved.
+  if (ext_fb->size < min_size) {
+    free(ext_fb->data);
+    ext_fb->data = (uint8_t *)malloc(min_size);
+    // Keep |size| in step with |data| so a failed malloc is never reused.
+    ext_fb->size = ext_fb->data ? min_size : 0;
+    if (!ext_fb->data)
+      return -1;
+  }
+
+  fb->data = ext_fb->data;
+  fb->size = ext_fb->size;
+  fb->priv = ext_fb;  // Lets release_vp9_frame_buffer() find this entry.
+  ext_fb->in_use = 1;
+  return 0;
+}
+
+// Callback invoked by libvpx once nothing references the frame buffer any
+// more. |cb_priv| is the user private data passed into the set function
+// (unused here). |fb| is the frame buffer being handed back.
+int release_vp9_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
+  struct ExternalFrameBuffer *released;
+  (void)cb_priv;
+  // |priv| was pointed at the owning list entry when the buffer was issued.
+  released = (struct ExternalFrameBuffer *)fb->priv;
+  released->in_use = 0;
+  return 0;
 }
 
 void generate_filename(const char *pattern, char *out, size_t q_len,
@@ -418,6 +484,7 @@
 int main_loop(int argc, const char **argv_) {
   vpx_codec_ctx_t       decoder;
   char                  *fn = NULL;
+  int                    i;
   uint8_t               *buf = NULL;
   size_t                 bytes_in_buffer = 0, buffer_size = 0;
   FILE                  *infile;
@@ -428,7 +495,7 @@
   int                    ec_enabled = 0;
   const VpxInterface *interface = NULL;
   const VpxInterface *fourcc_interface = NULL;
-  unsigned long          dx_time = 0;
+  uint64_t dx_time = 0;
   struct arg               arg;
   char                   **argv, **argi, **argj;
 
@@ -447,6 +514,8 @@
   int                     do_scale = 0;
   vpx_image_t             *scaled_img = NULL;
   int                     frame_avail, got_data;
+  int                     num_external_frame_buffers = 0;
+  struct ExternalFrameBufferList ext_fb_list = {0};
 
   const char *outfile_pattern = NULL;
   char outfile_name[PATH_MAX] = {0};
@@ -505,6 +574,8 @@
       quiet = 0;
     else if (arg_match(&arg, &scalearg, argi))
       do_scale = 1;
+    else if (arg_match(&arg, &fb_arg, argi))
+      num_external_frame_buffers = arg_parse_uint(&arg);
 
 #if CONFIG_VP8_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
@@ -691,6 +762,19 @@
     arg_skip--;
   }
 
+  if (num_external_frame_buffers > 0) {
+    ext_fb_list.num_external_frame_buffers = num_external_frame_buffers;
+    ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
+        num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb));
+    if (vpx_codec_set_frame_buffer_functions(
+            &decoder, get_vp9_frame_buffer, release_vp9_frame_buffer,
+            &ext_fb_list)) {
+      fprintf(stderr, "Failed to configure external frame buffers: %s\n",
+              vpx_codec_error(&decoder));
+      return EXIT_FAILURE;
+    }
+  }
+
   frame_avail = 1;
   got_data = 0;
 
@@ -709,7 +793,8 @@
 
         vpx_usec_timer_start(&timer);
 
-        if (vpx_codec_decode(&decoder, buf, bytes_in_buffer, NULL, 0)) {
+        if (vpx_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer,
+                             NULL, 0)) {
           const char *detail = vpx_codec_error_detail(&decoder);
           warn("Failed to decode frame %d: %s",
                frame_in, vpx_codec_error(&decoder));
@@ -720,7 +805,7 @@
         }
 
         vpx_usec_timer_mark(&timer);
-        dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
+        dx_time += vpx_usec_timer_elapsed(&timer);
       }
     }
 
@@ -791,7 +876,7 @@
                                         vpx_input_ctx.height,
                                         &vpx_input_ctx.framerate, img->fmt);
             if (do_md5) {
-              MD5Update(&md5_ctx, (md5byte *)buf, len);
+              MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
             } else {
               fputs(buf, outfile);
             }
@@ -800,7 +885,7 @@
           // Y4M frame header
           len = y4m_write_frame_header(buf, sizeof(buf));
           if (do_md5) {
-            MD5Update(&md5_ctx, (md5byte *)buf, len);
+            MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
           } else {
             fputs(buf, outfile);
           }
@@ -863,6 +948,11 @@
 
   if (scaled_img) vpx_img_free(scaled_img);
 
+  for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) {
+    free(ext_fb_list.ext_fb[i].data);
+  }
+  free(ext_fb_list.ext_fb);
+
   fclose(infile);
   free(argv);
 
diff --git a/vpxenc.c b/vpxenc.c
index 73b3144..c61d83e 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -231,6 +231,10 @@
 static const arg_def_t disable_warning_prompt =
     ARG_DEF("y", "disable-warning-prompt", 0,
             "Display warnings, but do not prompt user to continue.");
+static const arg_def_t experimental_bitstream =
+    ARG_DEF(NULL, "experimental-bitstream", 0,
+            "Allow experimental bitstream features.");
+
 
 static const arg_def_t *main_args[] = {
   &debugmode,
@@ -713,21 +717,12 @@
       global->disable_warnings = 1;
     else if (arg_match(&arg, &disable_warning_prompt, argi))
       global->disable_warning_prompt = 1;
+    else if (arg_match(&arg, &experimental_bitstream, argi))
+      global->experimental_bitstream = 1;
     else
       argj++;
   }
 
-  /* Validate global config */
-  if (global->passes == 0) {
-#if CONFIG_VP9_ENCODER
-    // Make default VP9 passes = 2 until there is a better quality 1-pass
-    // encoder
-    global->passes = strcmp(global->codec->name, "vp9") == 0 ? 2 : 1;
-#else
-    global->passes = 1;
-#endif
-  }
-
   if (global->pass) {
     /* DWIM: Assume the user meant passes=2 if pass=2 is specified */
     if (global->pass > global->passes) {
@@ -736,6 +731,23 @@
       global->passes = global->pass;
     }
   }
+  /* Validate global config */
+  if (global->passes == 0) {
+#if CONFIG_VP9_ENCODER
+    // Make default VP9 passes = 2 until there is a better quality 1-pass
+    // encoder
+    global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
+                      global->deadline != VPX_DL_REALTIME) ? 2 : 1;
+#else
+    global->passes = 1;
+#endif
+  }
+
+  if (global->deadline == VPX_DL_REALTIME &&
+      global->passes > 1) {
+    warn("Enforcing one-pass encoding in realtime mode\n");
+    global->passes = 1;
+  }
 }
 
 
@@ -826,6 +838,10 @@
 
     /* Allows removal of the application version from the EBML tags */
     stream->ebml.debug = global->debug;
+
+    /* Default lag_in_frames is 0 in realtime mode */
+    if (global->deadline == VPX_DL_REALTIME)
+      stream->config.cfg.g_lag_in_frames = 0;
   }
 
   /* Output files must be specified for each stream */
@@ -874,59 +890,63 @@
       continue;
     }
 
-    if (0);
-    else if (arg_match(&arg, &outputfile, argi))
+    if (0) {
+    } else if (arg_match(&arg, &outputfile, argi)) {
       config->out_fn = arg.val;
-    else if (arg_match(&arg, &fpf_name, argi))
+    } else if (arg_match(&arg, &fpf_name, argi)) {
       config->stats_fn = arg.val;
-    else if (arg_match(&arg, &use_ivf, argi))
+    } else if (arg_match(&arg, &use_ivf, argi)) {
       config->write_webm = 0;
-    else if (arg_match(&arg, &threads, argi))
+    } else if (arg_match(&arg, &threads, argi)) {
       config->cfg.g_threads = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &profile, argi))
+    } else if (arg_match(&arg, &profile, argi)) {
       config->cfg.g_profile = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &width, argi))
+    } else if (arg_match(&arg, &width, argi)) {
       config->cfg.g_w = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &height, argi))
+    } else if (arg_match(&arg, &height, argi)) {
       config->cfg.g_h = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &stereo_mode, argi))
+    } else if (arg_match(&arg, &stereo_mode, argi)) {
       config->stereo_fmt = arg_parse_enum_or_int(&arg);
-    else if (arg_match(&arg, &timebase, argi)) {
+    } else if (arg_match(&arg, &timebase, argi)) {
       config->cfg.g_timebase = arg_parse_rational(&arg);
       validate_positive_rational(arg.name, &config->cfg.g_timebase);
-    } else if (arg_match(&arg, &error_resilient, argi))
+    } else if (arg_match(&arg, &error_resilient, argi)) {
       config->cfg.g_error_resilient = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &lag_in_frames, argi))
+    } else if (arg_match(&arg, &lag_in_frames, argi)) {
       config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &dropframe_thresh, argi))
+      if (global->deadline == VPX_DL_REALTIME &&
+          config->cfg.g_lag_in_frames != 0) {
+        warn("non-zero %s option ignored in realtime mode.\n", arg.name);
+        config->cfg.g_lag_in_frames = 0;
+      }
+    } else if (arg_match(&arg, &dropframe_thresh, argi)) {
       config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &resize_allowed, argi))
+    } else if (arg_match(&arg, &resize_allowed, argi)) {
       config->cfg.rc_resize_allowed = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &resize_up_thresh, argi))
+    } else if (arg_match(&arg, &resize_up_thresh, argi)) {
       config->cfg.rc_resize_up_thresh = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &resize_down_thresh, argi))
+    } else if (arg_match(&arg, &resize_down_thresh, argi)) {
       config->cfg.rc_resize_down_thresh = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &end_usage, argi))
+    } else if (arg_match(&arg, &end_usage, argi)) {
       config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
-    else if (arg_match(&arg, &target_bitrate, argi))
+    } else if (arg_match(&arg, &target_bitrate, argi)) {
       config->cfg.rc_target_bitrate = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &min_quantizer, argi))
+    } else if (arg_match(&arg, &min_quantizer, argi)) {
       config->cfg.rc_min_quantizer = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &max_quantizer, argi))
+    } else if (arg_match(&arg, &max_quantizer, argi)) {
       config->cfg.rc_max_quantizer = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &undershoot_pct, argi))
+    } else if (arg_match(&arg, &undershoot_pct, argi)) {
       config->cfg.rc_undershoot_pct = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &overshoot_pct, argi))
+    } else if (arg_match(&arg, &overshoot_pct, argi)) {
       config->cfg.rc_overshoot_pct = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &buf_sz, argi))
+    } else if (arg_match(&arg, &buf_sz, argi)) {
       config->cfg.rc_buf_sz = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &buf_initial_sz, argi))
+    } else if (arg_match(&arg, &buf_initial_sz, argi)) {
       config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &buf_optimal_sz, argi))
+    } else if (arg_match(&arg, &buf_optimal_sz, argi)) {
       config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &bias_pct, argi)) {
-      config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
-
+    } else if (arg_match(&arg, &bias_pct, argi)) {
+        config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
       if (global->passes < 2)
         warn("option %s ignored in one-pass mode.\n", arg.name);
     } else if (arg_match(&arg, &minsection_pct, argi)) {
@@ -939,16 +959,15 @@
 
       if (global->passes < 2)
         warn("option %s ignored in one-pass mode.\n", arg.name);
-    } else if (arg_match(&arg, &kf_min_dist, argi))
+    } else if (arg_match(&arg, &kf_min_dist, argi)) {
       config->cfg.kf_min_dist = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &kf_max_dist, argi)) {
+    } else if (arg_match(&arg, &kf_max_dist, argi)) {
       config->cfg.kf_max_dist = arg_parse_uint(&arg);
       config->have_kf_max_dist = 1;
-    } else if (arg_match(&arg, &kf_disabled, argi))
+    } else if (arg_match(&arg, &kf_disabled, argi)) {
       config->cfg.kf_mode = VPX_KF_DISABLED;
-    else {
+    } else {
       int i, match = 0;
-
       for (i = 0; ctrl_args[i]; i++) {
         if (arg_match(&arg, ctrl_args[i], argi)) {
           int j;
@@ -972,12 +991,10 @@
 
         }
       }
-
       if (!match)
         argj++;
     }
   }
-
   return eos_mark_found;
 }
 
@@ -991,13 +1008,20 @@
   } while (0)
 
 
-static void validate_stream_config(struct stream_state *stream) {
-  struct stream_state *streami;
+static void validate_stream_config(const struct stream_state *stream,
+                                   const struct VpxEncoderConfig *global) {
+  const struct stream_state *streami;
 
   if (!stream->config.cfg.g_w || !stream->config.cfg.g_h)
     fatal("Stream %d: Specify stream dimensions with --width (-w) "
           " and --height (-h)", stream->index);
 
+  if (stream->config.cfg.g_profile != 0 && !global->experimental_bitstream) {
+    fatal("Stream %d: profile %d is experimental and requires the --%s flag",
+          stream->index, stream->config.cfg.g_profile,
+          experimental_bitstream.long_name);
+  }
+
   for (streami = stream; streami; streami = streami->next) {
     /* All streams require output files */
     if (!streami->config.out_fn)
@@ -1375,8 +1399,8 @@
     return;
 
   fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
-  ovpsnr = vp8_mse2psnr((double)stream->psnr_samples_total, 255.0,
-                        (double)stream->psnr_sse_total);
+  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, 255.0,
+                       (double)stream->psnr_sse_total);
   fprintf(stderr, " %.3f", ovpsnr);
 
   for (i = 0; i < 4; i++) {
@@ -1529,11 +1553,9 @@
   if (!input.filename)
     usage_exit();
 
-#if CONFIG_NON420
   /* Decide if other chroma subsamplings than 4:2:0 are supported */
   if (global.codec->fourcc == VP9_FOURCC)
     input.only_i420 = 0;
-#endif
 
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
     int frames_in = 0, seen_frames = 0;
@@ -1560,7 +1582,7 @@
       fatal("Specify stream dimensions with --width (-w) "
             " and --height (-h)");
     FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
-    FOREACH_STREAM(validate_stream_config(stream));
+    FOREACH_STREAM(validate_stream_config(stream, &global));
 
     /* Ensure that --passes and --pass are consistent. If --pass is set and
      * --passes=2, ensure --fpf was set.
diff --git a/vpxenc.h b/vpxenc.h
index 1e6acaa..a8c3722 100644
--- a/vpxenc.h
+++ b/vpxenc.h
@@ -46,6 +46,7 @@
   int show_rate_hist_buckets;
   int disable_warnings;
   int disable_warning_prompt;
+  int experimental_bitstream;
 };
 
 #ifdef __cplusplus
diff --git a/vpxstats.c b/vpxstats.c
index 70cea3e..5f88f8d 100644
--- a/vpxstats.c
+++ b/vpxstats.c
@@ -120,16 +120,3 @@
 vpx_fixed_buf_t stats_get(stats_io_t *stats) {
   return stats->buf;
 }
-
-double vp8_mse2psnr(double samples, double peak, double mse) {
-  const int kMaxPSNR = 100;
-  double psnr = kMaxPSNR;
-
-  if (mse > 0.0)
-    psnr = 10.0 * log10(peak * peak * samples / mse);
-
-  if (psnr > kMaxPSNR)
-    psnr = kMaxPSNR;
-
-  return psnr;
-}
diff --git a/vpxstats.h b/vpxstats.h
index 9ce9c53..5c9ea34 100644
--- a/vpxstats.h
+++ b/vpxstats.h
@@ -36,8 +36,6 @@
 void stats_write(stats_io_t *stats, const void *pkt, size_t len);
 vpx_fixed_buf_t stats_get(stats_io_t *stats);
 
-double vp8_mse2psnr(double samples, double peak, double mse);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/webmdec.c b/webmdec.c
index fdcf3a5..7cacdf9 100644
--- a/webmdec.c
+++ b/webmdec.c
@@ -12,7 +12,7 @@
 
 #include <stdarg.h>
 
-#include "nestegg/include/nestegg/nestegg.h"
+#include "third_party/nestegg/include/nestegg/nestegg.h"
 
 static int nestegg_read_cb(void *buffer, size_t length, void *userdata) {
   FILE *f = userdata;
@@ -65,7 +65,7 @@
   nestegg_video_params params;
 
   io.userdata = vpx_ctx->file;
-  if (nestegg_init(&webm_ctx->nestegg_ctx, io, NULL))
+  if (nestegg_init(&webm_ctx->nestegg_ctx, io, NULL, -1))
     goto fail;
 
   if (nestegg_track_count(webm_ctx->nestegg_ctx, &n))