Merge "Remove unused vp8_strict_quantize"
diff --git a/README b/README
index f9c24ff..6f864d8 100644
--- a/README
+++ b/README
@@ -65,6 +65,7 @@
     armv7-win32-vs12
     armv7s-darwin-gcc
     mips32-linux-gcc
+    mips64-linux-gcc
     ppc32-darwin8-gcc
     ppc32-darwin9-gcc
     ppc32-linux-gcc
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index 3653309..7907225 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -245,13 +245,13 @@
 case "$target" in
     x86_64*)
         platforms[0]="x64"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "\$(InputPath)""
-        asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "\$(InputPath)""
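+        # Use an explicit object format rather than $(PlatformName), whose
+        # expansion (e.g. "Win32"/"x64") need not match a yasm -f format name.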
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} "\$(InputPath)""
+        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} "\$(InputPath)""
     ;;
     x86*)
         platforms[0]="Win32"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "\$(InputPath)""
-        asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "\$(InputPath)""
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} "\$(InputPath)""
+        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} "\$(InputPath)""
     ;;
     *) die "Unsupported target $target!"
     ;;
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 23ef6a3..56b9a3b 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -253,13 +253,13 @@
 case "$target" in
     x86_64*)
         platforms[0]="x64"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "%(FullPath)""
-        asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "%(FullPath)""
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} "%(FullPath)""
+        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} "%(FullPath)""
     ;;
     x86*)
         platforms[0]="Win32"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "%(FullPath)""
-        asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "%(FullPath)""
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} "%(FullPath)""
+        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} "%(FullPath)""
     ;;
     arm*)
         asm_Debug_cmdline="armasm -nologo "%(FullPath)""
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index 28ef69c..40bcb33 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -49,7 +49,7 @@
 
 my %config = ();
 while (<CONFIG_FILE>) {
-  next if !/^CONFIG_/;
+  next if !/^(?:CONFIG_|HAVE_)/;
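+  # Record HAVE_ values alongside CONFIG_ ones so feature flags from the
+  # config file are also available in %config.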
   chomp;
   my @pair = split /=/;
   $config{$pair[0]} = $pair[1];
@@ -365,13 +365,13 @@
   @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
   &require(@REQUIRES);
   x86;
-} elsif ($opts{arch} eq 'mips32') {
-  @ALL_ARCHS = filter(qw/mips32/);
+} elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') {
+  @ALL_ARCHS = filter("$opts{arch}");
   open CONFIG_FILE, $opts{config} or
     die "Error opening config file '$opts{config}': $!\n";
   while (<CONFIG_FILE>) {
     if (/HAVE_DSPR2=yes/) {
-      @ALL_ARCHS = filter(qw/mips32 dspr2/);
+      @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
       last;
     }
   }
diff --git a/configure b/configure
index 92ca061..7b9c211 100755
--- a/configure
+++ b/configure
@@ -111,6 +111,7 @@
 all_platforms="${all_platforms} armv7-win32-vs12"
 all_platforms="${all_platforms} armv7s-darwin-gcc"
 all_platforms="${all_platforms} mips32-linux-gcc"
+all_platforms="${all_platforms} mips64-linux-gcc"
 all_platforms="${all_platforms} ppc32-darwin8-gcc"
 all_platforms="${all_platforms} ppc32-darwin9-gcc"
 all_platforms="${all_platforms} ppc32-linux-gcc"
@@ -254,6 +255,8 @@
     mips32
     dspr2
 
+    mips64
+
     mmx
     sse
     sse2
@@ -278,6 +281,7 @@
     spatial_svc
     vp9_temporal_denoising
     fp_mb_stats
+    emulate_hardware_highbitdepth
 "
 CONFIG_LIST="
     external_build
@@ -331,6 +335,7 @@
     multi_res_encoding
     temporal_denoising
     coefficient_range_checking
+    vp9_highbitdepth
     experimental
     size_limit
     ${EXPERIMENT_LIST}
@@ -389,6 +394,7 @@
     multi_res_encoding
     temporal_denoising
     coefficient_range_checking
+    vp9_highbitdepth
     experimental
 "
 
diff --git a/examples.mk b/examples.mk
index 91bd45a..fd67a44 100644
--- a/examples.mk
+++ b/examples.mk
@@ -31,6 +31,7 @@
                 third_party/libyuv/source/scale_common.cc \
                 third_party/libyuv/source/scale_mips.cc \
                 third_party/libyuv/source/scale_neon.cc \
+                third_party/libyuv/source/scale_neon64.cc \
                 third_party/libyuv/source/scale_posix.cc \
                 third_party/libyuv/source/scale_win.cc \
 
@@ -113,7 +114,7 @@
 vpx_temporal_svc_encoder.SRCS        += video_writer.h video_writer.c
 vpx_temporal_svc_encoder.GUID        = B18C08F2-A439-4502-A78E-849BE3D60947
 vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
-EXAMPLES-$(CONFIG_VP8_DECODER)     += simple_decoder.c
+EXAMPLES-$(CONFIG_DECODERS)        += simple_decoder.c
 simple_decoder.GUID                 = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
 simple_decoder.SRCS                += ivfdec.h ivfdec.c
 simple_decoder.SRCS                += tools_common.h tools_common.c
@@ -122,7 +123,7 @@
 simple_decoder.SRCS                += vpx_ports/mem_ops.h
 simple_decoder.SRCS                += vpx_ports/mem_ops_aligned.h
 simple_decoder.DESCRIPTION          = Simplified decoder loop
-EXAMPLES-$(CONFIG_VP8_DECODER)     += postproc.c
+EXAMPLES-$(CONFIG_DECODERS)        += postproc.c
 postproc.SRCS                      += ivfdec.h ivfdec.c
 postproc.SRCS                      += tools_common.h tools_common.c
 postproc.SRCS                      += video_common.h
@@ -131,7 +132,7 @@
 postproc.SRCS                      += vpx_ports/mem_ops_aligned.h
 postproc.GUID                       = 65E33355-F35E-4088-884D-3FD4905881D7
 postproc.DESCRIPTION                = Decoder postprocessor control
-EXAMPLES-$(CONFIG_VP8_DECODER)     += decode_to_md5.c
+EXAMPLES-$(CONFIG_DECODERS)        += decode_to_md5.c
 decode_to_md5.SRCS                 += md5_utils.h md5_utils.c
 decode_to_md5.SRCS                 += ivfdec.h ivfdec.c
 decode_to_md5.SRCS                 += tools_common.h tools_common.c
@@ -141,29 +142,34 @@
 decode_to_md5.SRCS                 += vpx_ports/mem_ops_aligned.h
 decode_to_md5.GUID                  = 59120B9B-2735-4BFE-B022-146CA340FE42
 decode_to_md5.DESCRIPTION           = Frame by frame MD5 checksum
-EXAMPLES-$(CONFIG_VP8_ENCODER)  += simple_encoder.c
+EXAMPLES-$(CONFIG_ENCODERS)     += simple_encoder.c
 simple_encoder.SRCS             += ivfenc.h ivfenc.c
 simple_encoder.SRCS             += tools_common.h tools_common.c
 simple_encoder.SRCS             += video_common.h
 simple_encoder.SRCS             += video_writer.h video_writer.c
 simple_encoder.GUID              = 4607D299-8A71-4D2C-9B1D-071899B6FBFD
 simple_encoder.DESCRIPTION       = Simplified encoder loop
-EXAMPLES-$(CONFIG_VP8_ENCODER)  += twopass_encoder.c
+EXAMPLES-$(CONFIG_VP9_ENCODER)  += vp9_lossless_encoder.c
+vp9_lossless_encoder.SRCS       += ivfenc.h ivfenc.c
+vp9_lossless_encoder.SRCS       += tools_common.h tools_common.c
+vp9_lossless_encoder.SRCS       += video_common.h
+vp9_lossless_encoder.SRCS       += video_writer.h video_writer.c
+vp9_lossless_encoder.GUID        = B63C7C88-5348-46DC-A5A6-CC151EF93366
+vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder
+EXAMPLES-$(CONFIG_ENCODERS)     += twopass_encoder.c
 twopass_encoder.SRCS            += ivfenc.h ivfenc.c
 twopass_encoder.SRCS            += tools_common.h tools_common.c
 twopass_encoder.SRCS            += video_common.h
 twopass_encoder.SRCS            += video_writer.h video_writer.c
 twopass_encoder.GUID             = 73494FA6-4AF9-4763-8FBB-265C92402FD8
 twopass_encoder.DESCRIPTION      = Two-pass encoder loop
-ifeq ($(CONFIG_DECODERS),yes)
-EXAMPLES-$(CONFIG_VP8_ENCODER)  += decode_with_drops.c
+EXAMPLES-$(CONFIG_DECODERS)     += decode_with_drops.c
 decode_with_drops.SRCS          += ivfdec.h ivfdec.c
 decode_with_drops.SRCS          += tools_common.h tools_common.c
 decode_with_drops.SRCS          += video_common.h
 decode_with_drops.SRCS          += video_reader.h video_reader.c
 decode_with_drops.SRCS          += vpx_ports/mem_ops.h
 decode_with_drops.SRCS          += vpx_ports/mem_ops_aligned.h
-endif
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
 EXAMPLES-$(CONFIG_ENCODERS)        += set_maps.c
@@ -185,7 +191,9 @@
 ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
 ifeq ($(CONFIG_LIBYUV),yes)
 EXAMPLES-$(CONFIG_VP8_ENCODER)          += vp8_multi_resolution_encoder.c
+vp8_multi_resolution_encoder.SRCS       += ivfenc.h ivfenc.c
 vp8_multi_resolution_encoder.SRCS       += tools_common.h tools_common.c
+vp8_multi_resolution_encoder.SRCS       += video_writer.h video_writer.c
 vp8_multi_resolution_encoder.SRCS       += $(LIBYUV_SRCS)
 vp8_multi_resolution_encoder.GUID        = 04f8738e-63c8-423b-90fa-7c2703a374de
 vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c
index 1c56303..fbc0f4a 100644
--- a/examples/decode_to_md5.c
+++ b/examples/decode_to_md5.c
@@ -33,8 +33,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
-
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 
diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c
index a20fdac..9423e38 100644
--- a/examples/decode_with_drops.c
+++ b/examples/decode_with_drops.c
@@ -56,8 +56,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
-
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 
diff --git a/examples/postproc.c b/examples/postproc.c
index 59c50b1..c74347c 100644
--- a/examples/postproc.c
+++ b/examples/postproc.c
@@ -43,8 +43,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
-
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 
diff --git a/examples/set_maps.c b/examples/set_maps.c
index af8c582..851adc4 100644
--- a/examples/set_maps.c
+++ b/examples/set_maps.c
@@ -42,11 +42,11 @@
 // Use the `simple_decoder` example to decode this sample, and observe
 // the change in the image at frames 22, 33, and 44.
 
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 
@@ -125,10 +125,11 @@
     die_codec(codec, "Failed to set active map");
 }
 
-static void encode_frame(vpx_codec_ctx_t *codec,
-                         vpx_image_t *img,
-                         int frame_index,
-                         VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
   const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0,
@@ -137,6 +138,8 @@
     die_codec(codec, "Failed to encode frame");
 
   while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
     if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
       const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
       if (!vpx_video_writer_write_frame(writer,
@@ -150,6 +153,8 @@
       fflush(stdout);
     }
   }
+
+  return got_pkts;
 }
 
 int main(int argc, char **argv) {
@@ -172,9 +177,10 @@
   memset(&info, 0, sizeof(info));
 
   encoder = get_vpx_encoder_by_name(argv[1]);
-  if (!encoder)
+  if (encoder == NULL) {
     die("Unsupported codec.");
-
+  }
+  assert(encoder != NULL);
   info.codec_fourcc = encoder->fourcc;
   info.frame_width = strtol(argv[2], NULL, 0);
   info.frame_height = strtol(argv[3], NULL, 0);
@@ -217,6 +223,7 @@
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
     die_codec(&codec, "Failed to initialize encoder");
 
+  // Encode frames.
   while (vpx_img_read(&raw, infile)) {
     ++frame_count;
 
@@ -230,7 +237,10 @@
 
     encode_frame(&codec, &raw, frame_count, writer);
   }
-  encode_frame(&codec, NULL, -1, writer);
+
+  // Flush encoder.
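+  // A NULL image signals end-of-stream; keep calling until encode_frame()
+  // reports that no more packets were produced.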
+  while (encode_frame(&codec, NULL, -1, writer)) {}
+
   printf("\n");
   fclose(infile);
   printf("Processed %d frames.\n", frame_count);
diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c
index 3318758..3f7d6aa 100644
--- a/examples/simple_decoder.c
+++ b/examples/simple_decoder.c
@@ -29,9 +29,7 @@
 // -----------------
 // For decoders, you only have to include `vpx_decoder.h` and then any
 // header files for the specific codecs you use. In this case, we're using
-// vp8. The `VPX_CODEC_DISABLE_COMPAT` macro can be defined to ensure
-// strict compliance with the latest SDK by disabling some backwards
-// compatibility features. Defining this macro is encouraged.
+// vp8.
 //
 // Initializing The Codec
 // ----------------------
@@ -81,8 +79,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
-
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
 
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index 30bb73a..f20c246 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -28,9 +28,7 @@
 // -----------------
 // For encoders, you only have to include `vpx_encoder.h` and then any
 // header files for the specific codecs you use. In this case, we're using
-// vp8. The `VPX_CODEC_DISABLE_COMPAT` macro can be defined to ensure
-// strict compliance with the latest SDK by disabling some backwards
-// compatibility features. Defining this macro is encouraged.
+// vp8.
 //
 // Getting The Default Configuration
 // ---------------------------------
@@ -101,7 +99,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 
 #include "./tools_common.h"
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index 369b1d8..653ae94 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -28,9 +28,8 @@
 // Encoding A Frame
 // ----------------
 // Encoding a frame in two pass mode is identical to the simple encoder
-// example, except the deadline is set to VPX_DL_BEST_QUALITY to get the
-// best quality possible. VPX_DL_GOOD_QUALITY could also be used.
-//
+// example. To increase the quality while sacrificing encoding speed,
+// VPX_DL_BEST_QUALITY can be used in place of VPX_DL_GOOD_QUALITY.
 //
 // Processing Statistics Packets
 // -----------------------------
@@ -52,7 +51,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 
 #include "./tools_common.h"
@@ -66,13 +64,14 @@
   exit(EXIT_FAILURE);
 }
 
-static void get_frame_stats(vpx_codec_ctx_t *ctx,
-                            const vpx_image_t *img,
-                            vpx_codec_pts_t pts,
-                            unsigned int duration,
-                            vpx_enc_frame_flags_t flags,
-                            unsigned int deadline,
-                            vpx_fixed_buf_t *stats) {
+static int get_frame_stats(vpx_codec_ctx_t *ctx,
+                           const vpx_image_t *img,
+                           vpx_codec_pts_t pts,
+                           unsigned int duration,
+                           vpx_enc_frame_flags_t flags,
+                           unsigned int deadline,
+                           vpx_fixed_buf_t *stats) {
+  int got_pkts = 0;
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
   const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
@@ -81,6 +80,8 @@
     die_codec(ctx, "Failed to get frame stats.");
 
   while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+
     if (pkt->kind == VPX_CODEC_STATS_PKT) {
       const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
       const size_t pkt_size = pkt->data.twopass_stats.sz;
@@ -89,15 +90,18 @@
       stats->sz += pkt_size;
     }
   }
+
+  return got_pkts;
 }
 
-static void encode_frame(vpx_codec_ctx_t *ctx,
-                         const vpx_image_t *img,
-                         vpx_codec_pts_t pts,
-                         unsigned int duration,
-                         vpx_enc_frame_flags_t flags,
-                         unsigned int deadline,
-                         VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *ctx,
+                        const vpx_image_t *img,
+                        vpx_codec_pts_t pts,
+                        unsigned int duration,
+                        vpx_enc_frame_flags_t flags,
+                        unsigned int deadline,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
   const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
@@ -106,6 +110,7 @@
     die_codec(ctx, "Failed to encode frame.");
 
   while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
     if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
       const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
 
@@ -117,19 +122,90 @@
       fflush(stdout);
     }
   }
+
+  return got_pkts;
+}
+
+static vpx_fixed_buf_t pass0(vpx_image_t *raw,
+                             FILE *infile,
+                             const VpxInterface *encoder,
+                             const vpx_codec_enc_cfg_t *cfg) {
+  vpx_codec_ctx_t codec;
+  int frame_count = 0;
+  vpx_fixed_buf_t stats = {NULL, 0};
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Calculate frame statistics.
+  while (vpx_img_read(raw, infile)) {
+    ++frame_count;
+    get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY,
+                    &stats);
+  }
+
+  // Flush encoder.
+  while (get_frame_stats(&codec, NULL, frame_count, 1, 0,
+                         VPX_DL_GOOD_QUALITY, &stats)) {}
+
+  printf("Pass 0 complete. Processed %d frames.\n", frame_count);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  return stats;
+}
+
+static void pass1(vpx_image_t *raw,
+                  FILE *infile,
+                  const char *outfile_name,
+                  const VpxInterface *encoder,
+                  const vpx_codec_enc_cfg_t *cfg) {
+  VpxVideoInfo info = {
+    encoder->fourcc,
+    cfg->g_w,
+    cfg->g_h,
+    {cfg->g_timebase.num, cfg->g_timebase.den}
+  };
+  VpxVideoWriter *writer = NULL;
+  vpx_codec_ctx_t codec;
+  int frame_count = 0;
+
+  writer = vpx_video_writer_open(outfile_name, kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing", outfile_name);
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Encode frames.
+  while (vpx_img_read(raw, infile)) {
+    ++frame_count;
+    encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_GOOD_QUALITY, writer)) {}
+
+  printf("\n");
+
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_writer_close(writer);
+
+  printf("Pass 1 complete. Processed %d frames.\n", frame_count);
 }
 
 int main(int argc, char **argv) {
   FILE *infile = NULL;
-  VpxVideoWriter *writer = NULL;
+  int w, h;
   vpx_codec_ctx_t codec;
   vpx_codec_enc_cfg_t cfg;
   vpx_image_t raw;
   vpx_codec_err_t res;
-  vpx_fixed_buf_t stats = {0};
-  VpxVideoInfo info = {0};
+  vpx_fixed_buf_t stats;
+
   const VpxInterface *encoder = NULL;
-  int pass;
   const int fps = 30;        // TODO(dkovalev) add command line argument
   const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
   const char *const codec_arg = argv[1];
@@ -146,85 +222,44 @@
   if (!encoder)
     die("Unsupported codec.");
 
-  info.codec_fourcc = encoder->fourcc;
-  info.time_base.numerator = 1;
-  info.time_base.denominator = fps;
-  info.frame_width = strtol(width_arg, NULL, 0);
-  info.frame_height = strtol(height_arg, NULL, 0);
+  w = strtol(width_arg, NULL, 0);
+  h = strtol(height_arg, NULL, 0);
 
-  if (info.frame_width <= 0 ||
-      info.frame_height <= 0 ||
-      (info.frame_width % 2) != 0 ||
-      (info.frame_height % 2) != 0) {
-    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
-  }
+  if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+    die("Invalid frame size: %dx%d", w, h);
 
-  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
-                                             info.frame_height, 1)) {
-    die("Failed to allocate image", info.frame_width, info.frame_height);
-  }
-
-  writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
-  if (!writer)
-    die("Failed to open %s for writing", outfile_arg);
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1))
+    die("Failed to allocate image", w, h);
 
   printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
 
+  // Configuration
   res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
   if (res)
     die_codec(&codec, "Failed to get default codec config.");
 
-  cfg.g_w = info.frame_width;
-  cfg.g_h = info.frame_height;
-  cfg.g_timebase.num = info.time_base.numerator;
-  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.g_w = w;
+  cfg.g_h = h;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = fps;
   cfg.rc_target_bitrate = bitrate;
 
-  for (pass = 0; pass < 2; ++pass) {
-    int frame_count = 0;
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading", infile_arg);
 
-    if (pass == 0) {
-      cfg.g_pass = VPX_RC_FIRST_PASS;
-    } else {
-      cfg.g_pass = VPX_RC_LAST_PASS;
-      cfg.rc_twopass_stats_in = stats;
-    }
+  // Pass 0
+  cfg.g_pass = VPX_RC_FIRST_PASS;
+  stats = pass0(&raw, infile, encoder, &cfg);
 
-    if (!(infile = fopen(infile_arg, "rb")))
-      die("Failed to open %s for reading", infile_arg);
-
-    if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-      die_codec(&codec, "Failed to initialize encoder");
-
-    while (vpx_img_read(&raw, infile)) {
-      ++frame_count;
-
-      if (pass == 0) {
-        get_frame_stats(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
-                        &stats);
-      } else {
-        encode_frame(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
-                     writer);
-      }
-    }
-
-    if (pass == 0) {
-      get_frame_stats(&codec, NULL, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
-                      &stats);
-    } else {
-      printf("\n");
-    }
-
-    fclose(infile);
-    printf("Pass %d complete. Processed %d frames.\n", pass + 1, frame_count);
-    if (vpx_codec_destroy(&codec))
-      die_codec(&codec, "Failed to destroy codec.");
-  }
-
-  vpx_img_free(&raw);
+  // Pass 1
+  rewind(infile);
+  cfg.g_pass = VPX_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in = stats;
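+  // "stats" holds the first-pass statistics packets concatenated by
+  // get_frame_stats() during pass 0.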
+  pass1(&raw, infile, outfile_arg, encoder, &cfg);
   free(stats.buf);
 
-  vpx_video_writer_close(writer);
+  vpx_img_free(&raw);
+  fclose(infile);
 
   return EXIT_SUCCESS;
 }
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index d41e442..9f50dc7 100644
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -8,446 +8,292 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-/*
- * This is an example demonstrating multi-resolution encoding in VP8.
- * High-resolution input video is down-sampled to lower-resolutions. The
- * encoder then encodes the video and outputs multiple bitstreams with
- * different resolutions.
- */
+
+// This is an example demonstrating multi-resolution encoding in VP8.
+// High-resolution input video is down-sampled to lower-resolutions. The
+// encoder then encodes the video and outputs multiple bitstreams with
+// different resolutions.
+//
+// Configure with the --enable-multi-res-encoding flag to enable this example.
+
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdarg.h>
 #include <string.h>
-#include <math.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx/vpx_encoder.h"
-#include "vpx/vp8cx.h"
-#include "vpx_ports/mem_ops.h"
-#include "./tools_common.h"
-#define interface (vpx_codec_vp8_cx())
-#define fourcc    0x30385056
 
-void usage_exit() {
-  exit(EXIT_FAILURE);
-}
-
-/*
- * The input video frame is downsampled several times to generate a multi-level
- * hierarchical structure. NUM_ENCODERS is defined as the number of encoding
- * levels required. For example, if the size of input video is 1280x720,
- * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3
- * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and
- * 320x180(level 2) respectively.
- */
-#define NUM_ENCODERS 3
-
-/* This example uses the scaler function in libyuv. */
 #include "third_party/libyuv/include/libyuv/basic_types.h"
 #include "third_party/libyuv/include/libyuv/scale.h"
 #include "third_party/libyuv/include/libyuv/cpu_id.h"
 
-int (*read_frame_p)(FILE *f, vpx_image_t *img);
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
 
-static int read_frame(FILE *f, vpx_image_t *img) {
-    size_t nbytes, to_read;
-    int    res = 1;
+#include "./tools_common.h"
+#include "./video_writer.h"
 
-    to_read = img->w*img->h*3/2;
-    nbytes = fread(img->planes[0], 1, to_read, f);
-    if(nbytes != to_read) {
-        res = 0;
-        if(nbytes > 0)
-            printf("Warning: Read partial frame. Check your width & height!\n");
-    }
-    return res;
+// The input video frame is downsampled several times to generate a
+// multi-level hierarchical structure. kNumEncoders is defined as the number
+// of encoding levels required. For example, if the size of the input video is
+// 1280x720, kNumEncoders is 3, and the down-sampling factor is 2, the encoder
+// outputs 3 bitstreams with resolutions of 1280x720 (level 0),
+// 640x360 (level 1), and 320x180 (level 2), respectively.
+#define kNumEncoders 3
+
+static const char *exec_name;
+
+void usage_exit() {
+  fprintf(stderr,
+          "Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
 }
 
-static int read_frame_by_row(FILE *f, vpx_image_t *img) {
-    size_t nbytes, to_read;
-    int    res = 1;
-    int plane;
+int main(int argc, char *argv[]) {
+  int frame_cnt = 0;
+  FILE *infile = NULL;
+  VpxVideoWriter *writers[kNumEncoders];
+  vpx_codec_ctx_t codec[kNumEncoders];
+  vpx_codec_enc_cfg_t cfg[kNumEncoders];
+  vpx_image_t raw[kNumEncoders];
+  const VpxInterface *const encoder = get_vpx_encoder_by_name("vp8");
+  // Currently, only realtime mode is supported in multi-resolution encoding.
+  const int arg_deadline = VPX_DL_REALTIME;
+  int i;
+  int width = 0;
+  int height = 0;
+  int frame_avail = 0;
+  int got_data = 0;
 
-    for (plane = 0; plane < 3; plane++)
+  // Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
+  // don't need to know PSNR, which will skip PSNR calculation and save
+  // encoding time.
+  int show_psnr = 0;
+  uint64_t psnr_sse_total[kNumEncoders] = {0};
+  uint64_t psnr_samples_total[kNumEncoders] = {0};
+  double psnr_totals[kNumEncoders][4] = {{0, 0}};
+  int psnr_count[kNumEncoders] = {0};
+
+  // Set the required target bitrates for each resolution level.
+  // If the target bitrate for the highest-resolution level is set to 0
+  // (i.e. target_bitrate[0] = 0), encoding at that level is skipped.
+  unsigned int target_bitrate[kNumEncoders] = {1000, 500, 100};
+
+  // Enter the frame rate of the input video.
+  const int framerate = 30;
+  // Set down-sampling factor for each resolution level.
+  //   dsf[0] controls down sampling from level 0 to level 1;
+  //   dsf[1] controls down sampling from level 1 to level 2;
+  //   dsf[2] is not used.
+  vpx_rational_t dsf[kNumEncoders] = {{2, 1}, {2, 1}, {1, 1}};
+
+  exec_name = argv[0];
+
+  if (!encoder)
+    die("Unsupported codec.");
+
+  // exe_name, input width, input height, input file,
+  // output file 1, output file 2, output file 3, psnr on/off
+  if (argc != (5 + kNumEncoders))
+    die("Invalid number of input options.");
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  width = strtol(argv[1], NULL, 0);
+  height = strtol(argv[2], NULL, 0);
+
+  if (width < 16 || width % 2 || height < 16 || height % 2)
+    die("Invalid resolution: %ldx%ld", width, height);
+
+  // Open input video file for encoding
+  if (!(infile = fopen(argv[3], "rb")))
+    die("Failed to open %s for reading", argv[3]);
+
+  show_psnr = strtol(argv[kNumEncoders + 4], NULL, 0);
+
+  // Populate default encoder configuration
+  for (i = 0; i < kNumEncoders; ++i) {
+    vpx_codec_err_t res =
+        vpx_codec_enc_config_default(encoder->codec_interface(), &cfg[i], 0);
+    if (res != VPX_CODEC_OK) {
+      printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
+      return EXIT_FAILURE;
+    }
+  }
+
+  // Update the default configuration according to needs of the application.
+  // Highest-resolution encoder settings
+  cfg[0].g_w = width;
+  cfg[0].g_h = height;
+  cfg[0].g_threads = 1;
+  cfg[0].rc_dropframe_thresh = 30;
+  cfg[0].rc_end_usage = VPX_CBR;
+  cfg[0].rc_resize_allowed = 0;
+  cfg[0].rc_min_quantizer = 4;
+  cfg[0].rc_max_quantizer = 56;
+  cfg[0].rc_undershoot_pct = 98;
+  cfg[0].rc_overshoot_pct = 100;
+  cfg[0].rc_buf_initial_sz = 500;
+  cfg[0].rc_buf_optimal_sz = 600;
+  cfg[0].rc_buf_sz = 1000;
+  cfg[0].g_error_resilient = 1;
+  cfg[0].g_lag_in_frames = 0;
+  cfg[0].kf_mode = VPX_KF_AUTO;  // Or use VPX_KF_DISABLED.
+  cfg[0].kf_min_dist = 3000;
+  cfg[0].kf_max_dist = 3000;
+  cfg[0].rc_target_bitrate = target_bitrate[0];
+  cfg[0].g_timebase.num = 1;
+  cfg[0].g_timebase.den = framerate;
+
+  // Other-resolution encoder settings
+  for (i = 1; i < kNumEncoders; ++i) {
+    cfg[i] = cfg[0];
+    cfg[i].g_threads = 1;
+    cfg[i].rc_target_bitrate = target_bitrate[i];
+
+    // Note: Width & height of other-resolution encoders are calculated
+    // from the highest-resolution encoder's size and the corresponding
+    // down_sampling_factor.
     {
-        unsigned char *ptr;
-        int w = (plane ? (1 + img->d_w) / 2 : img->d_w);
-        int h = (plane ? (1 + img->d_h) / 2 : img->d_h);
-        int r;
+      unsigned int iw = cfg[i - 1].g_w * dsf[i - 1].den + dsf[i - 1].num - 1;
+      unsigned int ih = cfg[i - 1].g_h * dsf[i - 1].den + dsf[i - 1].num - 1;
+      cfg[i].g_w = iw / dsf[i - 1].num;
+      cfg[i].g_h = ih / dsf[i - 1].num;
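+      // This computes ceil(w * den / num): e.g. 1280x720 at level 0 with
+      // dsf = 2/1 gives (1280 * 1 + 2 - 1) / 2 = 640, i.e. 640x360 at level 1.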
+    }
 
-        /* Determine the correct plane based on the image format. The for-loop
-         * always counts in Y,U,V order, but this may not match the order of
-         * the data on disk.
-         */
-        switch (plane)
-        {
-        case 1:
-            ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12? VPX_PLANE_V : VPX_PLANE_U];
+    // Make width & height to be multiplier of 2.
+    if ((cfg[i].g_w) % 2)
+      cfg[i].g_w++;
+
+    if ((cfg[i].g_h) % 2)
+      cfg[i].g_h++;
+  }
+
+  // Open output file for each encoder to output bitstreams
+  for (i = 0; i < kNumEncoders; ++i) {
+    VpxVideoInfo info = {
+      encoder->fourcc,
+      cfg[i].g_w,
+      cfg[i].g_h,
+      {cfg[i].g_timebase.num, cfg[i].g_timebase.den}
+    };
+
+    if (!(writers[i] = vpx_video_writer_open(argv[i+4], kContainerIVF, &info)))
+      die("Failed to open %s for writing", argv[i+4]);
+  }
+
+  // Allocate image for each encoder
+  for (i = 0; i < kNumEncoders; ++i)
+    if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
+      die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
+
+  // Initialize multi-encoder
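+  // vpx_codec_enc_init_multi() sets up all kNumEncoders encoder instances
+  // in a single call, using dsf[] as the per-level down-sampling factors.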
+  if (vpx_codec_enc_init_multi(&codec[0], encoder->codec_interface(), &cfg[0],
+                               kNumEncoders,
+                               show_psnr ? VPX_CODEC_USE_PSNR : 0, &dsf[0]))
+    die_codec(&codec[0], "Failed to initialize encoder");
+
+  // The extra encoding configuration parameters can be set as follows.
+  for (i = 0; i < kNumEncoders; i++) {
+    // Set encoding speed
+    if (vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, -6))
+      die_codec(&codec[i], "Failed to set cpu_used");
+
+    // Set static threshold.
+    if (vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1))
+      die_codec(&codec[i], "Failed to set static threshold");
+
+    // Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING:
+    // enable denoising for the highest-resolution encoder only.
+    if (vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, i == 0))
+      die_codec(&codec[i], "Failed to set noise_sensitivity");
+  }
+
+  frame_avail = 1;
+  got_data = 0;
+
+  while (frame_avail || got_data) {
+    vpx_codec_iter_t iter[kNumEncoders] = {NULL};
+    const vpx_codec_cx_pkt_t *pkt[kNumEncoders];
+
+    frame_avail = vpx_img_read(&raw[0], infile);
+
+    if (frame_avail) {
+      for (i = 1; i < kNumEncoders; ++i) {
+        vpx_image_t *const prev = &raw[i - 1];
+
+        // Scale the image down a number of times by the down-sampling factor.
+        // FilterMode 1 or 2 give better psnr than FilterMode 0.
+        I420Scale(prev->planes[VPX_PLANE_Y], prev->stride[VPX_PLANE_Y],
+                  prev->planes[VPX_PLANE_U], prev->stride[VPX_PLANE_U],
+                  prev->planes[VPX_PLANE_V], prev->stride[VPX_PLANE_V],
+                  prev->d_w, prev->d_h,
+                  raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
+                  raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
+                  raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
+                  raw[i].d_w, raw[i].d_h, 1);
+      }
+    }
+
+    // Encode frame.
+    if (vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
+                         frame_cnt, 1, 0, arg_deadline)) {
+      die_codec(&codec[0], "Failed to encode frame");
+    }
+
+    for (i = kNumEncoders - 1; i >= 0; i--) {
+      got_data = 0;
+
+      while ((pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i]))) {
+        got_data = 1;
+        switch (pkt[i]->kind) {
+          case VPX_CODEC_CX_FRAME_PKT:
+            vpx_video_writer_write_frame(writers[i], pkt[i]->data.frame.buf,
+                                         pkt[i]->data.frame.sz, frame_cnt - 1);
+            break;
+          case VPX_CODEC_PSNR_PKT:
+            if (show_psnr) {
+              int j;
+              psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
+              psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
+              for (j = 0; j < 4; j++)
+                psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
+              psnr_count[i]++;
+            }
             break;
-        case 2:
-            ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12?VPX_PLANE_U : VPX_PLANE_V];
+          default:
             break;
-        default:
-            ptr = img->planes[plane];
         }
+        printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT &&
+               (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY) ? "K" : ".");
+        fflush(stdout);
+      }
+    }
+    frame_cnt++;
+  }
+  printf("\n");
 
-        for (r = 0; r < h; r++)
-        {
-            to_read = w;
+  fclose(infile);
 
-            nbytes = fread(ptr, 1, to_read, f);
-            if(nbytes != to_read) {
-                res = 0;
-                if(nbytes > 0)
-                    printf("Warning: Read partial frame. Check your width & height!\n");
-                break;
-            }
+  printf("Processed %d frames.\n", frame_cnt - 1);
+  for (i = 0; i < kNumEncoders; ++i) {
+    // Calculate PSNR and print it out
+    if (show_psnr && psnr_count[i] > 0) {
+      int j;
+      double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
+                                  psnr_sse_total[i]);
 
-            ptr += img->stride[plane];
-        }
-        if (!res)
-            break;
+      fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
+      fprintf(stderr, " %.3lf", ovpsnr);
+      for (j = 0; j < 4; j++)
+        fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
     }
 
-    return res;
-}
+    if (vpx_codec_destroy(&codec[i]))
+      die_codec(&codec[i], "Failed to destroy codec");
 
-static void write_ivf_file_header(FILE *outfile,
-                                  const vpx_codec_enc_cfg_t *cfg,
-                                  int frame_cnt) {
-    char header[32];
+    vpx_img_free(&raw[i]);
+    vpx_video_writer_close(writers[i]);
+  }
+  printf("\n");
 
-    if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
-        return;
-    header[0] = 'D';
-    header[1] = 'K';
-    header[2] = 'I';
-    header[3] = 'F';
-    mem_put_le16(header+4,  0);                   /* version */
-    mem_put_le16(header+6,  32);                  /* headersize */
-    mem_put_le32(header+8,  fourcc);              /* headersize */
-    mem_put_le16(header+12, cfg->g_w);            /* width */
-    mem_put_le16(header+14, cfg->g_h);            /* height */
-    mem_put_le32(header+16, cfg->g_timebase.den); /* rate */
-    mem_put_le32(header+20, cfg->g_timebase.num); /* scale */
-    mem_put_le32(header+24, frame_cnt);           /* length */
-    mem_put_le32(header+28, 0);                   /* unused */
-
-    (void) fwrite(header, 1, 32, outfile);
-}
-
-static void write_ivf_frame_header(FILE *outfile,
-                                   const vpx_codec_cx_pkt_t *pkt)
-{
-    char             header[12];
-    vpx_codec_pts_t  pts;
-
-    if(pkt->kind != VPX_CODEC_CX_FRAME_PKT)
-        return;
-
-    pts = pkt->data.frame.pts;
-    mem_put_le32(header, pkt->data.frame.sz);
-    mem_put_le32(header+4, pts&0xFFFFFFFF);
-    mem_put_le32(header+8, pts >> 32);
-
-    (void) fwrite(header, 1, 12, outfile);
-}
-
-int main(int argc, char **argv)
-{
-    FILE                *infile, *outfile[NUM_ENCODERS];
-    vpx_codec_ctx_t      codec[NUM_ENCODERS];
-    vpx_codec_enc_cfg_t  cfg[NUM_ENCODERS];
-    vpx_codec_pts_t      frame_cnt = 0;
-    vpx_image_t          raw[NUM_ENCODERS];
-    vpx_codec_err_t      res[NUM_ENCODERS];
-
-    int                  i;
-    long                 width;
-    long                 height;
-    int                  frame_avail;
-    int                  got_data;
-    int                  flags = 0;
-
-    /*Currently, only realtime mode is supported in multi-resolution encoding.*/
-    int                  arg_deadline = VPX_DL_REALTIME;
-
-    /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
-       don't need to know PSNR, which will skip PSNR calculation and save
-       encoding time. */
-    int                  show_psnr = 0;
-    uint64_t             psnr_sse_total[NUM_ENCODERS] = {0};
-    uint64_t             psnr_samples_total[NUM_ENCODERS] = {0};
-    double               psnr_totals[NUM_ENCODERS][4] = {{0,0}};
-    int                  psnr_count[NUM_ENCODERS] = {0};
-
-    /* Set the required target bitrates for each resolution level.
-     * If target bitrate for highest-resolution level is set to 0,
-     * (i.e. target_bitrate[0]=0), we skip encoding at that level.
-     */
-    unsigned int         target_bitrate[NUM_ENCODERS]={1000, 500, 100};
-    /* Enter the frame rate of the input video */
-    int                  framerate = 30;
-    /* Set down-sampling factor for each resolution level.
-       dsf[0] controls down sampling from level 0 to level 1;
-       dsf[1] controls down sampling from level 1 to level 2;
-       dsf[2] is not used. */
-    vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}};
-
-    if(argc!= (5+NUM_ENCODERS))
-        die("Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n",
-            argv[0]);
-
-    printf("Using %s\n",vpx_codec_iface_name(interface));
-
-    width = strtol(argv[1], NULL, 0);
-    height = strtol(argv[2], NULL, 0);
-
-    if(width < 16 || width%2 || height <16 || height%2)
-        die("Invalid resolution: %ldx%ld", width, height);
-
-    /* Open input video file for encoding */
-    if(!(infile = fopen(argv[3], "rb")))
-        die("Failed to open %s for reading", argv[3]);
-
-    /* Open output file for each encoder to output bitstreams */
-    for (i=0; i< NUM_ENCODERS; i++)
-    {
-        if(!target_bitrate[i])
-        {
-            outfile[i] = NULL;
-            continue;
-        }
-
-        if(!(outfile[i] = fopen(argv[i+4], "wb")))
-            die("Failed to open %s for writing", argv[i+4]);
-    }
-
-    show_psnr = strtol(argv[NUM_ENCODERS + 4], NULL, 0);
-
-    /* Populate default encoder configuration */
-    for (i=0; i< NUM_ENCODERS; i++)
-    {
-        res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0);
-        if(res[i]) {
-            printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i]));
-            return EXIT_FAILURE;
-        }
-    }
-
-    /*
-     * Update the default configuration according to needs of the application.
-     */
-    /* Highest-resolution encoder settings */
-    cfg[0].g_w = width;
-    cfg[0].g_h = height;
-    cfg[0].g_threads = 1;                           /* number of threads used */
-    cfg[0].rc_dropframe_thresh = 30;
-    cfg[0].rc_end_usage = VPX_CBR;
-    cfg[0].rc_resize_allowed = 0;
-    cfg[0].rc_min_quantizer = 4;
-    cfg[0].rc_max_quantizer = 56;
-    cfg[0].rc_undershoot_pct = 98;
-    cfg[0].rc_overshoot_pct = 100;
-    cfg[0].rc_buf_initial_sz = 500;
-    cfg[0].rc_buf_optimal_sz = 600;
-    cfg[0].rc_buf_sz = 1000;
-    cfg[0].g_error_resilient = 1;              /* Enable error resilient mode */
-    cfg[0].g_lag_in_frames   = 0;
-
-    /* Disable automatic keyframe placement */
-    /* Note: These 3 settings are copied to all levels. But, except the lowest
-     * resolution level, all other levels are set to VPX_KF_DISABLED internally.
-     */
-    //cfg[0].kf_mode           = VPX_KF_DISABLED;
-    cfg[0].kf_mode           = VPX_KF_AUTO;
-    cfg[0].kf_min_dist = 3000;
-    cfg[0].kf_max_dist = 3000;
-
-    cfg[0].rc_target_bitrate = target_bitrate[0];       /* Set target bitrate */
-    cfg[0].g_timebase.num = 1;                          /* Set fps */
-    cfg[0].g_timebase.den = framerate;
-
-    /* Other-resolution encoder settings */
-    for (i=1; i< NUM_ENCODERS; i++)
-    {
-        memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t));
-
-        cfg[i].g_threads = 1;                       /* number of threads used */
-        cfg[i].rc_target_bitrate = target_bitrate[i];
-
-        /* Note: Width & height of other-resolution encoders are calculated
-         * from the highest-resolution encoder's size and the corresponding
-         * down_sampling_factor.
-         */
-        {
-            unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1;
-            unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1;
-            cfg[i].g_w = iw/dsf[i-1].num;
-            cfg[i].g_h = ih/dsf[i-1].num;
-        }
-
-        /* Make width & height to be multiplier of 2. */
-        // Should support odd size ???
-        if((cfg[i].g_w)%2)cfg[i].g_w++;
-        if((cfg[i].g_h)%2)cfg[i].g_h++;
-    }
-
-    /* Allocate image for each encoder */
-    for (i=0; i< NUM_ENCODERS; i++)
-        if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
-            die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
-
-    if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
-        read_frame_p = read_frame;
-    else
-        read_frame_p = read_frame_by_row;
-
-    for (i=0; i< NUM_ENCODERS; i++)
-        if(outfile[i])
-            write_ivf_file_header(outfile[i], &cfg[i], 0);
-
-    /* Initialize multi-encoder */
-    if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS,
-                                (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0]))
-        die_codec(&codec[0], "Failed to initialize encoder");
-
-    /* The extra encoding configuration parameters can be set as follows. */
-    /* Set encoding speed */
-    for ( i=0; i<NUM_ENCODERS; i++)
-    {
-        int speed = -6;
-        if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
-            die_codec(&codec[i], "Failed to set cpu_used");
-    }
-
-    /* Set static threshold. */
-    for ( i=0; i<NUM_ENCODERS; i++)
-    {
-        unsigned int static_thresh = 1;
-        if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh))
-            die_codec(&codec[i], "Failed to set static threshold");
-    }
-
-    /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
-    /* Enable denoising for the highest-resolution encoder. */
-    if(vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
-        die_codec(&codec[0], "Failed to set noise_sensitivity");
-    for ( i=1; i< NUM_ENCODERS; i++)
-    {
-        if(vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
-            die_codec(&codec[i], "Failed to set noise_sensitivity");
-    }
-
-
-    frame_avail = 1;
-    got_data = 0;
-
-    while(frame_avail || got_data)
-    {
-        vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
-        const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
-
-        flags = 0;
-        frame_avail = read_frame_p(infile, &raw[0]);
-
-        if(frame_avail)
-        {
-            for ( i=1; i<NUM_ENCODERS; i++)
-            {
-                /*Scale the image down a number of times by downsampling factor*/
-                /* FilterMode 1 or 2 give better psnr than FilterMode 0. */
-                I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y],
-                          raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U],
-                          raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V],
-                          raw[i-1].d_w, raw[i-1].d_h,
-                          raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
-                          raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
-                          raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
-                          raw[i].d_w, raw[i].d_h, 1);
-            }
-        }
-
-        /* Encode each frame at multi-levels */
-        if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
-            frame_cnt, 1, flags, arg_deadline))
-            die_codec(&codec[0], "Failed to encode frame");
-
-        for (i=NUM_ENCODERS-1; i>=0 ; i--)
-        {
-            got_data = 0;
-
-            while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) )
-            {
-                got_data = 1;
-                switch(pkt[i]->kind) {
-                    case VPX_CODEC_CX_FRAME_PKT:
-                        write_ivf_frame_header(outfile[i], pkt[i]);
-                        (void) fwrite(pkt[i]->data.frame.buf, 1,
-                                      pkt[i]->data.frame.sz, outfile[i]);
-                    break;
-                    case VPX_CODEC_PSNR_PKT:
-                        if (show_psnr)
-                        {
-                            int j;
-
-                            psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
-                            psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
-                            for (j = 0; j < 4; j++)
-                            {
-                                //fprintf(stderr, "%.3lf ", pkt[i]->data.psnr.psnr[j]);
-                                psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
-                            }
-                            psnr_count[i]++;
-                        }
-
-                        break;
-                    default:
-                        break;
-                }
-                printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT
-                       && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
-                fflush(stdout);
-            }
-        }
-        frame_cnt++;
-    }
-    printf("\n");
-
-    fclose(infile);
-
-    printf("Processed %ld frames.\n",(long int)frame_cnt-1);
-    for (i=0; i< NUM_ENCODERS; i++)
-    {
-        /* Calculate PSNR and print it out */
-        if ( (show_psnr) && (psnr_count[i]>0) )
-        {
-            int j;
-            double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
-                                        psnr_sse_total[i]);
-
-            fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
-
-            fprintf(stderr, " %.3lf", ovpsnr);
-            for (j = 0; j < 4; j++)
-            {
-                fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
-            }
-        }
-
-        if(vpx_codec_destroy(&codec[i]))
-            die_codec(&codec[i], "Failed to destroy codec");
-
-        vpx_img_free(&raw[i]);
-
-        if(!outfile[i])
-            continue;
-
-        /* Try to rewrite the file header with the actual frame count */
-        if(!fseek(outfile[i], 0, SEEK_SET))
-            write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1);
-        fclose(outfile[i]);
-    }
-    printf("\n");
-
-    return EXIT_SUCCESS;
+  return EXIT_SUCCESS;
 }
diff --git a/examples/vp8cx_set_ref.c b/examples/vp8cx_set_ref.c
index 46a40ca..b0961a2 100644
--- a/examples/vp8cx_set_ref.c
+++ b/examples/vp8cx_set_ref.c
@@ -50,7 +50,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 
@@ -65,10 +64,11 @@
   exit(EXIT_FAILURE);
 }
 
-static void encode_frame(vpx_codec_ctx_t *codec,
-                         vpx_image_t *img,
-                         int frame_index,
-                         VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
   const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0,
@@ -77,6 +77,8 @@
     die_codec(codec, "Failed to encode frame");
 
   while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
     if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
       const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
       if (!vpx_video_writer_write_frame(writer,
@@ -90,6 +92,8 @@
       fflush(stdout);
     }
   }
+
+  return got_pkts;
 }
 
 int main(int argc, char **argv) {
@@ -160,6 +164,7 @@
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
     die_codec(&codec, "Failed to initialize encoder");
 
+  // Encode frames.
   while (vpx_img_read(&raw, infile)) {
     if (frame_count + 1 == update_frame_num) {
       vpx_ref_frame_t ref;
@@ -171,7 +176,9 @@
 
     encode_frame(&codec, &raw, frame_count++, writer);
   }
-  encode_frame(&codec, NULL, -1, writer);
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, writer)) {};
 
   printf("\n");
   fclose(infile);
diff --git a/examples/vp9_lossless_encoder.c b/examples/vp9_lossless_encoder.c
new file mode 100644
index 0000000..3fcda0c
--- /dev/null
+++ b/examples/vp9_lossless_encoder.c
@@ -0,0 +1,144 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
+
+#include "./tools_common.h"
+#include "./video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit() {
+  fprintf(stderr, "vp9_lossless_encoder: Example demonstrating VP9 lossless "
+                  "encoding feature. Supports raw input only.\n");
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        int flags,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1,
+                                               flags, VPX_DL_GOOD_QUALITY);
+  if (res != VPX_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+
+  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+      if (!vpx_video_writer_write_frame(writer,
+                                        pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(codec, "Failed to write compressed frame");
+      }
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  vpx_codec_ctx_t codec;
+  vpx_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  VpxVideoInfo info = {0};
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+  const int fps = 30;
+
+  exec_name = argv[0];
+
+  if (argc < 5)
+    die("Invalid number of arguments");
+
+  encoder = get_vpx_encoder_by_name("vp9");
+  if (!encoder)
+     die("Unsupported codec.");
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(argv[1], NULL, 0);
+  info.frame_height = strtol(argv[2], NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                                             info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res)
+    die_codec(&codec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+
+  writer = vpx_video_writer_open(argv[4], kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", argv[4]);
+
+  if (!(infile = fopen(argv[3], "rb")))
+    die("Failed to open %s for reading.", argv[3]);
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
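+  // VP9E_SET_LOSSLESS switches VP9 into its lossless coding mode: the
+  // decoded output is bit-exact with the input, at the cost of a much
+  // higher bitrate.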
+  if (vpx_codec_control_(&codec, VP9E_SET_LOSSLESS, 1))
+    die_codec(&codec, "Failed to use lossless mode");
+
+  // Encode frames.
+  while (vpx_img_read(&raw, infile)) {
+    encode_frame(&codec, &raw, frame_count++, 0, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 0, writer)) {}
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
+
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index 223f37e..7ce3403 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -38,8 +38,10 @@
     ARG_DEF("t", "timebase", 1, "timebase (num/den)");
 static const arg_def_t bitrate_arg = ARG_DEF(
     "b", "target-bitrate", 1, "encoding bitrate, in kilobits per second");
-static const arg_def_t layers_arg =
-    ARG_DEF("l", "layers", 1, "number of SVC layers");
+static const arg_def_t spatial_layers_arg =
+    ARG_DEF("sl", "spatial-layers", 1, "number of spatial SVC layers");
+static const arg_def_t temporal_layers_arg =
+    ARG_DEF("tl", "temporal-layers", 1, "number of temporal SVC layers");
 static const arg_def_t kf_dist_arg =
     ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
 static const arg_def_t scale_factors_arg =
@@ -65,10 +67,11 @@
 
 static const arg_def_t *svc_args[] = {
   &frames_arg,        &width_arg,         &height_arg,
-  &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &layers_arg,
+  &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &spatial_layers_arg,
   &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,  &passes_arg,
   &pass_arg,          &fpf_name_arg,      &min_q_arg,       &max_q_arg,
-  &min_bitrate_arg,   &max_bitrate_arg,   NULL
+  &min_bitrate_arg,   &max_bitrate_arg,   &temporal_layers_arg,
+  NULL
 };
 
 static const uint32_t default_frames_to_skip = 0;
@@ -79,6 +82,7 @@
 static const uint32_t default_timebase_den = 60;
 static const uint32_t default_bitrate = 1000;
 static const uint32_t default_spatial_layers = 5;
+static const uint32_t default_temporal_layers = 1;
 static const uint32_t default_kf_dist = 100;
 
 typedef struct {
@@ -119,6 +123,7 @@
   // initialize SvcContext with parameters that will be passed to vpx_svc_init
   svc_ctx->log_level = SVC_LOG_DEBUG;
   svc_ctx->spatial_layers = default_spatial_layers;
+  svc_ctx->temporal_layers = default_temporal_layers;
 
   // start with default encoder configuration
   res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
@@ -156,8 +161,10 @@
       enc_cfg->rc_target_bitrate = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &skip_frames_arg, argi)) {
       app_input->frames_to_skip = arg_parse_uint(&arg);
-    } else if (arg_match(&arg, &layers_arg, argi)) {
+    } else if (arg_match(&arg, &spatial_layers_arg, argi)) {
       svc_ctx->spatial_layers = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &temporal_layers_arg, argi)) {
+      svc_ctx->temporal_layers = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &kf_dist_arg, argi)) {
       enc_cfg->kf_min_dist = arg_parse_uint(&arg);
       enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
@@ -275,7 +282,7 @@
   int frame_duration = 1; /* 1 timebase tick per frame */
   FILE *infile = NULL;
   int end_of_stream = 0;
-  int frame_size;
+  int frames_received = 0;
 
   memset(&svc_ctx, 0, sizeof(svc_ctx));
   svc_ctx.log_print = 1;
@@ -318,6 +325,8 @@
 
   // Encode frames
   while (!end_of_stream) {
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
     if (frame_cnt >= app_input.frames_to_code || !vpx_img_read(&raw, infile)) {
       // We need one extra vpx_svc_encode call at end of stream to flush
       // encoder and get remaining data
@@ -330,18 +339,34 @@
     if (res != VPX_CODEC_OK) {
       die_codec(&codec, "Failed to encode frame");
     }
-    if (!(app_input.passes == 2 && app_input.pass == 1)) {
-      while ((frame_size = vpx_svc_get_frame_size(&svc_ctx)) > 0) {
-        vpx_video_writer_write_frame(writer,
-                                     vpx_svc_get_buffer(&svc_ctx),
-                                     frame_size, pts);
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
+      switch (cx_pkt->kind) {
+        case VPX_CODEC_CX_FRAME_PKT: {
+          if (cx_pkt->data.frame.sz > 0)
+            vpx_video_writer_write_frame(writer,
+                                         cx_pkt->data.frame.buf,
+                                         cx_pkt->data.frame.sz,
+                                         cx_pkt->data.frame.pts);
+
+          printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
+                 !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
+                 (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+          ++frames_received;
+          break;
+        }
+        case VPX_CODEC_STATS_PKT: {
+          stats_write(&app_input.rc_stats,
+                      cx_pkt->data.twopass_stats.buf,
+                      cx_pkt->data.twopass_stats.sz);
+          break;
+        }
+        default: {
+          break;
+        }
       }
     }
-    if (vpx_svc_get_rc_stats_buffer_size(&svc_ctx) > 0) {
-      stats_write(&app_input.rc_stats,
-                  vpx_svc_get_rc_stats_buffer(&svc_ctx),
-                  vpx_svc_get_rc_stats_buffer_size(&svc_ctx));
-    }
+
     if (!end_of_stream) {
       ++frame_cnt;
       pts += frame_duration;
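With the packet switch above, a two-pass run accumulates VPX_CODEC_STATS_PKT payloads through stats_write() during pass 1. Those bytes are handed back to the encoder when pass 2 is configured; a hedged sketch, where stats_buf and stats_sz are illustrative names for the accumulated first-pass data rather than variables from this example:

#include <stddef.h>
#include "vpx/vpx_encoder.h"

// Sketch: point the second pass at the statistics gathered in the first.
static void configure_second_pass(vpx_codec_enc_cfg_t *cfg,
                                  void *stats_buf, size_t stats_sz) {
  cfg->g_pass = VPX_RC_LAST_PASS;
  cfg->rc_twopass_stats_in.buf = stats_buf;  // first-pass stats bytes
  cfg->rc_twopass_stats_in.sz = stats_sz;
}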
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index be3e7b2..3de983e 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -12,12 +12,12 @@
 //  encoding scheme based on temporal scalability for video applications
 //  that benefit from a scalable bitstream.
 
+#include <assert.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "./vpx_config.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx/vp8cx.h"
@@ -37,7 +37,8 @@
   kDenoiserOff,
   kDenoiserOnYOnly,
   kDenoiserOnYUV,
-  kDenoiserOnYUVAggressive  // Aggressive mode not implemented currently.
+  kDenoiserOnYUVAggressive,
+  kDenoiserOnAdaptive
 };
 
 static int mode_to_num_layers[12] = {1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3};
@@ -437,7 +438,7 @@
 }
 
 int main(int argc, char **argv) {
-  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS];
+  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
   vpx_codec_ctx_t codec;
   vpx_codec_enc_cfg_t cfg;
   int frame_cnt = 0;
@@ -455,7 +456,6 @@
   int layering_mode = 0;
   int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
   int flag_periodicity = 1;
-  int max_intra_size_pct;
   vpx_svc_layer_id_t layer_id = {0, 0};
   const VpxInterface *encoder = NULL;
   FILE *infile = NULL;
@@ -569,6 +569,8 @@
     outfile[i] = vpx_video_writer_open(file_name, kContainerIVF, &info);
     if (!outfile[i])
       die("Failed to open %s for writing", file_name);
+
+    assert(outfile[i] != NULL);
   }
   // No spatial layers in this encoder.
   cfg.ss_number_layers = 1;
@@ -594,11 +596,11 @@
   // This controls the maximum target size of the key frame.
   // For generating smaller key frames, use a smaller max_intra_size_pct
   // value, like 100 or 200.
-  max_intra_size_pct = (int) (((double)cfg.rc_buf_optimal_sz * 0.5)
-      * ((double) cfg.g_timebase.den / cfg.g_timebase.num) / 10.0);
-  // For low-quality key frame.
-  max_intra_size_pct = 200;
-  vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct);
+  {
+    const int max_intra_size_pct = 200;
+    vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+                      max_intra_size_pct);
+  }
 
   frame_avail = 1;
   while (frame_avail || got_data) {
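The deleted arithmetic computed a buffer-derived percentage and then unconditionally overwrote it with 200, so only the constant ever had any effect; the patch keeps the constant and drops the dead expression. VP8E_SET_MAX_INTRA_BITRATE_PCT caps the keyframe target relative to the average per-frame bandwidth; a rough worked example (the exact rate-control internals may differ):

// Hypothetical helper, for illustration only: the keyframe cap in bits.
static int max_keyframe_bits(int target_kbps, int fps, int max_intra_pct) {
  const int avg_frame_bits = target_kbps * 1000 / fps;
  return avg_frame_bits * max_intra_pct / 100;
}
// e.g. 600 kbps at 30 fps gives avg_frame_bits = 20000, so a pct of 200
// caps keyframes near 40000 bits, about two average frames.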
diff --git a/libs.mk b/libs.mk
index 25fbc2c..f9f2d80 100644
--- a/libs.mk
+++ b/libs.mk
@@ -133,6 +133,8 @@
   CODEC_DOC_SECTIONS += vp9 vp9_decoder
 endif
 
+VP9_PREFIX=vp9/
+$(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
 
 ifeq ($(CONFIG_ENCODERS),yes)
   CODEC_DOC_SECTIONS += encoder
@@ -529,7 +531,6 @@
 	@echo "    [CREATE] $@"
 	@rm -f $@
 	@echo "INPUT += $^" >> $@
-	@echo "PREDEFINED = VPX_CODEC_DISABLE_COMPAT" >> $@
 	@echo "INCLUDE_PATH += ." >> $@;
 	@echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@
 
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index a9bb540..0221995 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -38,7 +38,7 @@
     if (video->frame() == 1) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
     } else if (video->frame() == 3) {
-      vpx_active_map_t map = {0};
+      vpx_active_map_t map = vpx_active_map_t();
       uint8_t active_map[9 * 13] = {
         1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
         1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
@@ -57,7 +57,7 @@
       map.active_map = active_map;
       encoder->Control(VP8E_SET_ACTIVEMAP, &map);
     } else if (video->frame() == 15) {
-      vpx_active_map_t map = {0};
+      vpx_active_map_t map = vpx_active_map_t();
       map.cols = (kWidth + 15) / 16;
       map.rows = (kHeight + 15) / 16;
       map.active_map = NULL;
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 5b4a20e..d1e7f1d 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -581,6 +581,8 @@
 
 using std::tr1::make_tuple;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#else
 const ConvolveFunctions convolve8_c(
     vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
     vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,
@@ -600,8 +602,11 @@
     make_tuple(64, 32, &convolve8_c),
     make_tuple(32, 64, &convolve8_c),
     make_tuple(64, 64, &convolve8_c)));
+#endif
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_VP9_HIGHBITDEPTH
+#else
 const ConvolveFunctions convolve8_sse2(
     vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
     vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
@@ -622,6 +627,7 @@
     make_tuple(32, 64, &convolve8_sse2),
     make_tuple(64, 64, &convolve8_sse2)));
 #endif
+#endif
 
 #if HAVE_SSSE3
 const ConvolveFunctions convolve8_ssse3(
@@ -645,27 +651,7 @@
     make_tuple(64, 64, &convolve8_ssse3)));
 #endif
 
-#if HAVE_AVX2
-// TODO(jzern): these prototypes can be removed after the avx2 versions are
-// reenabled in vp9_rtcd_defs.pl.
-extern "C" {
-void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4,
-                             int w, int h);
-void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h);
-void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                        uint8_t *dst, ptrdiff_t dst_stride,
-                        const int16_t *filter_x, int x_step_q4,
-                        const int16_t *filter_y, int y_step_q4,
-                        int w, int h);
-}
-
+#if HAVE_AVX2 && HAVE_SSSE3
 const ConvolveFunctions convolve8_avx2(
     vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
     vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
@@ -676,9 +662,7 @@
     make_tuple(8, 4, &convolve8_avx2),
     make_tuple(4, 8, &convolve8_avx2),
     make_tuple(8, 8, &convolve8_avx2),
-    make_tuple(8, 16, &convolve8_avx2)));
-
-INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, ConvolveTest, ::testing::Values(
+    make_tuple(8, 16, &convolve8_avx2),
     make_tuple(16, 8, &convolve8_avx2),
     make_tuple(16, 16, &convolve8_avx2),
     make_tuple(32, 16, &convolve8_avx2),
@@ -687,7 +671,7 @@
     make_tuple(64, 32, &convolve8_avx2),
     make_tuple(32, 64, &convolve8_avx2),
     make_tuple(64, 64, &convolve8_avx2)));
-#endif
+#endif  // HAVE_AVX2 && HAVE_SSSE3
 
 #if HAVE_NEON_ASM
 const ConvolveFunctions convolve8_neon(
diff --git a/test/cq_test.cc b/test/cq_test.cc
index 7da7b80..4e8019a 100644
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include <cmath>
+#include <map>
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -24,6 +25,28 @@
 
 class CQTest : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  // maps the cq_level to the bitrate produced.
+  typedef std::map<int, uint32_t> BitrateMap;
+
+  static void SetUpTestCase() {
+    bitrates_.clear();
+  }
+
+  static void TearDownTestCase() {
+    ASSERT_TRUE(!HasFailure())
+        << "skipping bitrate validation due to earlier failure.";
+    uint32_t prev_actual_bitrate = kCQTargetBitrate;
+    for (BitrateMap::const_iterator iter = bitrates_.begin();
+         iter != bitrates_.end(); ++iter) {
+      const uint32_t cq_actual_bitrate = iter->second;
+      EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate)
+          << "cq_level: " << iter->first
+          << ", bitrate should decrease with increase in CQ level.";
+      prev_actual_bitrate = cq_actual_bitrate;
+    }
+  }
+
  protected:
   CQTest() : EncoderTest(GET_PARAM(0)), cq_level_(GET_PARAM(1)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
@@ -66,9 +89,12 @@
     return pow(10.0, avg_psnr / 10.0) / file_size_;
   }
 
+  int cq_level() const { return cq_level_; }
   size_t file_size() const { return file_size_; }
   int n_frames() const { return n_frames_; }
 
+  static BitrateMap bitrates_;
+
  private:
   int cq_level_;
   size_t file_size_;
@@ -76,7 +102,8 @@
   int n_frames_;
 };
 
-unsigned int prev_actual_bitrate = kCQTargetBitrate;
+CQTest::BitrateMap CQTest::bitrates_;
+
 TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
@@ -91,8 +118,7 @@
   const unsigned int cq_actual_bitrate =
       static_cast<unsigned int>(file_size()) * 8 * 30 / (n_frames() * 1000);
   EXPECT_LE(cq_actual_bitrate, kCQTargetBitrate);
-  EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate);
-  prev_actual_bitrate = cq_actual_bitrate;
+  bitrates_[cq_level()] = cq_actual_bitrate;
 
   // try targeting the approximate same bitrate with VBR mode
   cfg_.rc_end_usage = VPX_VBR;
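Recording each run's bitrate in the static bitrates_ map, rather than comparing against a file-scope prev_actual_bitrate, makes the monotonicity check independent of the order in which the parameterized tests run: TearDownTestCase() walks the map sorted by cq_level. The observed bitrate itself follows from the 30 fps implied by the timebase set above; as a worked sketch:

#include <cstddef>

// Sketch of the kbps computation used in the test (30 fps assumed from
// the 33333333/1000000000 timebase).
static unsigned int observed_kbps(size_t file_size_bytes, int n_frames) {
  return static_cast<unsigned int>(file_size_bytes) * 8 * 30 /
         (n_frames * 1000);
}
// e.g. 750000 bytes over 300 frames: 6000000 bits * 30 / 300000 = 600 kbps.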
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 8dcf26c..a3d730a 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -42,6 +42,9 @@
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    }
     const vpx_rational_t tb = video->timebase();
     timebase_ = static_cast<double>(tb.num) / tb.den;
     duration_ = 0;
@@ -120,9 +123,40 @@
   double file_datarate_;
   double effective_datarate_;
   size_t bits_in_last_frame_;
+  int denoiser_on_;
 };
 
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestLarge, DenoiserLevels) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  for (int j = 1; j < 5; ++j) {
+    // Run over the denoiser levels.
+    // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
+    // refers to the 4 denoiser modes: kDenoiserOnYOnly, kDenoiserOnYUV,
+    // kDenoiserOnYUVAggressive, and kDenoiserOnAdaptive.
+    // For the spatial denoiser (#if !CONFIG_TEMPORAL_DENOISING), the level j
+    // refers to the blur thresholds: 20, 40, 60, 80.
+    // The j = 0 case (denoiser off) is covered in the tests below.
+    denoiser_on_ = j;
+    cfg_.rc_target_bitrate = 300;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+        << " The datarate for the file missed the target!";
+  }
+}
+
 TEST_P(DatarateTestLarge, BasicBufferModel) {
+  denoiser_on_ = 0;
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_dropframe_thresh = 1;
   cfg_.rc_max_quantizer = 56;
@@ -154,6 +188,7 @@
 }
 
 TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
+  denoiser_on_ = 0;
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_max_quantizer = 36;
   cfg_.rc_end_usage = VPX_CBR;
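The new hook forwards denoiser_on_ through VP8E_SET_NOISE_SENSITIVITY on the first frame, so every datarate test now pins the denoiser state explicitly (0 = off). A minimal usage sketch, assuming a VP8 encoder context initialized as in the tests:

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

// Sketch: select a denoiser level before encoding. Levels 1..4 map to
// kDenoiserOnYOnly .. kDenoiserOnAdaptive from the enum added in
// vpx_temporal_svc_encoder.c (or to blur thresholds for the spatial
// denoiser); 0 disables denoising.
static void set_denoiser_level(vpx_codec_ctx_t *codec, int level) {
  vpx_codec_control(codec, VP8E_SET_NOISE_SENSITIVITY, level);
}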
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index ee417ce..d1ce109 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -20,12 +20,9 @@
 
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 
-extern "C" {
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch);
-}
-
 using libvpx_test::ACMRandom;
 
 namespace {
@@ -258,40 +255,72 @@
   }
 }
 
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
                         int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct16x16Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht16x16Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
 
-void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
+                   int /*tx_type*/) {
   vp9_fdct16x16_c(in, out, stride);
 }
 
-void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+                   int /*tx_type*/) {
   vp9_idct16x16_256_add_c(in, dest, stride);
 }
 
-void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
+                  int tx_type) {
   vp9_fht16x16_c(in, out, stride, tx_type);
 }
 
-void iht16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+                  int tx_type) {
   vp9_iht16x16_256_add_c(in, dest, stride, tx_type);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct16x16_256_add_c(in, out, stride, 10);
+}
+
+void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct16x16_256_add_c(in, out, stride, 12);
+}
+
+void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
+                      int tx_type) {
+  idct16x16_10(in, out, stride);
+}
+
+void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
+                      int tx_type) {
+  idct16x16_12(in, out, stride);
+}
+
+void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht16x16_256_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht16x16_256_add_c(in, out, stride, tx_type, 12);
+}
+#endif
+
 class Trans16x16TestBase {
  public:
   virtual ~Trans16x16TestBase() {}
 
  protected:
-  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
+  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
 
-  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
+  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
 
   void RunAccuracyCheck() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -300,23 +329,48 @@
     const int count_test_block = 10000;
     for (int i = 0; i < count_test_block; ++i) {
       DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
       DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
       DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
 
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
       }
 
       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
                                           test_temp_block, pitch_));
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
 
       for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const uint32_t diff = dst[j] - src[j];
+#endif
         const uint32_t error = diff * diff;
         if (max_error < error)
           max_error = error;
@@ -324,10 +378,10 @@
       }
     }
 
-    EXPECT_GE(1u, max_error)
+    EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
         << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
 
-    EXPECT_GE(count_test_block , total_error)
+    EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
         << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
   }
 
@@ -335,13 +389,13 @@
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 1000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
 
       fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
@@ -357,21 +411,21 @@
     const int count_test_block = 1000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
     DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
       }
       if (i == 0) {
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
+          input_extreme_block[j] = mask_;
       } else if (i == 1) {
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
+          input_extreme_block[j] = -mask_;
       }
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
@@ -381,7 +435,7 @@
       // The minimum quant value is 4.
       for (int j = 0; j < kNumCoeffs; ++j) {
         EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
             << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
       }
     }
@@ -392,39 +446,65 @@
     const int count_test_block = 1000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
     DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
 
     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
+#endif
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        if (bit_depth_ == VPX_BITS_8)
+          input_block[j] = rnd.Rand8() - rnd.Rand8();
+        else
+          input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
       }
       if (i == 0)
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
+          input_extreme_block[j] = mask_;
       if (i == 1)
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
+          input_extreme_block[j] = -mask_;
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
 
       // clear reconstructed pixel buffers
       vpx_memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
       vpx_memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_memset(dst16, 0, kNumCoeffs * sizeof(uint16_t));
+      vpx_memset(ref16, 0, kNumCoeffs * sizeof(uint16_t));
+#endif
 
       // quantization with maximum allowed step sizes
       output_ref_block[0] = (output_ref_block[0] / dc_thred) * dc_thred;
       for (int j = 1; j < kNumCoeffs; ++j)
         output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred;
-      inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
-
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(ref[j], dst[j]);
+      if (bit_depth_ == VPX_BITS_8) {
+        inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,
+                     tx_type_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block,
+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        for (int j = 0; j < kNumCoeffs; ++j)
+          EXPECT_EQ(ref[j], dst[j]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        for (int j = 0; j < kNumCoeffs; ++j)
+          EXPECT_EQ(ref16[j], dst16[j]);
+#endif
+      }
     }
   }
 
@@ -432,28 +512,52 @@
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 1000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
 
     for (int i = 0; i < count_test_block; ++i) {
       double out_r[kNumCoeffs];
 
       // Initialize a test block with input range [-255, 255].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        in[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
       }
 
       reference_16x16_dct_2d(in, out_r);
       for (int j = 0; j < kNumCoeffs; ++j)
         coeff[j] = round(out_r[j]);
 
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            16));
+#endif
+      }
 
       for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const uint32_t diff = dst[j] - src[j];
+#endif
         const uint32_t error = diff * diff;
         EXPECT_GE(1u, error)
             << "Error: 16x16 IDCT has error " << error
@@ -463,6 +567,8 @@
   }
   int pitch_;
   int tx_type_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
   FhtFunc fwd_txfm_ref;
   IhtFunc inv_txfm_ref;
 };
@@ -477,17 +583,34 @@
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_  = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
     pitch_    = 16;
     fwd_txfm_ref = fdct16x16_ref;
     inv_txfm_ref = idct16x16_ref;
+    mask_ = (1 << bit_depth_) - 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+    switch (bit_depth_) {
+      case 10:
+        inv_txfm_ref = idct16x16_10_ref;
+        break;
+      case 12:
+        inv_txfm_ref = idct16x16_12_ref;
+        break;
+      default:
+        inv_txfm_ref = idct16x16_ref;
+        break;
+    }
+#else
+    inv_txfm_ref = idct16x16_ref;
+#endif
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
 
@@ -527,17 +650,34 @@
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_  = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
     pitch_    = 16;
     fwd_txfm_ref = fht16x16_ref;
     inv_txfm_ref = iht16x16_ref;
+    mask_ = (1 << bit_depth_) - 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+    switch (bit_depth_) {
+      case VPX_BITS_10:
+        inv_txfm_ref = iht16x16_10;
+        break;
+      case VPX_BITS_12:
+        inv_txfm_ref = iht16x16_12;
+        break;
+      default:
+        inv_txfm_ref = iht16x16_ref;
+        break;
+    }
+#else
+    inv_txfm_ref = iht16x16_ref;
+#endif
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -565,45 +705,78 @@
 
 using std::tr1::make_tuple;
 
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0)));
+        make_tuple(&vp9_high_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+#endif
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans16x16DCT,
     ::testing::Values(
         make_tuple(&vp9_fdct16x16_c,
-                   &vp9_idct16x16_256_add_neon, 0)));
+                   &vp9_idct16x16_256_add_neon, 0, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16DCT,
     ::testing::Values(
         make_tuple(&vp9_fdct16x16_sse2,
-                   &vp9_idct16x16_256_add_sse2, 0)));
+                   &vp9_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
+                   VPX_BITS_8)));
 #endif
 
-#if HAVE_SSSE3
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSSE3, Trans16x16DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0)));
+        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0,
+                   VPX_BITS_8)));
 #endif
 }  // namespace
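The relaxed bounds in RunAccuracyCheck() follow from amplitude scaling: moving from 8-bit to b-bit input multiplies pixel values by 2^(b-8), so the tolerated squared error per pixel grows by the square of that factor. Concretely:

// Per-pixel round-trip squared-error bound at bit depth b:
//   bound(b) = 1 << (2 * (b - 8))
// i.e. 1 at 8-bit, 16 at 10-bit, 256 at 12-bit. The per-block bound,
// count_test_block << 2 * (bit_depth_ - 8), scales the same way.
static int round_trip_error_bound(int bit_depth) {
  return 1 << (2 * (bit_depth - 8));
}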
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 4f34a44..c7a1931 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -21,6 +21,7 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
@@ -37,7 +38,7 @@
 
 const int kNumCoeffs = 1024;
 const double kPi = 3.141592653589793238462643383279502884;
-void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
+void reference_32x32_dct_1d(const double in[32], double out[32]) {
   const double kInvSqrt2 = 0.707106781186547524400844362104;
   for (int k = 0; k < 32; k++) {
     out[k] = 0.0;
@@ -55,7 +56,7 @@
     double temp_in[32], temp_out[32];
     for (int j = 0; j < 32; ++j)
       temp_in[j] = input[j*32 + i];
-    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    reference_32x32_dct_1d(temp_in, temp_out);
     for (int j = 0; j < 32; ++j)
       output[j * 32 + i] = temp_out[j];
   }
@@ -64,17 +65,28 @@
     double temp_in[32], temp_out[32];
     for (int j = 0; j < 32; ++j)
       temp_in[j] = output[j + i*32];
-    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    reference_32x32_dct_1d(temp_in, temp_out);
     // Scale by some magic number
     for (int j = 0; j < 32; ++j)
       output[j + i * 32] = temp_out[j] / 4;
   }
 }
 
-typedef void (*FwdTxfmFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*InvTxfmFunc)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
 
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int> Trans32x32Param;
+typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
+    Trans32x32Param;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct32x32_1024_add_c(in, out, stride, 10);
+}
+
+void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct32x32_1024_add_c(in, out, stride, 12);
+}
+#endif
 
 class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
  public:
@@ -84,12 +96,16 @@
     inv_txfm_ = GET_PARAM(1);
     version_  = GET_PARAM(2);  // 0: high precision forward transform
                                // 1: low precision version for rd loop
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
 
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
   int version_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
   FwdTxfmFunc fwd_txfm_;
   InvTxfmFunc inv_txfm_;
 };
@@ -100,23 +116,47 @@
   int64_t total_error = 0;
   const int count_test_block = 1000;
   DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
 
   for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-255, 255].
+    // Initialize a test block with input range [-mask_, mask_].
     for (int j = 0; j < kNumCoeffs; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-      test_input_block[j] = src[j] - dst[j];
+      if (bit_depth_ == VPX_BITS_8) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        src16[j] = rnd.Rand16() & mask_;
+        dst16[j] = rnd.Rand16() & mask_;
+        test_input_block[j] = src16[j] - dst16[j];
+#endif
+      }
     }
 
     ASM_REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
-    ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+    if (bit_depth_ == VPX_BITS_8) {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block,
+                                         CONVERT_TO_BYTEPTR(dst16), 32));
+#endif
+    }
 
     for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      const uint32_t diff =
+          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
       const uint32_t diff = dst[j] - src[j];
+#endif
       const uint32_t error = diff * diff;
       if (max_error < error)
         max_error = error;
@@ -129,10 +169,10 @@
     total_error /= 45;
   }
 
-  EXPECT_GE(1u, max_error)
+  EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
       << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
 
-  EXPECT_GE(count_test_block, total_error)
+  EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
       << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
 }
 
@@ -141,12 +181,12 @@
   const int count_test_block = 1000;
 
   DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
 
   for (int i = 0; i < count_test_block; ++i) {
     for (int j = 0; j < kNumCoeffs; ++j)
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
 
     const int stride = 32;
     vp9_fdct32x32_c(input_block, output_ref_block, stride);
@@ -170,21 +210,21 @@
 
   DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
 
   for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-255, 255].
+    // Initialize a test block with input range [-mask_, mask_].
     for (int j = 0; j < kNumCoeffs; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+      input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_;
     }
     if (i == 0) {
       for (int j = 0; j < kNumCoeffs; ++j)
-        input_extreme_block[j] = 255;
+        input_extreme_block[j] = mask_;
     } else if (i == 1) {
       for (int j = 0; j < kNumCoeffs; ++j)
-        input_extreme_block[j] = -255;
+        input_extreme_block[j] = -mask_;
     }
 
     const int stride = 32;
@@ -201,9 +241,9 @@
         EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
             << "Error: 32x32 FDCT rd has mismatched coefficients";
       }
-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
+      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_ref_block[j]))
           << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
           << "Error: 32x32 FDCT has coefficient larger than "
           << "4*DCT_MAX_VALUE";
     }
@@ -214,26 +254,49 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
   DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
 
   for (int i = 0; i < count_test_block; ++i) {
     double out_r[kNumCoeffs];
 
     // Initialize a test block with input range [-255, 255]
     for (int j = 0; j < kNumCoeffs; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-      in[j] = src[j] - dst[j];
+      if (bit_depth_ == VPX_BITS_8) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        src16[j] = rnd.Rand16() & mask_;
+        dst16[j] = rnd.Rand16() & mask_;
+        in[j] = src16[j] - dst16[j];
+#endif
+      }
     }
 
     reference_32x32_dct_2d(in, out_r);
     for (int j = 0; j < kNumCoeffs; ++j)
       coeff[j] = round(out_r[j]);
-    ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+    if (bit_depth_ == VPX_BITS_8) {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));
+#endif
+    }
     for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int diff =
+          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
       const int diff = dst[j] - src[j];
+#endif
       const int error = diff * diff;
       EXPECT_GE(1, error)
           << "Error: 32x32 IDCT has error " << error
@@ -244,39 +307,59 @@
 
 using std::tr1::make_tuple;
 
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans32x32Test,
     ::testing::Values(
-        make_tuple(&vp9_fdct32x32_c, &vp9_idct32x32_1024_add_c, 0),
-        make_tuple(&vp9_fdct32x32_rd_c, &vp9_idct32x32_1024_add_c, 1)));
+        make_tuple(&vp9_high_fdct32x32_c,
+                   &idct32x32_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fdct32x32_rd_c,
+                   &idct32x32_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_high_fdct32x32_c,
+                   &idct32x32_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_high_fdct32x32_rd_c,
+                   &idct32x32_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fdct32x32_rd_c,
+                   &vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fdct32x32_rd_c,
+                   &vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+#endif
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans32x32Test,
     ::testing::Values(
         make_tuple(&vp9_fdct32x32_c,
-                   &vp9_idct32x32_1024_add_neon, 0),
+                   &vp9_idct32x32_1024_add_neon, 0, VPX_BITS_8),
         make_tuple(&vp9_fdct32x32_rd_c,
-                   &vp9_idct32x32_1024_add_neon, 1)));
+                   &vp9_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans32x32Test,
     ::testing::Values(
         make_tuple(&vp9_fdct32x32_sse2,
-                   &vp9_idct32x32_1024_add_sse2, 0),
+                   &vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
         make_tuple(&vp9_fdct32x32_rd_sse2,
-                   &vp9_idct32x32_1024_add_sse2, 1)));
+                   &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
 #endif
 
-#if HAVE_AVX2
+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     AVX2, Trans32x32Test,
     ::testing::Values(
         make_tuple(&vp9_fdct32x32_avx2,
-                   &vp9_idct32x32_1024_add_sse2, 0),
+                   &vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
         make_tuple(&vp9_fdct32x32_rd_avx2,
-                   &vp9_idct32x32_1024_add_sse2, 1)));
+                   &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
 #endif
 }  // namespace
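Dropping the unused stride argument does not change what reference_32x32_dct_1d() computes. Judging from the kInvSqrt2 constant and the loop shown above, it is the unnormalized DCT-II with the DC term scaled down:

out_k = c_k \sum_{n=0}^{31} in_n \cos\frac{(2n+1)k\pi}{64}, \qquad
c_0 = \frac{1}{\sqrt{2}}, \quad c_k = 1 \text{ for } k > 0,

applied along columns and then rows, with the final division by 4 (the "magic number" in the source) apparently chosen so the reference output lands on the same scale as the fixed-point vp9_fdct32x32_c it is checked against.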
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 11529b3..5a71140 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -74,7 +74,7 @@
   libvpx_test::WebMVideoSource video(video_name);
   video.Init();
 
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   cfg.threads = threads;
   libvpx_test::VP9Decoder decoder(cfg, 0);
 
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 99610eb..0ef4f7b 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -106,7 +106,7 @@
 }
 
 void DecoderTest::RunLoop(CompressedVideoSource *video) {
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
   RunLoop(video, dec_cfg);
 }
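The switch from = {0} to vpx_codec_dec_cfg_t() here and in the other tests leaves behavior unchanged but is warning-friendlier: both forms zero every member in C++, yet the braced form explicitly initializes only the first member and can trip -Wmissing-field-initializers once stricter flags such as the -Wextra added in libs.mk are in play. A small illustration:

#include "vpx/vpx_decoder.h"

void init_cfg_examples() {
  vpx_codec_dec_cfg_t braced = {0};                   // may warn under -Wextra
  vpx_codec_dec_cfg_t valued = vpx_codec_dec_cfg_t(); // value-initialized; no warning
  (void)braced;
  (void)valued;
}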
 
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 1f73c7d..a757b59 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -125,20 +125,20 @@
                        const vpx_codec_dec_cfg_t &dec_cfg);
 
   // Hook to be called before decompressing every frame.
-  virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
-                                  Decoder *decoder) {}
+  virtual void PreDecodeFrameHook(const CompressedVideoSource& /*video*/,
+                                  Decoder* /*decoder*/) {}
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const CompressedVideoSource& /* video */,
+                                  const CompressedVideoSource& /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
     return VPX_CODEC_OK == res_dec;
   }
 
   // Hook to be called on every decompressed frame.
-  virtual void DecompressedFrameHook(const vpx_image_t& img,
-                                     const unsigned int frame_number) {}
+  virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+                                     const unsigned int /*frame_number*/) {}
 
   // Hook to be called on peek result
   virtual void HandlePeekResult(Decoder* const decoder,
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 6d4281d..9702ddf 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -133,13 +133,13 @@
   return match;
 }
 
-void EncoderTest::MismatchHook(const vpx_image_t *img1,
-                               const vpx_image_t *img2) {
+void EncoderTest::MismatchHook(const vpx_image_t* /*img1*/,
+                               const vpx_image_t* /*img2*/) {
   ASSERT_TRUE(0) << "Encode/Decode mismatch found";
 }
 
 void EncoderTest::RunLoop(VideoSource *video) {
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
 
   stats_.Reset();
 
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 2270ce2..a77bd64 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -189,20 +189,21 @@
   virtual void RunLoop(VideoSource *video);
 
   // Hook to be called at the beginning of a pass.
-  virtual void BeginPassHook(unsigned int pass) {}
+  virtual void BeginPassHook(unsigned int /*pass*/) {}
 
   // Hook to be called at the end of a pass.
   virtual void EndPassHook() {}
 
   // Hook to be called before encoding a frame.
-  virtual void PreEncodeFrameHook(VideoSource *video) {}
-  virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {}
+  virtual void PreEncodeFrameHook(VideoSource* /*video*/) {}
+  virtual void PreEncodeFrameHook(VideoSource* /*video*/,
+                                  Encoder* /*encoder*/) {}
 
   // Hook to be called on every compressed data packet.
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {}
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
 
   // Hook to be called on every PSNR packet.
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
 
   // Hook to determine whether the encode loop should continue.
   virtual bool Continue() const {
@@ -218,19 +219,19 @@
                             const vpx_image_t *img2);
 
   // Hook to be called on every decompressed frame.
-  virtual void DecompressedFrameHook(const vpx_image_t& img,
-                                     vpx_codec_pts_t pts) {}
+  virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+                                     vpx_codec_pts_t /*pts*/) {}
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const VideoSource& /* video */,
+                                  const VideoSource& /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
     return VPX_CODEC_OK == res_dec;
   }
 
   // Hook that can modify the encoder's output data
-  virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook(
+  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
       const vpx_codec_cx_pkt_t *pkt) {
     return pkt;
   }
diff --git a/test/examples.sh b/test/examples.sh
index 7ba9cce..39f7e39 100755
--- a/test/examples.sh
+++ b/test/examples.sh
@@ -15,7 +15,7 @@
 example_tests=$(ls $(dirname $0)/*.sh)
 
 # List of script names to exclude.
-exclude_list="examples vpxdec vpxenc tools_common"
+exclude_list="examples tools_common"
 
 # Filter out the scripts in $exclude_list.
 for word in ${exclude_list}; do
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index fb0449d..44eba33 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -285,7 +285,7 @@
     video_->Init();
     video_->Begin();
 
-    vpx_codec_dec_cfg_t cfg = {0};
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
     ASSERT_TRUE(decoder_ != NULL);
   }
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 7c48260..f803c8e 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -20,46 +20,71 @@
 
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 
-extern "C" {
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *output, int pitch);
-}
-
 using libvpx_test::ACMRandom;
 
 namespace {
 const int kNumCoeffs = 16;
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
                         int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht4x4Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
 
-void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int tx_type) {
   vp9_fdct4x4_c(in, out, stride);
 }
 
-void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
   vp9_fht4x4_c(in, out, stride, tx_type);
 }
 
-void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int tx_type) {
   vp9_fwht4x4_c(in, out, stride);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct4x4_16_add_c(in, out, stride, 10);
+}
+
+void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct4x4_16_add_c(in, out, stride, 12);
+}
+
+void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht4x4_16_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht4x4_16_add_c(in, out, stride, tx_type, 12);
+}
+
+void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_iwht4x4_16_add_c(in, out, stride, 10);
+}
+
+void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_iwht4x4_16_add_c(in, out, stride, 12);
+}
+#endif
+
 class Trans4x4TestBase {
  public:
   virtual ~Trans4x4TestBase() {}
 
  protected:
-  virtual void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) = 0;
+  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
 
-  virtual void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) = 0;
+  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
 
   void RunAccuracyCheck(int limit) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -68,23 +93,47 @@
     const int count_test_block = 10000;
     for (int i = 0; i < count_test_block; ++i) {
       DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
       DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
       DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
 
       // Initialize a test block with input range [-255, 255].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
       }
 
       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
                                           test_temp_block, pitch_));
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block,
+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
 
       for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const uint32_t diff = dst[j] - src[j];
+#endif
         const uint32_t error = diff * diff;
         if (max_error < error)
           max_error = error;
@@ -105,13 +154,13 @@
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 5000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
 
       fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
       ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
@@ -127,21 +176,21 @@
     const int count_test_block = 5000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
     DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
       }
       if (i == 0) {
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
+          input_extreme_block[j] = mask_;
       } else if (i == 1) {
         for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
+          input_extreme_block[j] = -mask_;
       }
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
@@ -151,8 +200,8 @@
       // The minimum quant value is 4.
       for (int j = 0; j < kNumCoeffs; ++j) {
         EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
-            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
+            << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
       }
     }
   }
@@ -161,24 +210,48 @@
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 1000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        in[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
       }
 
       fwd_txfm_ref(in, coeff, pitch_, tx_type_);
 
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
 
       for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const uint32_t diff = dst[j] - src[j];
+#endif
         const uint32_t error = diff * diff;
         EXPECT_GE(static_cast<uint32_t>(limit), error)
             << "Error: 4x4 IDCT has error " << error
@@ -190,6 +263,8 @@
   int pitch_;
   int tx_type_;
   FhtFunc fwd_txfm_ref;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
 };
 
 class Trans4x4DCT
@@ -204,14 +279,16 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 4;
     fwd_txfm_ref = fdct4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
 
@@ -247,15 +324,17 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 4;
     fwd_txfm_ref = fht4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride, tx_type_);
   }
 
-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -291,14 +370,16 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 4;
     fwd_txfm_ref = fwht4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
 
@@ -323,57 +404,95 @@
 }
 using std::tr1::make_tuple;
 
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0)));
+        make_tuple(&vp9_high_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0)));
+        make_tuple(&vp9_high_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#endif
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans4x4DCT,
     ::testing::Values(
         make_tuple(&vp9_fdct4x4_c,
-                   &vp9_idct4x4_16_add_neon, 0)));
+                   &vp9_idct4x4_16_add_neon, 0, VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     DISABLED_NEON, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
 #endif
 
-#if CONFIG_USE_X86INC && HAVE_MMX
+#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     MMX, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0)));
+        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
     ::testing::Values(
         make_tuple(&vp9_fdct4x4_sse2,
-                   &vp9_idct4x4_16_add_sse2, 0)));
+                   &vp9_idct4x4_16_add_sse2, 0, VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
 #endif
 
 }  // namespace
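
The bit-depth parameterization above follows one pattern throughout this
file: mask_ = (1 << bit_depth_) - 1 clamps random samples to the legal pixel
range, and 16-bit pixel buffers reach the inverse transform through
CONVERT_TO_BYTEPTR. A minimal sketch of the sample-generation half of that
pattern (the RandomResidual name is hypothetical; ACMRandom is the test RNG
used above):

    // Produce one residual sample for the given bit depth.
    int16_t RandomResidual(ACMRandom *rnd, vpx_bit_depth_t bit_depth) {
      const int mask = (1 << bit_depth) - 1;  // 255, 1023 or 4095.
      if (bit_depth == VPX_BITS_8)
        return rnd->Rand8() - rnd->Rand8();  // Range [-255, 255].
      return (rnd->Rand16() & mask) - (rnd->Rand16() & mask);
    }
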
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 567e5f6..5455534 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -20,45 +20,96 @@
 
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 
-extern "C" {
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch);
+const int kNumCoeffs = 64;
+const double kPi = 3.141592653589793238462643383279502884;
+void reference_8x8_dct_1d(const double in[8], double out[8], int stride) {
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int k = 0; k < 8; k++) {
+    out[k] = 0.0;
+    for (int n = 0; n < 8; n++)
+      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 16.0);
+    if (k == 0)
+      out[k] = out[k] * kInvSqrt2;
+  }
+}
+
+void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
+                          double output[kNumCoeffs]) {
+  // First transform columns
+  for (int i = 0; i < 8; ++i) {
+    double temp_in[8], temp_out[8];
+    for (int j = 0; j < 8; ++j)
+      temp_in[j] = input[j*8 + i];
+    reference_8x8_dct_1d(temp_in, temp_out, 1);
+    for (int j = 0; j < 8; ++j)
+      output[j * 8 + i] = temp_out[j];
+  }
+  // Then transform rows
+  for (int i = 0; i < 8; ++i) {
+    double temp_in[8], temp_out[8];
+    for (int j = 0; j < 8; ++j)
+      temp_in[j] = output[j + i*8];
+    reference_8x8_dct_1d(temp_in, temp_out, 1);
+    // Scale by 2 to match the output scaling of vp9_fdct8x8_c.
+    for (int j = 0; j < 8; ++j)
+      output[j + i * 8] = temp_out[j] * 2;
+  }
 }
 
 using libvpx_test::ACMRandom;
 
 namespace {
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
                         int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct8x8Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht8x8Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
 
-void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
   vp9_fdct8x8_c(in, out, stride);
 }
 
-void fht8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
   vp9_fht8x8_c(in, out, stride, tx_type);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct8x8_64_add_c(in, out, stride, 10);
+}
+
+void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct8x8_64_add_c(in, out, stride, 12);
+}
+
+void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht8x8_64_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht8x8_64_add_c(in, out, stride, tx_type, 12);
+}
+#endif
+
 class FwdTrans8x8TestBase {
  public:
   virtual ~FwdTrans8x8TestBase() {}
 
  protected:
-  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
-  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
+  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
+  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
 
   void RunSignBiasCheck() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_output_block, 64);
     int count_sign_block[64][2];
     const int count_test_block = 100000;
 
@@ -67,7 +118,8 @@
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-255, 255].
       for (int j = 0; j < 64; ++j)
-        test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+        test_input_block[j] = ((rnd.Rand16() >> (16 - bit_depth_)) & mask_) -
+                              ((rnd.Rand16() >> (16 - bit_depth_)) & mask_);
       ASM_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_output_block, pitch_));
 
@@ -82,7 +134,7 @@
     for (int j = 0; j < 64; ++j) {
       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
       const int max_diff = 1125;
-      EXPECT_LT(diff, max_diff)
+      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
           << "Error: 8x8 FDCT/FHT has a sign bias > "
           << 1. * max_diff / count_test_block * 100 << "%"
           << " for input range [-255, 255] at index " << j
@@ -111,7 +163,7 @@
     for (int j = 0; j < 64; ++j) {
       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
       const int max_diff = 10000;
-      EXPECT_LT(diff, max_diff)
+      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
           << "Error: 4x4 FDCT/FHT has a sign bias > "
           << 1. * max_diff / count_test_block * 100 << "%"
           << " for input range [-15, 15] at index " << j
@@ -127,16 +179,28 @@
     int total_error = 0;
     const int count_test_block = 100000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, 64);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, 64);
+#endif
 
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-255, 255].
       for (int j = 0; j < 64; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
       }
 
       ASM_REGISTER_STATE_CHECK(
@@ -152,11 +216,23 @@
             test_temp_block[j] *= 4;
           }
       }
-      ASM_REGISTER_STATE_CHECK(
-          RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
 
       for (int j = 0; j < 64; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const int diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const int diff = dst[j] - src[j];
+#endif
         const int error = diff * diff;
         if (max_error < error)
           max_error = error;
@@ -164,11 +240,11 @@
       }
     }
 
-    EXPECT_GE(1, max_error)
+    EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
       << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"
       << " roundtrip error > 1";
 
-    EXPECT_GE(count_test_block/5, total_error)
+    EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)
       << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
       << "error > 1/5 per block";
   }
@@ -180,37 +256,68 @@
     int total_coeff_error = 0;
     const int count_test_block = 100000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, ref_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_temp_block, 64);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, 64);
+#endif
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
       for (int j = 0; j < 64; ++j) {
-        if (i == 0) {
-          src[j] = 255;
-          dst[j] = 0;
-        } else if (i == 1) {
-          src[j] = 0;
-          dst[j] = 255;
+        if (bit_depth_ == VPX_BITS_8) {
+          if (i == 0) {
+            src[j] = 255;
+            dst[j] = 0;
+          } else if (i == 1) {
+            src[j] = 0;
+            dst[j] = 255;
+          } else {
+            src[j] = rnd.Rand8() % 2 ? 255 : 0;
+            dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+          }
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
         } else {
-          src[j] = rnd.Rand8() % 2 ? 255 : 0;
-          dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+          if (i == 0) {
+            src16[j] = mask_;
+            dst16[j] = 0;
+          } else if (i == 1) {
+            src16[j] = 0;
+            dst16[j] = mask_;
+          } else {
+            src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+            dst16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+          }
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
         }
-
-        test_input_block[j] = src[j] - dst[j];
       }
 
       ASM_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_temp_block, pitch_));
       ASM_REGISTER_STATE_CHECK(
           fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, tx_type_));
-      ASM_REGISTER_STATE_CHECK(
-          RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
 
       for (int j = 0; j < 64; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const int diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
         const int diff = dst[j] - src[j];
+#endif
         const int error = diff * diff;
         if (max_error < error)
           max_error = error;
@@ -220,11 +327,11 @@
         total_coeff_error += abs(coeff_diff);
       }
 
-      EXPECT_GE(1, max_error)
+      EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has"
           << "an individual roundtrip error > 1";
 
-      EXPECT_GE(count_test_block/5, total_error)
+      EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)
           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
           << " roundtrip error > 1/5 per block";
 
@@ -234,9 +341,97 @@
     }
   }
 
+  void RunInvAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8() % 2 ? 255 : 0;
+          dst[j] = src[j] > 0 ? 0 : 255;
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+          dst16[j] = src16[j] > 0 ? 0 : mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      reference_8x8_dct_2d(in, out_r);
+      for (int j = 0; j < kNumCoeffs; ++j)
+        coeff[j] = round(out_r[j]);
+
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;
+        EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
+            << "Error: 8x8 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+
+  void RunFwdAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_r, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < kNumCoeffs; ++j)
+        in[j] = rnd.Rand8() % 2 == 0 ? mask_ : -mask_;
+
+      RunFwdTxfm(in, coeff, pitch_);
+      reference_8x8_dct_2d(in, out_r);
+      for (int j = 0; j < kNumCoeffs; ++j)
+        coeff_r[j] = round(out_r[j]);
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        const uint32_t diff = coeff[j] - coeff_r[j];
+        const uint32_t error = diff * diff;
+        EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
+            << "Error: 8x8 DCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
   int pitch_;
   int tx_type_;
   FhtFunc fwd_txfm_ref;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
 };
 
 class FwdTrans8x8DCT
@@ -251,15 +446,17 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 8;
     fwd_txfm_ref = fdct8x8_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
 
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
 
@@ -279,6 +476,14 @@
   RunExtremalCheck();
 }
 
+TEST_P(FwdTrans8x8DCT, FwdAccuracyCheck) {
+  RunFwdAccuracyCheck();
+}
+
+TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) {
+  RunInvAccuracyCheck();
+}
+
 class FwdTrans8x8HT
     : public FwdTrans8x8TestBase,
       public ::testing::TestWithParam<Ht8x8Param> {
@@ -291,15 +496,17 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 8;
     fwd_txfm_ref = fht8x8_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
   }
 
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -321,50 +528,81 @@
 
 using std::tr1::make_tuple;
 
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0)));
+        make_tuple(&vp9_high_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+#endif
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     NEON, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0)));
+        make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0,
+                   VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     DISABLED_NEON, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3)));
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0)));
+        make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0,
+                   VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
 #endif
 
-#if HAVE_SSSE3 && ARCH_X86_64
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSSE3, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
+        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0,
+                   VPX_BITS_8)));
 #endif
 }  // namespace
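
The reference_8x8_dct_1d() added above is a direct evaluation of the type-II
DCT; in LaTeX form (N = 8, hence the 16 in the cosine argument):

    X_k = c_k \sum_{n=0}^{7} x_n \cos\frac{\pi (2n+1) k}{16},
    \qquad c_0 = \frac{1}{\sqrt{2}}, \quad c_k = 1 \text{ for } k > 0.

The bit-depth-dependent tolerances follow the same rule everywhere in this
file: the bound 1 << 2 * (bit_depth_ - 8) is 1 at 8 bits, 16 at 10 bits and
256 at 12 bits, i.e. the squared-error budget grows as 4^(bit_depth_ - 8)
because each extra bit doubles the signal amplitude.
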
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index 2400c20..1c9a522 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -27,7 +27,7 @@
   }
 
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource &video,
+                                  const libvpx_test::VideoSource& /*video*/,
                                   libvpx_test::Decoder *decoder) {
     EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
     return !::testing::Test::HasFailure();
@@ -72,7 +72,13 @@
   // one for each lag in frames (for 2 pass), and then one for each possible
   // reference buffer (8) - we can end up with up to 30 buffers of roughly this
   // size or almost 1 gig of memory.
+  // In total the allocations will exceed 2GiB, which may cause a failure
+  // with mingw + wine; use a smaller size in that case.
+#if defined(_WIN32) && !defined(_WIN64)
+  video.SetSize(4096, 3072);
+#else
   video.SetSize(4096, 4096);
+#endif
   video.set_limit(2);
   expected_res_ = VPX_CODEC_OK;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
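
For scale: each raw I420 frame at 4096x4096 occupies w * h * 3 / 2, about
24 MiB, so the ~30 buffers mentioned in the comment account for roughly
720 MiB on their own before the encoder's other allocations push a 32-bit
process past its limit. A standalone sketch of that arithmetic (not part of
the test):

    #include <cstdio>

    int main() {
      const long long kBuffers = 30;
      const long long full = 4096LL * 4096 * 3 / 2 * kBuffers;   // 754974720
      const long long win32 = 4096LL * 3072 * 3 / 2 * kBuffers;  // 566231040
      std::printf("4096x4096: %lld bytes, 4096x3072: %lld bytes\n",
                  full, win32);
      return 0;
    }
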
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index 5f4c33a..f488cb4 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -109,7 +109,8 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 10000;
   for (int i = 0; i < count_test_block; ++i) {
-    int16_t input[64], coeff[64];
+    int16_t input[64];
+    tran_low_t coeff[64];
     double output_r[64];
     uint8_t dst[64], src[64];
 
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index ead4760..f0d9c34 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -294,6 +294,11 @@
                         ::testing::Values(
                             vp8_build_intra_predictors_mby_s_ssse3));
 #endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_neon));
+#endif
 
 typedef void (*IntraPredUvFunc)(MACROBLOCKD *x,
                                 uint8_t *uabove_row,
@@ -382,5 +387,10 @@
                         ::testing::Values(
                             vp8_build_intra_predictors_mbuv_s_ssse3));
 #endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_neon));
+#endif
 
 }  // namespace
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 0a1c17c..b61d490 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -73,7 +73,7 @@
   void RunTest() {
     const DecodeParam input = GET_PARAM(1);
     libvpx_test::CompressedVideoSource *video = NULL;
-    vpx_codec_dec_cfg_t cfg = {0};
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     cfg.threads = input.threads;
     const std::string filename = input.filename;
 
@@ -113,9 +113,14 @@
 const DecodeParam kVP9InvalidFileTests[] = {
   {1, "invalid-vp90-02-v2.webm"},
   {1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"},
-  {1, "invalid-vp90-03-v2.webm"},
+  {1, "invalid-vp90-03-v3.webm"},
   {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"},
   {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},
+  {1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf"},
+  {1, "invalid-vp91-2-mixedrefcsp-444to420.ivf"},
+  {1, "invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf"},
 };
 
 VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
@@ -126,9 +131,9 @@
 class InvalidFileInvalidPeekTest : public InvalidFileTest {
  protected:
   InvalidFileInvalidPeekTest() : InvalidFileTest() {}
-  virtual void HandlePeekResult(libvpx_test::Decoder *const decoder,
-                                libvpx_test::CompressedVideoSource *video,
-                                const vpx_codec_err_t res_peek) {}
+  virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
+                                libvpx_test::CompressedVideoSource* /*video*/,
+                                const vpx_codec_err_t /*res_peek*/) {}
 };
 
 TEST_P(InvalidFileInvalidPeekTest, ReturnCode) {
@@ -144,6 +149,10 @@
 
 const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
   {4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm"},
+  {4, "invalid-"
+      "vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf"},
+  {2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf"},
+  {4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf"},
 };
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/test/md5_helper.h b/test/md5_helper.h
index dc95582..1db712b 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -28,7 +28,8 @@
       // plane, we never want to round down and thus skip a pixel so if
       // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
       // This works only for chroma_shift of 0 and 1.
-      const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+      const int bytes_per_sample =
+          (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
       const int h = plane ? (img->d_h + img->y_chroma_shift) >>
                     img->y_chroma_shift : img->d_h;
       const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
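
The rounding described in the comment matters only for odd dimensions:
(d + shift) >> shift rounds up when shift is 1 and is the identity when
shift is 0. A minimal sketch (the chroma_dim name is hypothetical):

    // Valid for chroma shifts of 0 and 1, per the comment above.
    int chroma_dim(int d, int shift) {
      return (d + shift) >> shift;
    }
    // chroma_dim(35, 1) == 18 -- the last chroma row is kept, not skipped.
    // chroma_dim(34, 1) == 17 and chroma_dim(35, 0) == 35, as expected.
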
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 15f4e6c..9c24fee 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -26,8 +26,8 @@
 using libvpx_test::ACMRandom;
 
 namespace {
-typedef void (*FwdTxfmFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*InvTxfmFunc)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
 typedef std::tr1::tuple<FwdTxfmFunc,
                         InvTxfmFunc,
                         InvTxfmFunc,
@@ -74,8 +74,8 @@
       FAIL() << "Wrong Size!";
       break;
   }
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block1, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block2, kMaxNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
 
@@ -83,7 +83,7 @@
   const int block_size = size * size;
 
   DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kMaxNumCoeffs);
 
   int max_error = 0;
   for (int i = 0; i < count_test_block; ++i) {
@@ -153,8 +153,8 @@
       FAIL() << "Wrong Size!";
       break;
   }
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block1, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block2, kMaxNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
   const int count_test_block = 1000;
@@ -229,6 +229,7 @@
                    &vp9_idct4x4_16_add_c,
                    &vp9_idct4x4_1_add_c,
                    TX_4X4, 1)));
+
 #if HAVE_NEON_ASM
 INSTANTIATE_TEST_CASE_P(
     NEON, PartialIDctTest,
@@ -259,7 +260,7 @@
                    TX_4X4, 1)));
 #endif
 
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSE2, PartialIDctTest,
     ::testing::Values(
@@ -293,7 +294,7 @@
                    TX_4X4, 1)));
 #endif
 
-#if HAVE_SSSE3 && ARCH_X86_64
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSSE3_64, PartialIDctTest,
     ::testing::Values(
@@ -303,7 +304,7 @@
                    TX_8X8, 12)));
 #endif
 
-#if HAVE_SSSE3
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSSE3, PartialIDctTest,
     ::testing::Values(
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 8d08f1e..9d0c570 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -211,8 +211,8 @@
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
 #if WRITE_COMPRESSED_STREAM
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -222,8 +222,8 @@
     // Write frame header and data.
     write_ivf_frame_header(pkt, outfile_);
     (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
-#endif
   }
+#endif
 
   double frame0_psnr_;
 #if WRITE_COMPRESSED_STREAM
diff --git a/test/sad_test.cc b/test/sad_test.cc
index e63770b..5377c1e 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -505,21 +505,6 @@
 INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
 #endif  // CONFIG_VP8_ENCODER
 
-#if CONFIG_VP9_ENCODER
-const SadMxNVp9Func sad_16x16_mmx_vp9 = vp9_sad16x16_mmx;
-const SadMxNVp9Func sad_8x16_mmx_vp9 = vp9_sad8x16_mmx;
-const SadMxNVp9Func sad_16x8_mmx_vp9 = vp9_sad16x8_mmx;
-const SadMxNVp9Func sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;
-const SadMxNVp9Func sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;
-const SadMxNVp9Param mmx_vp9_tests[] = {
-  make_tuple(16, 16, sad_16x16_mmx_vp9),
-  make_tuple(8, 16, sad_8x16_mmx_vp9),
-  make_tuple(16, 8, sad_16x8_mmx_vp9),
-  make_tuple(8, 8, sad_8x8_mmx_vp9),
-  make_tuple(4, 4, sad_4x4_mmx_vp9),
-};
-INSTANTIATE_TEST_CASE_P(MMX, SADVP9Test, ::testing::ValuesIn(mmx_vp9_tests));
-#endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_MMX
 
 #if HAVE_SSE
diff --git a/test/set_maps.sh b/test/set_maps.sh
new file mode 100755
index 0000000..e7c8d43
--- /dev/null
+++ b/test/set_maps.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx set_maps example. To add new tests to this file,
+##  do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to set_maps_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required, and set_maps must exist in
+# $LIBVPX_BIN_PATH.
+set_maps_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ -z "$(vpx_tool_path set_maps)" ]; then
+    elog "set_maps not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# Runs set_maps using the codec specified by $1.
+set_maps() {
+  local encoder="$(vpx_tool_path set_maps)"
+  local codec="$1"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/set_maps_${codec}.ivf"
+
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+set_maps_vp8() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    set_maps vp8 || return 1
+  fi
+}
+
+set_maps_vp9() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    set_maps vp9 || return 1
+  fi
+}
+
+set_maps_tests="set_maps_vp8
+                set_maps_vp9"
+
+run_tests set_maps_verify_environment "${set_maps_tests}"
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 6619fb1..ff42725 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -105,7 +105,7 @@
 INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
                         ::testing::Values(vp8_subtract_b_c));
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
                         ::testing::Values(vp8_subtract_b_neon));
 #endif
diff --git a/test/svc_test.cc b/test/svc_test.cc
index e9cf38d..5035dee 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -13,6 +13,9 @@
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
 #include "test/i420_video_source.h"
+
+#include "vp9/decoder/vp9_decoder.h"
+
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
@@ -21,6 +24,7 @@
 
 using libvpx_test::CodecFactory;
 using libvpx_test::Decoder;
+using libvpx_test::DxDataIterator;
 using libvpx_test::VP9CodecFactory;
 
 class SvcTest : public ::testing::Test {
@@ -56,15 +60,311 @@
     codec_enc_.kf_min_dist = 100;
     codec_enc_.kf_max_dist = 100;
 
-    vpx_codec_dec_cfg_t dec_cfg = {0};
+    vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
     VP9CodecFactory codec_factory;
     decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
   }
 
   virtual void TearDown() {
-    vpx_svc_release(&svc_);
+    ReleaseEncoder();
     delete(decoder_);
+  }
+
+  void InitializeEncoder() {
+    const vpx_codec_err_t res =
+        vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+    vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4);  // Make the test faster
+    codec_initialized_ = true;
+  }
+
+  void ReleaseEncoder() {
+    vpx_svc_release(&svc_);
     if (codec_initialized_) vpx_codec_destroy(&codec_);
+    codec_initialized_ = false;
+  }
+
+  void GetStatsData(std::string *const stats_buf) {
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
+      if (cx_pkt->kind == VPX_CODEC_STATS_PKT) {
+        EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U);
+        ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL);
+        stats_buf->append(static_cast<char*>(cx_pkt->data.twopass_stats.buf),
+                          cx_pkt->data.twopass_stats.sz);
+      }
+    }
+  }
+
+  void Pass1EncodeNFrames(const int n, const int layers,
+                          std::string *const stats_buf) {
+    vpx_codec_err_t res;
+
+    ASSERT_GT(n, 0);
+    ASSERT_GT(layers, 0);
+    svc_.spatial_layers = layers;
+    codec_enc_.g_pass = VPX_RC_FIRST_PASS;
+    InitializeEncoder();
+
+    libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+                                       codec_enc_.g_timebase.den,
+                                       codec_enc_.g_timebase.num, 0, 30);
+    video.Begin();
+
+    for (int i = 0; i < n; ++i) {
+      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+                           video.duration(), VPX_DL_GOOD_QUALITY);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+      GetStatsData(stats_buf);
+      video.Next();
+    }
+
+    // Flush encoder and test EOS packet.
+    res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    ASSERT_EQ(VPX_CODEC_OK, res);
+    GetStatsData(stats_buf);
+
+    ReleaseEncoder();
+  }
+
+  void StoreFrames(const size_t max_frame_received,
+                   struct vpx_fixed_buf *const outputs,
+                   size_t *const frame_received) {
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
+      if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+        const size_t frame_size = cx_pkt->data.frame.sz;
+
+        EXPECT_GT(frame_size, 0U);
+        ASSERT_TRUE(cx_pkt->data.frame.buf != NULL);
+        ASSERT_LT(*frame_received, max_frame_received);
+
+        if (*frame_received == 0)
+          EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY));
+
+        outputs[*frame_received].buf = malloc(frame_size + 16);
+        ASSERT_TRUE(outputs[*frame_received].buf != NULL);
+        memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf,
+               frame_size);
+        outputs[*frame_received].sz = frame_size;
+        ++(*frame_received);
+      }
+    }
+  }
+
+  void Pass2EncodeNFrames(std::string *const stats_buf,
+                          const int n, const int layers,
+                          struct vpx_fixed_buf *const outputs) {
+    vpx_codec_err_t res;
+    size_t frame_received = 0;
+
+    ASSERT_TRUE(outputs != NULL);
+    ASSERT_GT(n, 0);
+    ASSERT_GT(layers, 0);
+    svc_.spatial_layers = layers;
+    codec_enc_.rc_target_bitrate = 500;
+    if (codec_enc_.g_pass == VPX_RC_LAST_PASS) {
+      ASSERT_TRUE(stats_buf != NULL);
+      ASSERT_GT(stats_buf->size(), 0U);
+      codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0];
+      codec_enc_.rc_twopass_stats_in.sz = stats_buf->size();
+    }
+    InitializeEncoder();
+
+    libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+                                       codec_enc_.g_timebase.den,
+                                       codec_enc_.g_timebase.num, 0, 30);
+    video.Begin();
+
+    for (int i = 0; i < n; ++i) {
+      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+                           video.duration(), VPX_DL_GOOD_QUALITY);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+      StoreFrames(n, outputs, &frame_received);
+      video.Next();
+    }
+
+    // Flush encoder.
+    res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+    StoreFrames(n, outputs, &frame_received);
+
+    EXPECT_EQ(frame_received, static_cast<size_t>(n));
+
+    ReleaseEncoder();
+  }
+
+  void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) {
+    int decoded_frames = 0;
+    int received_frames = 0;
+
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(n, 0);
+
+    for (int i = 0; i < n; ++i) {
+      ASSERT_TRUE(inputs[i].buf != NULL);
+      ASSERT_GT(inputs[i].sz, 0U);
+      const vpx_codec_err_t res_dec =
+          decoder_->DecodeFrame(static_cast<const uint8_t *>(inputs[i].buf),
+                                inputs[i].sz);
+      ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+      ++decoded_frames;
+
+      DxDataIterator dec_iter = decoder_->GetDxData();
+      while (dec_iter.Next() != NULL) {
+        ++received_frames;
+      }
+    }
+    EXPECT_EQ(decoded_frames, n);
+    EXPECT_EQ(received_frames, n);
+  }
+
+  void DropLayersAndMakeItVP9Comaptible(struct vpx_fixed_buf *const inputs,
+                                        const int num_super_frames,
+                                        const int remained_spatial_layers,
+                                        const bool is_multiple_frame_contexts) {
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(num_super_frames, 0);
+    ASSERT_GT(remained_spatial_layers, 0);
+
+    for (int i = 0; i < num_super_frames; ++i) {
+      uint32_t frame_sizes[8] = {0};
+      int frame_count = 0;
+      int frames_found = 0;
+      int frame;
+      ASSERT_TRUE(inputs[i].buf != NULL);
+      ASSERT_GT(inputs[i].sz, 0U);
+
+      vpx_codec_err_t res =
+          vp9_parse_superframe_index(static_cast<const uint8_t*>(inputs[i].buf),
+                                     inputs[i].sz, frame_sizes, &frame_count,
+                                     NULL, NULL);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+
+      if (frame_count == 0) {
+        // There's no super frame but only a single frame.
+        ASSERT_EQ(1, remained_spatial_layers);
+        if (is_multiple_frame_contexts) {
+          // Make a new superframe.
+          uint8_t marker = 0xc1;
+          unsigned int mask;
+          int mag;
+
+          // Choose the magnitude.
+          for (mag = 0, mask = 0xff; mag < 4; ++mag) {
+            if (inputs[i].sz < mask)
+              break;
+            mask <<= 8;
+            mask |= 0xff;
+          }
+          marker |= mag << 3;
+          int index_sz = 2 + (mag + 1) * 2;
+
+          inputs[i].buf = realloc(inputs[i].buf, inputs[i].sz + index_sz + 16);
+          ASSERT_TRUE(inputs[i].buf != NULL);
+          uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
+          frame_data[0] &= ~2;      // Set the show_frame flag to 0.
+          frame_data += inputs[i].sz;
+          // Add a one-byte frame with show_existing_frame.
+          *frame_data++ = 0x88;
+
+          // Write the super frame index.
+          *frame_data++ = marker;
+
+          frame_sizes[0] = inputs[i].sz;
+          frame_sizes[1] = 1;
+          for (int j = 0; j < 2; ++j) {
+            unsigned int this_sz = frame_sizes[j];
+            for (int k = 0; k <= mag; k++) {
+              *frame_data++ = this_sz & 0xff;
+              this_sz >>= 8;
+            }
+          }
+          *frame_data++ = marker;
+          inputs[i].sz += index_sz + 1;
+        }
+      } else {
+        // Found a super frame.
+        uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
+        uint8_t *frame_start = frame_data;
+        for (frame = 0; frame < frame_count; ++frame) {
+          // Look for a visible frame.
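+          // The show_frame flag is bit 1 of the first uncompressed-header
+          // byte.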
+          if (frame_data[0] & 0x02) {
+            ++frames_found;
+            if (frames_found == remained_spatial_layers)
+              break;
+          }
+          frame_data += frame_sizes[frame];
+        }
+        ASSERT_LT(frame, frame_count) << "Couldn't find a visible frame. "
+            << "remained_spatial_layers: " << remained_spatial_layers
+            << "    super_frame: " << i
+            << "    is_multiple_frame_context: " << is_multiple_frame_contexts;
+        if (frame == frame_count - 1 && !is_multiple_frame_contexts)
+          continue;
+
+        frame_data += frame_sizes[frame];
+
+        // We need to add one more frame for multiple frame contexts.
+        if (is_multiple_frame_contexts)
+          ++frame;
+        uint8_t marker =
+            static_cast<const uint8_t*>(inputs[i].buf)[inputs[i].sz - 1];
+        const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+        const size_t index_sz = 2 + mag * frame_count;
+        const size_t new_index_sz = 2 + mag * (frame + 1);
+        marker &= 0xf8;
+        marker |= frame;
+
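+        // The old index trails the frame data: marker, frame_count sizes of
+        // mag bytes each, then the marker again. Move the sizes for the
+        // retained frames up to just after the truncated frame data.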
+        // Copy existing frame sizes.
+        memmove(frame_data + (is_multiple_frame_contexts ? 2 : 1),
+                frame_start + inputs[i].sz - index_sz + 1, new_index_sz - 2);
+        if (is_multiple_frame_contexts) {
+          // Add a one-byte frame with the show_existing_frame flag set.
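+          // The low three bits set frame_to_show_map_idx to
+          // remained_spatial_layers - 1.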
+          *frame_data++ = 0x88 | (remained_spatial_layers - 1);
+        }
+        // New marker.
+        frame_data[0] = marker;
+        frame_data += (mag * (frame + 1) + 1);
+
+        if (is_multiple_frame_contexts) {
+          // Write the frame size for the one-byte frame.
+          frame_data -= mag;
+          *frame_data++ = 1;
+          for (uint32_t j = 1; j < mag; ++j) {
+            *frame_data++ = 0;
+          }
+        }
+
+        *frame_data++ = marker;
+        inputs[i].sz = frame_data - frame_start;
+
+        if (is_multiple_frame_contexts) {
+          // Clear the show_frame flag on all retained frames.
+          for (int j = 0; j < frame; ++j) {
+            frame_start[0] &= ~2;
+            frame_start += frame_sizes[j];
+          }
+        }
+      }
+    }
+  }
+
+  void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) {
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(n, 0);
+
+    for (int i = 0; i < n; ++i) {
+      free(inputs[i].buf);
+      inputs[i].buf = NULL;
+      inputs[i].sz = 0;
+    }
   }
 
   SvcContext svc_;
@@ -93,9 +393,7 @@
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   svc_.spatial_layers = 0;  // use default layers
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
   EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
 }
 
@@ -106,9 +404,7 @@
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");  // valid scale values
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, InvalidOptions) {
@@ -122,20 +418,17 @@
 }
 
 TEST_F(SvcTest, SetLayersOption) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=3");
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
   EXPECT_EQ(3, svc_.spatial_layers);
 }
 
 TEST_F(SvcTest, SetMultipleOptions) {
   vpx_codec_err_t res =
-      vpx_svc_set_options(&svc_, "layers=2 scale-factors=1/3,2/3");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+      vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
   EXPECT_EQ(2, svc_.spatial_layers);
 }
 
@@ -149,9 +442,7 @@
 
   res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, SetQuantizersOption) {
@@ -162,9 +453,7 @@
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   vpx_svc_set_options(&svc_, "quantizers=40,45");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, SetAutoAltRefOption) {
@@ -180,9 +469,7 @@
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, SetQuantizers) {
@@ -200,9 +487,7 @@
 
   res = vpx_svc_set_quantizers(&svc_, "40,30");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, SetScaleFactors) {
@@ -220,121 +505,25 @@
 
   res = vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 // Test that decoder can handle an SVC frame as the first frame in a sequence.
-TEST_F(SvcTest, FirstFrameHasLayers) {
-  svc_.spatial_layers = 2;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-
-  libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
-                                     codec_enc_.g_timebase.den,
-                                     codec_enc_.g_timebase.num, 0, 30);
-  video.Begin();
-
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-
-  if (vpx_svc_get_frame_size(&svc_) == 0) {
-    // Flush encoder
-    res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
-                         video.duration(), VPX_DL_GOOD_QUALITY);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-  }
-
-  int frame_size = vpx_svc_get_frame_size(&svc_);
-  EXPECT_GT(frame_size, 0);
-  const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-
-  // this test fails with a decoder error
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+TEST_F(SvcTest, OnePassEncodeOneFrame) {
+  codec_enc_.g_pass = VPX_RC_ONE_PASS;
+  vpx_fixed_buf output = {0};
+  Pass2EncodeNFrames(NULL, 1, 2, &output);
+  DecodeNFrames(&output, 1);
+  FreeBitstreamBuffers(&output, 1);
 }
 
-TEST_F(SvcTest, EncodeThreeFrames) {
-  svc_.spatial_layers = 2;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
-  int decoded_frames = 0;
-  vpx_codec_err_t res_dec;
-  int frame_size;
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-
-  libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
-                                     codec_enc_.g_timebase.den,
-                                     codec_enc_.g_timebase.num, 0, 30);
-  // FRAME 0
-  video.Begin();
-  // This frame is a keyframe.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  // FRAME 1
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  // FRAME 2
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  // Flush encoder
-  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-
-  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  EXPECT_EQ(decoded_frames, 3);
+TEST_F(SvcTest, OnePassEncodeThreeFrames) {
+  codec_enc_.g_pass = VPX_RC_ONE_PASS;
+  vpx_fixed_buf outputs[3];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 3);
+  FreeBitstreamBuffers(&outputs[0], 3);
 }
 
 TEST_F(SvcTest, GetLayerResolution) {
@@ -342,14 +531,11 @@
   vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
   vpx_svc_set_quantizers(&svc_, "40,30");
 
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 
   // ensure that requested layer is a valid layer
   uint32_t layer_width, layer_height;
-  res = vpx_svc_get_layer_resolution(&svc_, svc_.spatial_layers,
+  vpx_codec_err_t res = vpx_svc_get_layer_resolution(&svc_, svc_.spatial_layers,
                                      &layer_width, &layer_height);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
@@ -373,131 +559,313 @@
   EXPECT_EQ(kHeight * 8 / 16, layer_height);
 }
 
-TEST_F(SvcTest, TwoPassEncode) {
+TEST_F(SvcTest, TwoPassEncode10Frames) {
   // First pass encode
   std::string stats_buf;
-  svc_.spatial_layers = 2;
-  codec_enc_.g_pass = VPX_RC_FIRST_PASS;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-
-  libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
-                                     codec_enc_.g_timebase.den,
-                                     codec_enc_.g_timebase.num, 0, 30);
-  // FRAME 0
-  video.Begin();
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  size_t stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-  EXPECT_GT(stats_size, 0U);
-  const char *stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-  ASSERT_TRUE(stats_data != NULL);
-  stats_buf.append(stats_data, stats_size);
-
-  // FRAME 1
-  video.Next();
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-  EXPECT_GT(stats_size, 0U);
-  stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-  ASSERT_TRUE(stats_data != NULL);
-  stats_buf.append(stats_data, stats_size);
-
-  // Flush encoder and test EOS packet
-  res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-  EXPECT_GT(stats_size, 0U);
-  stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-  ASSERT_TRUE(stats_data != NULL);
-  stats_buf.append(stats_data, stats_size);
-
-  // Tear down encoder
-  vpx_svc_release(&svc_);
-  vpx_codec_destroy(&codec_);
+  Pass1EncodeNFrames(10, 2, &stats_buf);
 
   // Second pass encode
-  int decoded_frames = 0;
-  vpx_codec_err_t res_dec;
-  int frame_size;
   codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(20, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
   vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-  codec_enc_.rc_twopass_stats_in.buf = &stats_buf[0];
-  codec_enc_.rc_twopass_stats_in.sz = stats_buf.size();
+  vpx_fixed_buf outputs[20];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 20);
+  FreeBitstreamBuffers(&outputs[0], 20);
+}
 
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
 
-  // FRAME 0
-  video.Begin();
-  // This frame is a keyframe.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 1, false);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
 
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 5, &stats_buf);
 
-  // FRAME 1
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]);
 
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+  DecodeNFrames(&outputs[0], 10);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 4, false);
+  DecodeNFrames(&outputs[0], 10);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 3, false);
+  DecodeNFrames(&outputs[0], 10);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 2, false);
+  DecodeNFrames(&outputs[0], 10);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 1, false);
+  DecodeNFrames(&outputs[0], 10);
 
-  // FRAME 2
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
 
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+TEST_F(SvcTest, TwoPassEncode2SNRLayers) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+  Pass1EncodeNFrames(20, 2, &stats_buf);
 
-  // Flush encoder
-  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
-                       video.duration(), VPX_DL_GOOD_QUALITY);
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_,
+                      "auto-alt-refs=1,1 scale-factors=1/1,1/1");
+  vpx_fixed_buf outputs[20];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 20);
+  FreeBitstreamBuffers(&outputs[0], 20);
+}
+
+TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+  Pass1EncodeNFrames(20, 3, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_,
+                      "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1");
+  vpx_fixed_buf outputs[20];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]);
+  DecodeNFrames(&outputs[0], 20);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 20, 2, false);
+  DecodeNFrames(&outputs[0], 20);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 20, 1, false);
+  DecodeNFrames(&outputs[0], 20);
+
+  FreeBitstreamBuffers(&outputs[0], 20);
+}
+
+TEST_F(SvcTest, SetMultipleFrameContextsOption) {
+  svc_.spatial_layers = 5;
+  vpx_codec_err_t res =
+      vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
   EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
+  svc_.spatial_layers = 2;
+  res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
+  InitializeEncoder();
+}
+
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 2, true);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaseLayer) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 1, true);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 2, true);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layers) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+  Pass1EncodeNFrames(10, 3, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]);
+
+  vpx_fixed_buf outputs_new[10];
+  for (int i = 0; i < 10; ++i) {
+    outputs_new[i].buf = malloc(outputs[i].sz + 16);
+    ASSERT_TRUE(outputs_new[i].buf != NULL);
+    memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+    outputs_new[i].sz = outputs[i].sz;
   }
+  DropLayersAndMakeItVP9Compatible(&outputs_new[0], 10, 3, true);
+  DecodeNFrames(&outputs_new[0], 10);
 
-  EXPECT_EQ(decoded_frames, 3);
+  for (int i = 0; i < 10; ++i) {
+    memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+    outputs_new[i].sz = outputs[i].sz;
+  }
+  DropLayersAndMakeItVP9Compatible(&outputs_new[0], 10, 2, true);
+  DecodeNFrames(&outputs_new[0], 10);
+
+  for (int i = 0; i < 10; ++i) {
+    memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+    outputs_new[i].sz = outputs[i].sz;
+  }
+  DropLayersAndMakeItVP9Compatible(&outputs_new[0], 10, 1, true);
+  DecodeNFrames(&outputs_new[0], 10);
+
+  FreeBitstreamBuffers(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs_new[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayers) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 1, true);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+
+  vpx_fixed_buf base_layer[5];
+  for (int i = 0; i < 5; ++i)
+    base_layer[i] = outputs[i * 2];
+
+  DecodeNFrames(&base_layer[0], 5);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+  DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 1, true);
+
+  vpx_fixed_buf base_layer[5];
+  for (int i = 0; i < 5; ++i)
+    base_layer[i] = outputs[i * 2];
+
+  DecodeNFrames(&base_layer[0], 5);
+  FreeBitstreamBuffers(&outputs[0], 10);
 }
 
 }  // namespace
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index ee6289f..e6114ab 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -10,8 +10,8 @@
 25751f5d3b05ff03f0719ad42cd625348eb8961e  invalid-vp90-01-v2.webm.res
 d78e2fceba5ac942246503ec8366f879c4775ca5  invalid-vp90-02-v2.webm
 8e2eff4af87d2b561cce2365713269e301457ef3  invalid-vp90-02-v2.webm.res
-df1a1453feb3c00d7d89746c7003b4163523bff3  invalid-vp90-03-v2.webm
-25dd58c22d23f75304d7ce7f69f4e5b02ef9119a  invalid-vp90-03-v2.webm.res
+df1a1453feb3c00d7d89746c7003b4163523bff3  invalid-vp90-03-v3.webm
+4935c62becc68c13642a03db1e6d3e2331c1c612  invalid-vp90-03-v3.webm.res
 d637297561dd904eb2c97a9015deeb31c4a1e8d2  invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
 3a204bdbeaa3c6458b77bcebb8366d107267f55d  invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
 a432f96ff0a787268e2f94a8092ab161a18d1b06  park_joy_90p_10_420.y4m
@@ -681,3 +681,23 @@
 c77e4a26616add298a05dd5d12397be22c0e40c5  vp90-2-18-resize.ivf
 c12918cf0a716417fba2de35c3fc5ab90e52dfce  vp90-2-18-resize.ivf.md5
 717da707afcaa1f692ff1946f291054eb75a4f06  screendata.y4m
+b7c1296630cdf1a7ef493d15ff4f9eb2999202f6  invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8  invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
+fac89b5735be8a86b0dc05159f996a5c3208ae32  invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8  invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
+4506dfdcdf8ee4250924b075a0dcf1f070f72e5a  invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
+bcdedaf168ac225575468fda77502d2dc9fd5baa  invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
+65e93f9653bcf65b022f7d225268d1a90a76e7bb  vp90-2-19-skip.webm
+368dccdde5288c13c25695d2eacdc7402cadf613  vp90-2-19-skip.webm.md5
+ffe460282df2b0e7d4603c2158653ad96f574b02  vp90-2-19-skip-01.webm
+bd21bc9eda4a4a36b221d71ede3a139fc3c7bd85  vp90-2-19-skip-01.webm.md5
+b03c408cf23158638da18dbc3323b99a1635c68a  invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8  invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+5e67e24e7f53fd189e565513cef8519b1bd6c712  invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+741158f67c0d9d23726624d06bdc482ad368afc9  invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+8b1f7bf7e86c0976d277f60e8fcd9539e75a079a  invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
+9c6bdf048fb2e66f07d4b4db5b32e6f303bd6109  invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
+552e372e9b78127389fb06b34545df2cec15ba6d  invalid-vp91-2-mixedrefcsp-444to420.ivf
+a61774cf03fc584bd9f0904fc145253bb8ea6c4c  invalid-vp91-2-mixedrefcsp-444to420.ivf.res
+812d05a64a0d83c1b504d0519927ddc5a2cdb273  invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
+1e472baaf5f6113459f0399a38a5a5e68d17799d  invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
diff --git a/test/test.mk b/test/test.mk
index 0814c2b..b92b6da 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -785,6 +785,10 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
 
@@ -793,16 +797,32 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res
 
 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # BBB VP9 streams
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index dbdbdd6..cccebf8 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -181,7 +181,8 @@
   "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
   "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
   "vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm",
-  "vp90-2-18-resize.ivf", "vp91-2-04-yuv444.webm",
+  "vp90-2-18-resize.ivf", "vp90-2-19-skip.webm",
+  "vp90-2-19-skip-01.webm", "vp91-2-04-yuv444.webm",
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
 #endif  // CONFIG_VP9_DECODER
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index d714452..b9f879d 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -29,7 +29,7 @@
         md5_inv_order_(),
         n_tiles_(GET_PARAM(1)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
-    vpx_codec_dec_cfg_t cfg;
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     cfg.w = 704;
     cfg.h = 144;
     cfg.threads = 1;
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 0bfefba..15c7bce 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -144,6 +144,24 @@
   fi
 }
 
+# Echoes the path to $1 when it exists and is executable in ${LIBVPX_BIN_PATH}
+# or its parent directory, or an empty string otherwise. The caller is
+# responsible for testing the result.
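+# Example use: decoder="$(vpx_tool_path vpxdec)"
+#              [ -n "${decoder}" ] || return 1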
+vpx_tool_path() {
+  local readonly tool_name="$1"
+  local tool_path="${LIBVPX_BIN_PATH}/${tool_name}${VPX_TEST_EXE_SUFFIX}"
+  if [ ! -x "${tool_path}" ]; then
+    # Try one directory up: when running via examples.sh the tool could be in
+    # the parent directory of $LIBVPX_BIN_PATH.
+    tool_path="${LIBVPX_BIN_PATH}/../${tool_name}${VPX_TEST_EXE_SUFFIX}"
+  fi
+
+  if [ ! -x "${tool_path}" ]; then
+    tool_path=""
+  fi
+  echo "${tool_path}"
+}
+
 # Echoes yes to stdout when the file named by positional parameter one exists
 # in LIBVPX_BIN_PATH, and is executable.
 vpx_tool_available() {
diff --git a/test/user_priv_test.cc b/test/user_priv_test.cc
index 22fce85..8512d88 100644
--- a/test/user_priv_test.cc
+++ b/test/user_priv_test.cc
@@ -47,7 +47,7 @@
   libvpx_test::WebMVideoSource video(filename);
   video.Init();
 
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   libvpx_test::VP9Decoder decoder(cfg, 0);
 
   libvpx_test::MD5 md5;
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 7d81182..f76402e 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -35,6 +35,14 @@
 using ::std::tr1::tuple;
 using libvpx_test::ACMRandom;
 
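+// Reference implementation for the sum-of-squares functions under test:
+// sums src[i]^2 over a 16x16 block of int16 samples.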
+static unsigned int mb_ss_ref(const int16_t *src) {
+  unsigned int res = 0;
+  for (int i = 0; i < 256; ++i) {
+    res += src[i] * src[i];
+  }
+  return res;
+}
+
 static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
                                  int l2w, int l2h, unsigned int *sse_ptr) {
   int se = 0;
@@ -76,6 +84,50 @@
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
 
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+
+class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
+ public:
+  SumOfSquaresTest() : func_(GetParam()) {}
+
+  virtual ~SumOfSquaresTest() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void ConstTest();
+  void RefTest();
+
+  SumOfSquaresFunction func_;
+  ACMRandom rnd_;
+};
+
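+// For a block filled with the constant v, the sum of squares is 256 * v * v.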
+void SumOfSquaresTest::ConstTest() {
+  int16_t mem[256];
+  unsigned int res;
+  for (int v = 0; v < 256; ++v) {
+    for (int i = 0; i < 256; ++i) {
+      mem[i] = v;
+    }
+    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    EXPECT_EQ(256u * (v * v), res);
+  }
+}
+
+void SumOfSquaresTest::RefTest() {
+  int16_t mem[256];
+  for (int i = 0; i < 100; ++i) {
+    for (int j = 0; j < 256; ++j) {
+      mem[j] = rnd_.Rand8() - rnd_.Rand8();
+    }
+
+    const unsigned int expected = mb_ss_ref(mem);
+    unsigned int res;
+    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    EXPECT_EQ(expected, res);
+  }
+}
+
 template<typename VarianceFunctionType>
 class VarianceTest
     : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
@@ -88,7 +140,7 @@
     height_ = 1 << log2height_;
     variance_ = get<2>(params);
 
-    rnd(ACMRandom::DeterministicSeed());
+    rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
     src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
     ref_ = new uint8_t[block_size_];
@@ -107,7 +159,7 @@
   void RefTest();
   void OneQuarterTest();
 
-  ACMRandom rnd;
+  ACMRandom rnd_;
   uint8_t* src_;
   uint8_t* ref_;
   int width_, log2width_;
@@ -135,8 +187,8 @@
 void VarianceTest<VarianceFunctionType>::RefTest() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size_; j++) {
-      src_[j] = rnd.Rand8();
-      ref_[j] = rnd.Rand8();
+      src_[j] = rnd_.Rand8();
+      ref_[j] = rnd_.Rand8();
     }
     unsigned int sse1, sse2;
     unsigned int var1;
@@ -206,7 +258,7 @@
     height_ = 1 << log2height_;
     subpel_variance_ = get<2>(params);
 
-    rnd(ACMRandom::DeterministicSeed());
+    rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
     src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
     sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
@@ -226,7 +278,7 @@
  protected:
   void RefTest();
 
-  ACMRandom rnd;
+  ACMRandom rnd_;
   uint8_t *src_;
   uint8_t *ref_;
   uint8_t *sec_;
@@ -241,10 +293,10 @@
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
       for (int j = 0; j < block_size_; j++) {
-        src_[j] = rnd.Rand8();
+        src_[j] = rnd_.Rand8();
       }
       for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-        ref_[j] = rnd.Rand8();
+        ref_[j] = rnd_.Rand8();
       }
       unsigned int sse1, sse2;
       unsigned int var1;
@@ -263,11 +315,11 @@
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
       for (int j = 0; j < block_size_; j++) {
-        src_[j] = rnd.Rand8();
-        sec_[j] = rnd.Rand8();
+        src_[j] = rnd_.Rand8();
+        sec_[j] = rnd_.Rand8();
       }
       for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-        ref_[j] = rnd.Rand8();
+        ref_[j] = rnd_.Rand8();
       }
       unsigned int sse1, sse2;
       unsigned int var1;
@@ -362,6 +414,13 @@
 namespace vp9 {
 
 #if CONFIG_VP9_ENCODER
+
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+
+INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
+                        ::testing::Values(vp9_get_mb_ss_c));
+
 typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
 typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
 typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
@@ -485,23 +544,12 @@
                       make_tuple(6, 5, subpel_avg_variance64x32_c),
                       make_tuple(6, 6, subpel_avg_variance64x64_c)));
 
-#if HAVE_MMX
-const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
-const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;
-const vp9_variance_fn_t variance8x16_mmx = vp9_variance8x16_mmx;
-const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx;
-const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx;
-INSTANTIATE_TEST_CASE_P(
-    MMX, VP9VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_mmx),
-                      make_tuple(3, 3, variance8x8_mmx),
-                      make_tuple(3, 4, variance8x16_mmx),
-                      make_tuple(4, 3, variance16x8_mmx),
-                      make_tuple(4, 4, variance16x16_mmx)));
-#endif
-
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
+
+INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
+                        ::testing::Values(vp9_get_mb_ss_sse2));
+
 const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
 const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
 const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
diff --git a/test/video_source.h b/test/video_source.h
index c924f96..84bfa8e 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -92,12 +92,7 @@
  protected:
   void CloseFile() {
     if (file_) {
-      // Close if file pointer is associated with an open file
-#if defined(_WIN32)
-      if (file_->_ptr != NULL) fclose(file_);
-#else
-      if (fileno(file_) != -1) fclose(file_);
-#endif
+      fclose(file_);
       file_ = NULL;
     }
   }
diff --git a/test/vp8_decrypt_test.cc b/test/vp8_decrypt_test.cc
index 470fdf1..972a1d9 100644
--- a/test/vp8_decrypt_test.cc
+++ b/test/vp8_decrypt_test.cc
@@ -47,7 +47,7 @@
   libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf");
   video.Init();
 
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
   VP8Decoder decoder(dec_cfg, 0);
 
   video.Begin();
diff --git a/test/vp8_multi_resolution_encoder.sh b/test/vp8_multi_resolution_encoder.sh
new file mode 100755
index 0000000..a8b7fe7
--- /dev/null
+++ b/test/vp8_multi_resolution_encoder.sh
@@ -0,0 +1,75 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx vp8_multi_resolution_encoder example. To add new
+##  tests to this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to vp8_mre_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+vp8_multi_resolution_encoder_verify_environment() {
+  if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
+    if [ ! -e "${YUV_RAW_INPUT}" ]; then
+      elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+      return 1
+    fi
+    local readonly app="vp8_multi_resolution_encoder"
+    if [ -z "$(vpx_tool_path "${app}")" ]; then
+      elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent."
+      return 1
+    fi
+  fi
+}
+
+# Runs vp8_multi_resolution_encoder. Simply forwards all arguments to
+# vp8_multi_resolution_encoder after building the path to the executable.
+vp8_mre() {
+  local readonly encoder="$(vpx_tool_path vp8_multi_resolution_encoder)"
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "$@" ${devnull}
+}
+
+vp8_multi_resolution_encoder_three_formats() {
+  local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf
+                               ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf
+                               ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf"
+
+  if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
+    if [ "$(vp8_encode_available)" = "yes" ]; then
+      # Param order:
+      #  Input width
+      #  Input height
+      #  Input file path
+      #  Output file names
+      #  Output PSNR
+      vp8_mre "${YUV_RAW_INPUT_WIDTH}" \
+        "${YUV_RAW_INPUT_HEIGHT}" \
+        "${YUV_RAW_INPUT}" \
+        ${output_files} \
+        0
+
+      for output_file in ${output_files}; do
+        if [ ! -e "${output_file}" ]; then
+          elog "Missing output file: ${output_file}"
+          return 1
+        fi
+      done
+    fi
+  fi
+}
+
+vp8_mre_tests="vp8_multi_resolution_encoder_three_formats"
+run_tests vp8_multi_resolution_encoder_verify_environment "${vp8_mre_tests}"
diff --git a/test/vp9_decrypt_test.cc b/test/vp9_decrypt_test.cc
index 88a3c14..d988612 100644
--- a/test/vp9_decrypt_test.cc
+++ b/test/vp9_decrypt_test.cc
@@ -47,7 +47,7 @@
   libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf");
   video.Init();
 
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
   VP9Decoder decoder(dec_cfg, 0);
 
   video.Begin();
diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index d7fc4ee..cc35476 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -163,7 +163,7 @@
   libvpx_test::WebMVideoSource video(filename);
   video.Init();
 
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   cfg.threads = num_threads;
   libvpx_test::VP9Decoder decoder(cfg, 0);
 
diff --git a/test/vpxdec.sh b/test/vpxdec.sh
index 836b13c..f92acbd 100755
--- a/test/vpxdec.sh
+++ b/test/vpxdec.sh
@@ -17,14 +17,13 @@
 # Environment check: Make sure input is available.
 vpxdec_verify_environment() {
   if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ]; then
-    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
     return 1
   fi
-}
-
-# Echoes yes to stdout when vpxdec exists according to vpx_tool_available().
-vpxdec_available() {
-  [ -n "$(vpx_tool_available vpxdec)" ] && echo yes
+  if [ -z "$(vpx_tool_path vpxdec)" ]; then
+    elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
 }
 
 # Wrapper function for running vpxdec with pipe input. Requires that
@@ -32,8 +31,8 @@
 # input file path and shifted away. All remaining parameters are passed through
 # to vpxdec.
 vpxdec_pipe() {
-  local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
-  local input="$1"
+  local readonly decoder="$(vpx_tool_path vpxdec)"
+  local readonly input="$1"
   shift
   cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull}
 }
@@ -42,22 +41,20 @@
 # the directory containing vpxdec. $1 one is used as the input file path and
 # shifted away. All remaining parameters are passed through to vpxdec.
 vpxdec() {
-  local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
-  local input="${1}"
+  local readonly decoder="$(vpx_tool_path vpxdec)"
+  local readonly input="$1"
   shift
   eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
 }
 
 vpxdec_can_decode_vp8() {
-  if [ "$(vpxdec_available)" = "yes" ] && \
-     [ "$(vp8_decode_available)" = "yes" ]; then
+  if [ "$(vp8_decode_available)" = "yes" ]; then
     echo yes
   fi
 }
 
 vpxdec_can_decode_vp9() {
-  if [ "$(vpxdec_available)" = "yes" ] && \
-     [ "$(vp9_decode_available)" = "yes" ]; then
+  if [ "$(vp9_decode_available)" = "yes" ]; then
     echo yes
   fi
 }
diff --git a/test/vpxenc.sh b/test/vpxenc.sh
index 6e9ad35..9674bdc 100755
--- a/test/vpxenc.sh
+++ b/test/vpxenc.sh
@@ -20,28 +20,59 @@
 # Environment check: Make sure input is available.
 vpxenc_verify_environment() {
   if [ ! -e "${YUV_RAW_INPUT}" ]; then
-    echo "The file ${YUV_RAW_INPUT##*/} must exist in LIBVPX_TEST_DATA_PATH."
+    elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ -z "$(vpx_tool_path vpxenc)" ]; then
+    elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent."
     return 1
   fi
 }
 
 vpxenc_can_encode_vp8() {
-  if [ "$(vpxenc_available)" = "yes" ] && \
-     [ "$(vp8_encode_available)" = "yes" ]; then
+  if [ "$(vp8_encode_available)" = "yes" ]; then
     echo yes
   fi
 }
 
 vpxenc_can_encode_vp9() {
-  if [ "$(vpxenc_available)" = "yes" ] && \
-     [ "$(vp9_encode_available)" = "yes" ]; then
+  if [ "$(vp9_encode_available)" = "yes" ]; then
     echo yes
   fi
 }
 
-# Echoes yes to stdout when vpxenc exists according to vpx_tool_available().
-vpxenc_available() {
-  [ -n "$(vpx_tool_available vpxenc)" ] && echo yes
+# Echoes vpxenc command line parameters for using hantro_collage_w352h288.yuv
+# as input.
+yuv_input_hantro_collage() {
+  echo ""${YUV_RAW_INPUT}"
+       --width="${YUV_RAW_INPUT_WIDTH}"
+       --height="${YUV_RAW_INPUT_HEIGHT}""
+}
+
+# Echoes default vpxenc real-time encoding params. $1 is the codec, which
+# defaults to vp8 when unspecified.
+vpxenc_rt_params() {
+  local readonly codec="${1:-vp8}"
+  echo "--codec=${codec}
+    --buf-initial-sz=500
+    --buf-optimal-sz=600
+    --buf-sz=1000
+    --cpu-used=-5
+    --end-usage=cbr
+    --error-resilient=1
+    --kf-max-dist=90000
+    --lag-in-frames=0
+    --max-intra-rate=300
+    --max-q=56
+    --min-q=2
+    --noise-sensitivity=0
+    --overshoot-pct=50
+    --passes=1
+    --profile=0
+    --resize-allowed=0
+    --rt
+    --static-thresh=0
+    --undershoot-pct=50"
 }
 
 # Wrapper function for running vpxenc with pipe input. Requires that
@@ -49,51 +80,34 @@
 # input file path and shifted away. All remaining parameters are passed through
 # to vpxenc.
 vpxenc_pipe() {
-  local readonly encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}"
+  local readonly encoder="$(vpx_tool_path vpxenc)"
   local readonly input="$1"
   shift
-  cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - "$@" ${devnull}
+  cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - \
+    --test-decode=fatal \
+    "$@" ${devnull}
 }
 
 # Wrapper function for running vpxenc. Requires that LIBVPX_BIN_PATH points to
 # the directory containing vpxenc. $1 one is used as the input file path and
 # shifted away. All remaining parameters are passed through to vpxenc.
 vpxenc() {
-  local readonly encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}"
-  local readonly input="${1}"
+  local readonly encoder="$(vpx_tool_path vpxenc)"
+  local readonly input="$1"
   shift
-  eval "${VPX_TEST_PREFIX}" "${encoder}" "$input" "$@" ${devnull}
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \
+    --test-decode=fatal \
+    "$@" ${devnull}
 }
 
 vpxenc_vp8_ivf() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf"
-    vpxenc --codec=vp8 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
       --limit="${TEST_FRAMES}" \
       --ivf \
-      --output="${output}" \
-      "${YUV_RAW_INPUT}"
-
-    if [ ! -e "${output}" ]; then
-      elog "Output file does not exist."
-      return 1
-    fi
-  fi
-}
-
-vpxenc_vp8_ivf_piped_input() {
-  if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
-    cat "${YUV_RAW_INPUT}" \
-      | vpxenc --codec=vp8 \
-        --width="${YUV_RAW_INPUT_WIDTH}" \
-        --height="${YUV_RAW_INPUT_HEIGHT}" \
-        --limit="${TEST_FRAMES}" \
-        --ivf \
-        --output="${output}" \
-        -
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -106,12 +120,78 @@
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
-    vpxenc --codec=vp8 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_webm_rt() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      $(vpxenc_rt_params vp8) \
+      --output="${output}"
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_webm_2pass() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
       --limit="${TEST_FRAMES}" \
       --output="${output}" \
-      "${YUV_RAW_INPUT}"
+      --passes=2
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_webm_lag10_frames20() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly lag_total_frames=20
+    local readonly lag_frames=10
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${lag_total_frames}" \
+      --lag-in-frames="${lag_frames}" \
+      --output="${output}" \
+      --auto-alt-ref=1 \
+      --passes=2
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_ivf_piped_input() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
+    vpxenc_pipe $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -123,13 +203,11 @@
 vpxenc_vp9_ivf() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
-      --output="${output}" \
-      "${YUV_RAW_INPUT}"
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -142,12 +220,42 @@
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm_rt() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      $(vpxenc_rt_params vp9) \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm_2pass() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --output="${output}" \
-      "${YUV_RAW_INPUT}"
+      --passes=2
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -159,15 +267,12 @@
 vpxenc_vp9_ivf_lossless() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
       --output="${output}" \
-      --lossless=1 \
-      --test-decode=fatal \
-      "${YUV_RAW_INPUT}"
+      --lossless=1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -179,16 +284,34 @@
 vpxenc_vp9_ivf_minq0_maxq0() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
       --output="${output}" \
       --min-q=0 \
-      --max-q=0 \
-      --test-decode=fatal \
-      "${YUV_RAW_INPUT}"
+      --max-q=0
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm_lag10_frames20() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly lag_total_frames=20
+    local readonly lag_frames=10
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
+      --limit="${lag_total_frames}" \
+      --lag-in-frames="${lag_frames}" \
+      --output="${output}" \
+      --passes=2 \
+      --auto-alt-ref=1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -199,10 +322,16 @@
 
 vpxenc_tests="vpxenc_vp8_ivf
               vpxenc_vp8_webm
+              vpxenc_vp8_webm_rt
+              vpxenc_vp8_webm_2pass
+              vpxenc_vp8_webm_lag10_frames20
               vpxenc_vp8_ivf_piped_input
               vpxenc_vp9_ivf
               vpxenc_vp9_webm
+              vpxenc_vp9_webm_rt
+              vpxenc_vp9_webm_2pass
               vpxenc_vp9_ivf_lossless
-              vpxenc_vp9_ivf_minq0_maxq0"
+              vpxenc_vp9_ivf_minq0_maxq0
+              vpxenc_vp9_webm_lag10_frames20"
 
 run_tests vpxenc_verify_environment "${vpxenc_tests}"
diff --git a/test/y4m_test.cc b/test/y4m_test.cc
index d4a2ede..58a6fe3 100644
--- a/test/y4m_test.cc
+++ b/test/y4m_test.cc
@@ -57,7 +57,7 @@
   for (plane = 0; plane < 3; ++plane) {
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+    const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
     const int h = (plane ? (img->d_h + img->y_chroma_shift) >>
                    img->y_chroma_shift : img->d_h);
     const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
@@ -141,11 +141,11 @@
   Y4mVideoWriteTest() {}
 
   virtual ~Y4mVideoWriteTest() {
-    CloseSource();
     delete tmpfile_;
+    input_file_ = NULL;
   }
 
-  virtual void ReplaceInputFile(FILE *input_file) {
+  void ReplaceInputFile(FILE *input_file) {
     CloseSource();
     frame_ = 0;
     input_file_ = input_file;
diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx
index fa5b498..3869d25 100644
--- a/third_party/libyuv/README.libvpx
+++ b/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1041
+Version: 1060
 License: BSD
 License File: LICENSE
 
@@ -13,4 +13,4 @@
 in order to encode multiple resolution bit streams.
 
 Local Modifications:
-None.
+cherry-pick 'Issue 24479004: Fix building with MSVC for arm'
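
[Reviewer note] The README text above says libyuv is pulled in to scale the
input down when encoding multiple resolution bit streams. For context, a
minimal sketch (ours, not part of this patch) of the public entry point
involved, I420Scale() from libyuv's scale.h; the half-size target and the
tightly packed strides are illustrative assumptions:

#include "libyuv/scale.h"

/* Illustrative only: scale a tightly packed I420 frame to half size. */
int ScaleToHalf(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                int src_w, int src_h,
                uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int dst_w = src_w / 2;
  const int dst_h = src_h / 2;
  /* Strides assume no row padding; returns 0 on success. */
  return I420Scale(src_y, src_w,
                   src_u, (src_w + 1) / 2,
                   src_v, (src_w + 1) / 2,
                   src_w, src_h,
                   dst_y, dst_w,
                   dst_u, (dst_w + 1) / 2,
                   dst_v, (dst_w + 1) / 2,
                   dst_w, dst_h,
                   kFilterBox);
}
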
diff --git a/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/third_party/libyuv/include/libyuv/mjpeg_decoder.h
index 82fd95d..8423121 100644
--- a/third_party/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -153,7 +153,6 @@
      int* subsample_x, int* subsample_y, int number_of_components);
 
  private:
-
   void AllocOutputBuffers(int num_outbufs);
   void DestroyOutputBuffers();
 
diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h
index fdfe1ae..4b3c870 100644
--- a/third_party/libyuv/include/libyuv/row.h
+++ b/third_party/libyuv/include/libyuv/row.h
@@ -252,6 +252,94 @@
 
 // The following are available on arm64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+// #define HAS_I444TOARGBROW_NEON
+// #define HAS_I422TOARGBROW_NEON
+// #define HAS_I411TOARGBROW_NEON
+// #define HAS_I422TOBGRAROW_NEON
+// #define HAS_I422TOABGRROW_NEON
+// #define HAS_I422TORGBAROW_NEON
+// #define HAS_I422TORGB24ROW_NEON
+// #define HAS_I422TORAWROW_NEON
+// #define HAS_I422TORGB565ROW_NEON
+// #define HAS_I422TOARGB1555ROW_NEON
+// #define HAS_I422TOARGB4444ROW_NEON
+// #define HAS_YTOARGBROW_NEON
+// #define HAS_I400TOARGBROW_NEON
+// #define HAS_NV12TOARGBROW_NEON
+// #define HAS_NV21TOARGBROW_NEON
+// #define HAS_NV12TORGB565ROW_NEON
+// #define HAS_NV21TORGB565ROW_NEON
+// #define HAS_YUY2TOARGBROW_NEON
+// #define HAS_UYVYTOARGBROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_ARGBSETROWS_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+// #define HAS_RGB565TOARGBROW_NEON
+// #define HAS_ARGB1555TOARGBROW_NEON
+// #define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_YUY2TOYROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_HALFROW_NEON
+#define HAS_ARGBTOBAYERROW_NEON
+#define HAS_ARGBTOBAYERGGROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+// #define HAS_ARGBTORGB565ROW_NEON
+// #define HAS_ARGBTOARGB1555ROW_NEON
+// #define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+// #define HAS_ARGBTOUV444ROW_NEON
+// #define HAS_ARGBTOUV422ROW_NEON
+// #define HAS_ARGBTOUV411ROW_NEON
+// #define HAS_ARGBTOUVROW_NEON
+// #define HAS_ARGBTOUVJROW_NEON
+// #define HAS_BGRATOUVROW_NEON
+// #define HAS_ABGRTOUVROW_NEON
+// #define HAS_RGBATOUVROW_NEON
+// #define HAS_RGB24TOUVROW_NEON
+// #define HAS_RAWTOUVROW_NEON
+// #define HAS_RGB565TOUVROW_NEON
+// #define HAS_ARGB1555TOUVROW_NEON
+// #define HAS_ARGB4444TOUVROW_NEON
+// #define HAS_RGB565TOYROW_NEON
+// #define HAS_ARGB1555TOYROW_NEON
+// #define HAS_ARGB4444TOYROW_NEON
+// #define HAS_BGRATOYROW_NEON
+// #define HAS_ABGRTOYROW_NEON
+// #define HAS_RGBATOYROW_NEON
+// #define HAS_RGB24TOYROW_NEON
+// #define HAS_RAWTOYROW_NEON
+// #define HAS_INTERPOLATEROW_NEON
+// #define HAS_ARGBBLENDROW_NEON
+// #define HAS_ARGBATTENUATEROW_NEON
+// #define HAS_ARGBQUANTIZEROW_NEON
+// #define HAS_ARGBSHADEROW_NEON
+// #define HAS_ARGBGRAYROW_NEON
+// #define HAS_ARGBSEPIAROW_NEON
+// #define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELYROW_NEON
 #endif
 
 // The following are available on Neon platforms:
@@ -465,7 +553,7 @@
     #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
 #endif  // defined(__native_client__) && defined(__x86_64__)
 
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
 #undef MEMACCESS
 #if defined(__native_client__)
 #define MEMACCESS(base) ".p2align   3\nbic %" #base ", #0xc0000000\n"
diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h
index 8dc0762..3c49542 100644
--- a/third_party/libyuv/include/libyuv/scale_row.h
+++ b/third_party/libyuv/include/libyuv/scale_row.h
@@ -51,6 +51,14 @@
 #define HAS_SCALEROWDOWN38_NEON
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
+#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__aarch64__) || defined(LIBYUV_NEON))
+/* #define HAS_SCALEROWDOWN2_NEON */
+/* #define HAS_SCALEROWDOWN4_NEON */
+/* #define HAS_SCALEROWDOWN34_NEON */
+/* #define HAS_SCALEROWDOWN38_NEON */
+/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */
+/* #define HAS_SCALEARGBROWDOWN2_NEON */
 #endif
 
 // The following are available on Mips platforms:
diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h
index 912c4c9..73a7f1b 100644
--- a/third_party/libyuv/include/libyuv/version.h
+++ b/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1041
+#define LIBYUV_VERSION 1059
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/third_party/libyuv/source/compare.cc b/third_party/libyuv/source/compare.cc
index 9ea81b4..dc715e0 100644
--- a/third_party/libyuv/source/compare.cc
+++ b/third_party/libyuv/source/compare.cc
@@ -80,7 +80,7 @@
 
 uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
 #if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SUMSQUAREERROR_NEON
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
 #endif
diff --git a/third_party/libyuv/source/compare_neon.cc b/third_party/libyuv/source/compare_neon.cc
index 5e7b8e4..55052c0 100644
--- a/third_party/libyuv/source/compare_neon.cc
+++ b/third_party/libyuv/source/compare_neon.cc
@@ -56,6 +56,45 @@
   return sse;
 }
 
+#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "eor        v16.16b, v16.16b, v16.16b      \n"
+    "eor        v18.16b, v18.16b, v18.16b      \n"
+    "eor        v17.16b, v17.16b, v17.16b      \n"
+    "eor        v19.16b, v19.16b, v19.16b      \n"
+
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"
+    "subs       %2, %2, #16                    \n"
+    "usubl      v2.8h, v0.8b, v1.8b            \n"
+    "usubl2     v3.8h, v0.16b, v1.16b          \n"
+    "smlal      v16.4s, v2.4h, v2.4h           \n"
+    "smlal      v17.4s, v3.4h, v3.4h           \n"
+    "smlal2     v18.4s, v2.8h, v2.8h           \n"
+    "smlal2     v19.4s, v3.8h, v3.8h           \n"
+    "bgt        1b                             \n"
+
+    "add        v16.4s, v16.4s, v17.4s         \n"
+    "add        v18.4s, v18.4s, v19.4s         \n"
+    "add        v19.4s, v16.4s, v18.4s         \n"
+    "addv       s0, v19.4s                     \n"
+    "fmov       %w3, s0                        \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus
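
[Reviewer note] The new aarch64 SumSquareError_NEON above widens byte
differences to 16 bits (usubl/usubl2), squares into four 32-bit accumulators
(smlal/smlal2), and reduces with addv. A minimal C reference (ours) of the
semantics it must match -- equivalent to the SumSquareError_C declared in
compare.cc; the NEON loop consumes 16 bytes per iteration, so count is
expected to be a multiple of 16:

#include "libyuv/basic_types.h"

uint32 SumSquareErrorRef(const uint8* src_a, const uint8* src_b, int count) {
  uint32 sse = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    const int diff = (int)src_a[i] - (int)src_b[i];  /* range [-255, 255] */
    sse += (uint32)(diff * diff);
  }
  return sse;
}
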
diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc
index 874a6cb..a8e294f 100644
--- a/third_party/libyuv/source/convert.cc
+++ b/third_party/libyuv/source/convert.cc
@@ -401,7 +401,7 @@
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
   int y;
-  int halfheight = (height + 1) >> 1;
+  int halfheight;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
   void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
       int pix) = YUY2ToUV422Row_C;
@@ -711,11 +711,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
@@ -963,9 +965,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1022,36 +1021,44 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_RGB24TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYROW_NEON)
-    RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
-    RGB24ToYRow(src_rgb24, dst_y, width);
-    RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
-#else
-    RGB24ToARGBRow(src_rgb24, row, width);
-    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_rgb24 += src_stride_rgb24 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-#if defined(HAS_RGB24TOYROW_NEON)
-    RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
-    RGB24ToYRow(src_rgb24, dst_y, width);
-#else
-    RGB24ToARGBRow(src_rgb24, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-#endif
-  }
+  {
 #if !defined(HAS_RGB24TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
 #endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW_NEON)
+      RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+      RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb24 += src_stride_rgb24 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_RGB24TOYROW_NEON)
+      RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RGB24TOYROW_NEON)
+    free_aligned_buffer_64(row);
+#endif
+  }
   return 0;
 }
 
@@ -1075,9 +1082,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_raw || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1134,36 +1138,42 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_RAWTOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
-    RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
-    RAWToYRow(src_raw, dst_y, width);
-    RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
-#else
-    RAWToARGBRow(src_raw, row, width);
-    RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_raw += src_stride_raw * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+
+    for (y = 0; y < height - 1; y += 2) {
+  #if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+      RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+  #else
+      RAWToARGBRow(src_raw, row, width);
+      RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+  #endif
+      src_raw += src_stride_raw * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+  #if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+  #else
+      RAWToARGBRow(src_raw, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+  #endif
+    }
+  #if !defined(HAS_RAWTOYROW_NEON)
+    free_aligned_buffer_64(row);
+  #endif
   }
-  if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
-    RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
-    RAWToYRow(src_raw, dst_y, width);
-#else
-    RAWToARGBRow(src_raw, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-#endif
-  }
-#if !defined(HAS_RAWTOYROW_NEON)
-  free_aligned_buffer_64(row);
-#endif
   return 0;
 }
 
@@ -1187,9 +1197,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1246,36 +1253,44 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_RGB565TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB565TOYROW_NEON)
-    RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
-    RGB565ToYRow(src_rgb565, dst_y, width);
-    RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
-#else
-    RGB565ToARGBRow(src_rgb565, row, width);
-    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_rgb565 += src_stride_rgb565 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-#if defined(HAS_RGB565TOYROW_NEON)
-    RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
-    RGB565ToYRow(src_rgb565, dst_y, width);
-#else
-    RGB565ToARGBRow(src_rgb565, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-#endif
-  }
+  {
 #if !defined(HAS_RGB565TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
 #endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB565TOYROW_NEON)
+      RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+      RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+      RGB565ToARGBRow(src_rgb565, row, width);
+      RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb565 += src_stride_rgb565 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_RGB565TOYROW_NEON)
+      RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+      RGB565ToARGBRow(src_rgb565, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RGB565TOYROW_NEON)
+    free_aligned_buffer_64(row);
+#endif
+  }
   return 0;
 }
 
@@ -1299,9 +1314,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1358,38 +1370,45 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_ARGB1555TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
-    ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
-    ARGB1555ToYRow(src_argb1555, dst_y, width);
-    ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
-                   width);
-#else
-    ARGB1555ToARGBRow(src_argb1555, row, width);
-    ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
-                      width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+  {
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
 #endif
-    src_argb1555 += src_stride_argb1555 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_ARGB1555TOYROW_NEON)
-    ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
-    ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+                     width);
 #else
-    ARGB1555ToARGBRow(src_argb1555, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-  }
+      src_argb1555 += src_stride_argb1555 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+      ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
 #if !defined(HAS_ARGB1555TOYROW_NEON)
   free_aligned_buffer_64(row);
 #endif
+  }
   return 0;
 }
 
@@ -1413,9 +1432,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1472,38 +1488,46 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_ARGB4444TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
-    ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
-    ARGB4444ToYRow(src_argb4444, dst_y, width);
-    ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
-                   width);
-#else
-    ARGB4444ToARGBRow(src_argb4444, row, width);
-    ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
-                      width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_argb4444 += src_stride_argb4444 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
-    ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
-    ARGB4444ToYRow(src_argb4444, dst_y, width);
-#else
-    ARGB4444ToARGBRow(src_argb4444, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-#endif
-  }
+  {
 #if !defined(HAS_ARGB4444TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
 #endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+      ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+      ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+                     width);
+#else
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_argb4444 += src_stride_argb4444 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+      ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+    free_aligned_buffer_64(row);
+#endif
+  }
   return 0;
 }
 
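[Reviewer note] The recurring refactor in this file moves the scratch-row
allocation from function scope into its own brace-scoped block. That keeps
the #if-guarded declarations at the start of a block (as C90 requires) and
allocates the rows only on the non-NEON path that actually uses them. A
sketch (ours; names borrowed from the diff, loop body elided):

#include "libyuv/row.h"  /* align_buffer_64, free_aligned_buffer_64, HAS_* */

static void ConvertTwoRowsAtATime(int width, int height) {
  int y;
  {
#if !defined(HAS_RGB24TOYROW_NEON)
    const int kRowSize = (width * 4 + 15) & ~15;  /* 16-byte-aligned row */
    align_buffer_64(row, kRowSize * 2);           /* two ARGB scratch rows */
#endif
    for (y = 0; y < height - 1; y += 2) {
      /* convert a pair of rows, staging through `row` when no NEON kernel */
    }
#if !defined(HAS_RGB24TOYROW_NEON)
    free_aligned_buffer_64(row);
#endif
  }
}
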
diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc
index 121a416..de461dd 100644
--- a/third_party/libyuv/source/convert_from_argb.cc
+++ b/third_party/libyuv/source/convert_from_argb.cc
@@ -60,6 +60,13 @@
         }
       }
   }
+#elif defined(HAS_ARGBTOUV444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToUV444Row = ARGBToUV444Row_NEON;
+    }
+  }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -76,10 +83,8 @@
 #elif defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
-    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
-      ARGBToUV444Row = ARGBToUV444Row_NEON;
     }
   }
 #endif
@@ -134,6 +139,13 @@
       }
     }
   }
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
+    }
+  }
 #endif
 
 #if defined(HAS_ARGBTOYROW_SSSE3)
@@ -153,12 +165,6 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
@@ -228,11 +234,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 32) {
-      ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
-      if (IS_ALIGNED(width, 32)) {
-        ARGBToUV411Row = ARGBToUV411Row_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
+    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUV411Row = ARGBToUV411Row_NEON;
     }
   }
 #endif
@@ -261,9 +269,6 @@
       ARGBToYRow_C;
   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                       int width) = MergeUVRow_C;
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
   if (!src_argb ||
       !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
@@ -296,11 +301,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
@@ -331,22 +338,27 @@
     }
   }
 #endif
+  {
+    // Allocate a row of U and a row of V.
+    align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+    uint8* row_v = row_u + ((halfwidth + 15) & ~15);
 
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_uv += dst_stride_uv;
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
   }
-  if (height & 1) {
-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-  }
-  free_aligned_buffer_64(row_u);
   return 0;
 }
 
@@ -364,9 +376,6 @@
       ARGBToYRow_C;
   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                       int width) = MergeUVRow_C;
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
   if (!src_argb ||
       !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
@@ -399,11 +408,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
@@ -434,22 +445,27 @@
     }
   }
 #endif
+  {
+    // Allocate a row of U and a row of V.
+    align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+    uint8* row_v = row_u + ((halfwidth + 15) & ~15);
 
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_uv += dst_stride_uv;
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
   }
-  if (height & 1) {
-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-  }
-  free_aligned_buffer_64(row_u);
   return 0;
 }
 
@@ -493,6 +509,13 @@
       }
     }
   }
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
+    }
+  }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -510,12 +533,6 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
@@ -594,6 +611,13 @@
       }
     }
   }
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
+    }
+  }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -611,12 +635,6 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
@@ -1022,11 +1040,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYJRow = ARGBToYJRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVJRow = ARGBToUVJRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON;
     }
   }
 #endif
diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc
index 2e0d61d..8f8a403 100644
--- a/third_party/libyuv/source/cpu_id.cc
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -14,8 +14,9 @@
 #include <intrin.h>  // For __cpuidex()
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
-    !defined(__native_client__) && defined(_M_X64) && \
-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+    !defined(__native_client__)  && \
+    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
+    (defined(_M_IX86) || defined(_M_X64))
 #include <immintrin.h>  // For _xgetbv()
 #endif
 
@@ -97,7 +98,7 @@
   uint32 xcr0 = 0u;
 #if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
   xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
-#elif defined(_M_IX86)
+#elif defined(_M_IX86) && defined(_MSC_VER)
   __asm {
     xor        ecx, ecx    // xcr 0
     _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
@@ -256,12 +257,17 @@
   if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
     cpu_info_ &= ~kCpuHasMIPS_DSPR2;
   }
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__
 // __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
 // For Linux, /proc/cpuinfo can be tested but without that assume Neon.
 #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
   cpu_info_ = kCpuHasNEON;
+// On aarch64 (arm64), the /proc/cpuinfo feature list is incomplete; e.g. it
+// carries no neon flag. So for aarch64, NEON support is hard coded here.
+#elif defined(__aarch64__)
+  cpu_info_ = kCpuHasNEON;
 #else
   // Linux arm parse text file for neon detect.
   cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
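
[Reviewer note] With kCpuHasNEON hard coded on aarch64, the runtime dispatch
in the conversion code selects the NEON row kernels on arm64 without parsing
/proc/cpuinfo. A minimal sketch (ours) of that dispatch pattern, using the
real TestCpuFlag()/CopyRow_* names from cpu_id.h and row.h:

#include "libyuv/cpu_id.h"  /* TestCpuFlag, kCpuHasNEON */
#include "libyuv/row.h"     /* CopyRow_C, CopyRow_NEON, IS_ALIGNED */

typedef void (*CopyRowFunc)(const uint8* src, uint8* dst, int count);

static CopyRowFunc ChooseCopyRow(int count) {
  CopyRowFunc func = CopyRow_C;  /* portable fallback */
#if defined(HAS_COPYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(count, 32)) {
    func = CopyRow_NEON;  /* the arm64 loop above copies 32 bytes/iteration */
  }
#endif
  return func;
}
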
diff --git a/third_party/libyuv/source/format_conversion.cc b/third_party/libyuv/source/format_conversion.cc
index a3daf96..3c17371 100644
--- a/third_party/libyuv/source/format_conversion.cc
+++ b/third_party/libyuv/source/format_conversion.cc
@@ -332,11 +332,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
diff --git a/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/libyuv/source/mjpeg_decoder.cc
index 15b0ed8..36028c3 100644
--- a/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/third_party/libyuv/source/mjpeg_decoder.cc
@@ -13,8 +13,8 @@
 #ifdef HAVE_JPEG
 #include <assert.h>
 
-#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
-    !defined(TARGET_IPHONE_SIMULATOR)
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 // Must be included before jpeglib.
 #include <setjmp.h>
 #define HAVE_SETJMP
@@ -101,7 +101,7 @@
   }
 
   buf_.data = src;
-  buf_.len = (int)(src_len);
+  buf_.len = static_cast<int>(src_len);
   buf_vec_.pos = 0;
   decompress_struct_->client_data = &buf_vec_;
 #ifdef HAVE_SETJMP
@@ -411,7 +411,7 @@
 }
 
 boolean fill_input_buffer(j_decompress_ptr cinfo) {
-  BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
+  BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
   if (buf_vec->pos >= buf_vec->len) {
     assert(0 && "No more data");
     // ERROR: No more data
@@ -447,7 +447,7 @@
   // ERROR: Error in jpeglib: buf
 #endif
 
-  SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
+  SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
   // This rewinds the call stack to the point of the corresponding setjmp()
   // and causes it to return (for a second time) with value 1.
   longjmp(mgr->setjmp_buffer, 1);
diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc
index 97ef844..ce8b3da 100644
--- a/third_party/libyuv/source/row_any.cc
+++ b/third_party/libyuv/source/row_any.cc
@@ -79,9 +79,13 @@
 YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
      1, 2, 7)
 YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
-YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
-YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
 #endif  // HAS_I422TOARGBROW_NEON
+#ifdef HAS_I422TOYUY2ROW_NEON
+YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
+#endif  // HAS_I422TOYUY2ROW_NEON
+#ifdef HAS_I422TOUYVYROW_NEON
+YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
+#endif  // HAS_I422TOUYVYROW_NEON
 #undef YANY
 
 // Wrappers to handle odd width
@@ -250,12 +254,26 @@
 YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
 YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
 YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
 YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
 YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
 YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
 YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
 YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
 YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
 YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
 #endif
 #undef YANY
@@ -333,7 +351,11 @@
 UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
 UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
 UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
 UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
 UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
 #endif
 #undef UVANY
diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc
index 46e9ceb..21111cf 100644
--- a/third_party/libyuv/source/row_neon64.cc
+++ b/third_party/libyuv/source/row_neon64.cc
@@ -824,19 +824,19 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+    "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pairs of UV
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store U
+    "st1        {v0.16b}, [%1], #16            \n"  // store U
     MEMACCESS(2)
-    "vst1.8     {q1}, [%2]!                    \n"  // store V
+    "st1        {v1.16b}, [%2], #16            \n"  // store V
     "bgt        1b                             \n"
     : "+r"(src_uv),  // %0
       "+r"(dst_u),   // %1
       "+r"(dst_v),   // %2
       "+r"(width)    // %3  // Output registers
     :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
+    : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_SPLITUVROW_NEON
@@ -849,12 +849,12 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load U
+    "ld1        {v0.16b}, [%0], #16            \n"  // load U
     MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load V
+    "ld1        {v1.16b}, [%1], #16            \n"  // load V
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     MEMACCESS(2)
-    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+    "st2        {v0.16b, v1.16b}, [%2], #32    \n"  // store 16 pairs of UV
     "bgt        1b                             \n"
     :
       "+r"(src_u),   // %0
@@ -862,7 +862,7 @@
       "+r"(dst_uv),  // %2
       "+r"(width)    // %3  // Output registers
     :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
+    : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_MERGEUVROW_NEON
@@ -874,16 +874,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+    "ld1        {v0.8b-v3.8b}, [%0], #32       \n"  // load 32
     "subs       %2, %2, #32                    \n"  // 32 processed per loop
     MEMACCESS(1)
-    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
+    "st1        {v0.8b-v3.8b}, [%1], #32       \n"  // store 32
     "bgt        1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(count)  // %2  // Output registers
   :                     // Input registers
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_COPYROW_NEON
@@ -892,16 +892,16 @@
 #ifdef HAS_SETROW_NEON
 void SetRow_NEON(uint8* dst, uint32 v32, int count) {
   asm volatile (
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
+    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
     "1:                                        \n"
     "subs      %1, %1, #16                     \n"  // 16 bytes per loop
     MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "st1        {v0.16b}, [%0], #16            \n"  // store
     "bgt       1b                              \n"
   : "+r"(dst),   // %0
     "+r"(count)  // %1
   : "r"(v32)     // %2
-  : "cc", "memory", "q0"
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_SETROW_NEON
@@ -922,26 +922,25 @@
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
     // Start at end of source row.
-    "mov        r3, #-16                       \n"
     "add        %0, %0, %2                     \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #16                        \n"  // 16 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #16                    \n"  // 16 pixels per loop.
+    "rev64      v0.16b, v0.16b                 \n"
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
+    "st1        {v0.D}[0], [%1], #8            \n"
     "bgt        1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_MIRRORROW_NEON
@@ -951,27 +950,27 @@
                       int width) {
   asm volatile (
     // Start at end of source row.
-    "mov        r12, #-16                      \n"
     "add        %0, %0, %3, lsl #1             \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
-    "subs       %3, #8                         \n"  // 8 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
+    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
+    "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
+    "rev64      v0.8b, v0.8b                   \n"
+    "rev64      v1.8b, v1.8b                   \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+    "st1        {v0.8b}, [%1], #8               \n"  // dst += 8
     MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"
+    "st1        {v1.8b}, [%2], #8               \n"
     "bgt        1b                             \n"
   : "+r"(src_uv),  // %0
     "+r"(dst_u),   // %1
     "+r"(dst_v),   // %2
     "+r"(width)    // %3
-  :
-  : "cc", "memory", "r12", "q0"
+  : "r"((ptrdiff_t)-16)      // %4
+  : "cc", "memory", "v0", "v1"
   );
 }
 #endif  // HAS_MIRRORUVROW_NEON
@@ -980,26 +979,25 @@
 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
     // Start at end of source row.
-    "mov        r3, #-16                       \n"
     "add        %0, %0, %2, lsl #2             \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #4                         \n"  // 4 pixels per loop.
-    "vrev64.32  q0, q0                         \n"
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    "rev64      v0.4s, v0.4s                   \n"
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
+    "st1        {v0.D}[0], [%1], #8            \n"
     "bgt        1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_ARGBMIRRORROW_NEON
@@ -1007,20 +1005,20 @@
 #ifdef HAS_RGB24TOARGBROW_NEON
 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
+    "movi       v4.8b, #255                    \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "ld3        {v1.8b-v3.8b}, [%0], #24       \n"  // load 8 pixels of RGB24.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "st4        {v1.8b-v4.8b}, [%1], #32       \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
     "+r"(dst_argb),   // %1
     "+r"(pix)         // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
   );
 }
 #endif  // HAS_RGB24TOARGBROW_NEON
@@ -1028,21 +1026,22 @@
 #ifdef HAS_RAWTOARGBROW_NEON
 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
+    "movi       v5.8b, #255                    \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+    "ld3        {v0.8b-v2.8b}, [%0], #24       \n"  // read r g b
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
+    "mov        v3.8b, v1.8b                   \n"  // move g
+    "mov        v4.8b, v0.8b                   \n"  // move r
     MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "st4        {v2.8b-v5.8b}, [%1], #32       \n"  // store b g r a
     "bgt        1b                             \n"
   : "+r"(src_raw),   // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
   );
 }
 #endif  // HAS_RAWTOARGBROW_NEON
@@ -1170,16 +1169,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "ld4        {v1.8b-v4.8b}, [%0], #32       \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
+    "st3        {v1.8b-v3.8b}, [%1], #24       \n"  // store 8 pixels of RGB24.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_rgb24),  // %1
     "+r"(pix)         // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTORGB24ROW_NEON
@@ -1190,17 +1189,18 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "ld4        {v1.8b-v4.8b}, [%0], #32       \n"  // load b g r a
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
+    "mov        v4.8b, v2.8b                   \n"  // mov g
+    "mov        v5.8b, v1.8b                   \n"  // mov b
     MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
+    "st3        {v3.8b-v5.8b}, [%1], #24       \n"  // store r g b
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_raw),   // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTORAWROW_NEON
@@ -1211,16 +1211,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
+    "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pixels of YUY2.
     "subs       %2, %2, #16                    \n"  // 16 processed per loop.
     MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
+    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_YUY2TOYROW_NEON
@@ -1231,16 +1231,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
+    "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pixels of UYVY.
     "subs       %2, %2, #16                    \n"  // 16 processed per loop.
     MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
+    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_UYVYTOYROW_NEON
@@ -1252,19 +1252,19 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of YUY2.
     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
     MEMACCESS(2)
-    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
+    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
     "+r"(pix)        // %3
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_YUY2TOUV422ROW_NEON
@@ -1276,19 +1276,19 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of UYVY.
     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
     MEMACCESS(2)
-    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
+    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
     "+r"(pix)        // %3
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_UYVYTOUV422ROW_NEON
@@ -1297,20 +1297,20 @@
 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_yuy2
+    "add        %x1, %x0, %w1, sxtw            \n"  // stride + src_yuy2
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of YUY2.
     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
-    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
-    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
+    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load next row YUY2.
+    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
+    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
     MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
     MEMACCESS(3)
-    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
+    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),     // %0
     "+r"(stride_yuy2),  // %1
@@ -1318,7 +1318,7 @@
     "+r"(dst_v),        // %3
     "+r"(pix)           // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"  // Clobber List
   );
 }
 #endif  // HAS_YUY2TOUVROW_NEON
@@ -1327,20 +1327,20 @@
 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_uyvy
+    "add        %x1, %x0, %w1, sxtw            \n"  // stride + src_uyvy
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of UYVY.
     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
-    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
-    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
+    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load next row UYVY.
+    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
+    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
     MEMACCESS(3)
-    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
+    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),     // %0
     "+r"(stride_uyvy),  // %1
@@ -1348,7 +1348,7 @@
     "+r"(dst_v),        // %3
     "+r"(pix)           // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"  // Clobber List
   );
 }
 #endif  // HAS_UYVYTOUVROW_NEON
@@ -1358,23 +1358,23 @@
                   uint8* dst_uv, int pix) {
   asm volatile (
     // change the stride to row 2 pointer
-    "add        %1, %0                         \n"
+    "add        %x1, %x0, %w1, sxtw            \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load row 1 16 pixels.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load row 1 16 pixels.
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load row 2 16 pixels.
-    "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
+    "ld1        {v1.16b}, [%1], #16            \n"  // load row 2 16 pixels.
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // average row 1 and 2
     MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
+    "st1        {v0.16b}, [%2], #16            \n"
     "bgt        1b                             \n"
   : "+r"(src_uv),         // %0
     "+r"(src_uv_stride),  // %1
     "+r"(dst_uv),         // %2
     "+r"(pix)             // %3
   :
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_HALFROW_NEON
@@ -1384,22 +1384,22 @@
 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
   asm volatile (
-    "vmov.u32   d6[0], %3                      \n"  // selector
+    "mov        v2.s[0], %w3                   \n"  // selector
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 8 pixels.
+    "ld1        {v0.16b, v1.16b}, [%0], 32     \n"  // load row 8 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "vtbl.8     d4, {d0, d1}, d6               \n"  // look up 4 pixels
-    "vtbl.8     d5, {d2, d3}, d6               \n"  // look up 4 pixels
-    "vtrn.u32   d4, d5                         \n"  // combine 8 pixels
+    "tbl        v4.8b, {v0.16b}, v2.8b         \n"  // look up 4 pixels
+    "tbl        v5.8b, {v1.16b}, v2.8b         \n"  // look up 4 pixels
+    "trn1       v4.4s, v4.4s, v5.4s            \n"  // combine 8 pixels
     MEMACCESS(1)
-    "vst1.8     {d4}, [%1]!                    \n"  // store 8.
+    "st1        {v4.8b}, [%1], #8              \n"  // store 8.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_bayer),  // %1
     "+r"(pix)         // %2
   : "r"(selector)     // %3
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v4", "v5"   // Clobber List
   );
 }
 #endif  // HAS_ARGBTOBAYERROW_NEON
@@ -1411,16 +1411,16 @@
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load row 8 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
+    "st1        {v1.8b}, [%1], #8              \n"  // store 8 G's.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_bayer),  // %1
     "+r"(pix)         // %2
   :
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTOBAYERGGROW_NEON
@@ -1431,21 +1431,20 @@
                          const uint8* shuffler, int pix) {
   asm volatile (
     MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // shuffler
+    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
     "subs       %2, %2, #4                     \n"  // 4 processed per loop
-    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
-    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
     MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
+    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   : "r"(shuffler)    // %3
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
   );
 }
 #endif  // HAS_ARGBSHUFFLEROW_NEON
@@ -1459,14 +1458,15 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
+    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
+    "mov        v2.8b, v1.8b                   \n"
     MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
     MEMACCESS(2)
-    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
+    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
     "subs       %4, %4, #16                    \n"  // 16 pixels
     MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
+    "st4        {v0.8b-v3.8b}, [%3], #32       \n"  // Store 8 YUY2/16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_y),     // %0
     "+r"(src_u),     // %1
@@ -1474,7 +1474,7 @@
     "+r"(dst_yuy2),  // %3
     "+r"(width)      // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_I422TOYUY2ROW_NEON
@@ -1488,14 +1488,15 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
+    "ld2        {v1.8b, v2.8b}, [%0], #16      \n"  // load 16 Ys
+    "mov        v3.8b, v2.8b                   \n"
     MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
     MEMACCESS(2)
-    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
+    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
     "subs       %4, %4, #16                    \n"  // 16 pixels
     MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
+    "st4        {v0.8b-v3.8b}, [%3], #32       \n"  // Store 8 UYVY/16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_y),     // %0
     "+r"(src_u),     // %1
@@ -1503,7 +1504,7 @@
     "+r"(dst_uyvy),  // %3
     "+r"(width)      // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_I422TOUYVYROW_NEON
@@ -1577,28 +1578,28 @@
 #ifdef HAS_ARGBTOYROW_NEON
 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBTOYROW_NEON
@@ -1606,26 +1607,26 @@
 #ifdef HAS_ARGBTOYJROW_NEON
 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
+    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
+    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   );
 }
 #endif  // HAS_ARGBTOYJROW_NEON
@@ -3048,20 +3049,20 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
     MEMACCESS(1)
-    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q0, d0, d1                     \n"  // multiply B
-    "vmull.u8   q1, d2, d3                     \n"  // multiply G
-    "vmull.u8   q2, d4, d5                     \n"  // multiply R
-    "vmull.u8   q3, d6, d7                     \n"  // multiply A
-    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
-    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
-    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
-    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
+    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
+    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
+    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
+    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
+    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
+    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
+    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
   : "+r"(src_argb0),  // %0
@@ -3069,7 +3070,7 @@
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBMULTIPLYROW_NEON
@@ -3083,14 +3084,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
-    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"
+    "uqadd      v3.8b, v3.8b, v7.8b            \n"
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
   : "+r"(src_argb0),  // %0
@@ -3098,7 +3101,7 @@
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBADDROW_NEON
@@ -3112,14 +3115,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
-    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+    "uqsub      v0.8b, v0.8b, v4.8b            \n"
+    "uqsub      v1.8b, v1.8b, v5.8b            \n"
+    "uqsub      v2.8b, v2.8b, v6.8b            \n"
+    "uqsub      v3.8b, v3.8b, v7.8b            \n"
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
   : "+r"(src_argb0),  // %0
@@ -3127,7 +3132,7 @@
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBSUBTRACTROW_NEON
@@ -3141,27 +3146,27 @@
 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) {
   asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
+    "movi       v3.8b, #255                    \n"  // alpha
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
     MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d0, d0, d1                     \n"  // add
-    "vmov.u8    d1, d0                         \n"
-    "vmov.u8    d2, d0                         \n"
+    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
+    "mov        v1.8b, v0.8b                   \n"
+    "mov        v2.8b, v0.8b                   \n"
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_argb),    // %2
     "+r"(width)        // %3
   :
-  : "cc", "memory", "q0", "q1"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_SOBELROW_NEON
@@ -3175,20 +3180,20 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
     MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
+    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
     "subs       %3, %3, #16                    \n"  // 16 processed per loop.
-    "vqadd.u8   q0, q0, q1                     \n"  // add
+    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
     MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
+    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_y),       // %2
     "+r"(width)        // %3
   :
-  : "cc", "memory", "q0", "q1"
+  : "cc", "memory", "v0", "v1"
   );
 }
 #endif  // HAS_SOBELTOPLANEROW_NEON
@@ -3202,25 +3207,25 @@
 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) {
   asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
+    "movi       v3.8b, #255                    \n"  // alpha
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
     MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d1, d0, d2                     \n"  // add
+    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_argb),    // %2
     "+r"(width)        // %3
   :
-  : "cc", "memory", "q0", "q1"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_SOBELXYROW_NEON
@@ -3236,28 +3241,28 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%5                  \n"  // top
+    "ld1        {v0.8b}, [%0],%5               \n"  // top
     MEMACCESS(0)
-    "vld1.8     {d1}, [%0],%6                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
+    "ld1        {v1.8b}, [%0],%6               \n"
+    "usubl      v0.8h, v0.8b, v1.8b            \n"
     MEMACCESS(1)
-    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
     MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%6                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
+    "ld1        {v3.8b}, [%1],%6               \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
     MEMACCESS(2)
-    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
     MEMACCESS(2)
-    "vld1.8     {d3}, [%2],%6                  \n"
+    "ld1        {v3.8b}, [%2],%6               \n"
     "subs       %4, %4, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "abs        v0.8h, v0.8h                   \n"
+    "uqxtn      v0.8b, v0.8h                   \n"
     MEMACCESS(3)
-    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
+    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
     "bgt        1b                             \n"
   : "+r"(src_y0),      // %0
     "+r"(src_y1),      // %1
@@ -3266,7 +3271,7 @@
     "+r"(width)        // %4
   : "r"(2),            // %5
     "r"(6)             // %6
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_SOBELXROW_NEON
@@ -3282,28 +3287,28 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%4                  \n"  // left
+    "ld1        {v0.8b}, [%0],%4               \n"  // left
     MEMACCESS(1)
-    "vld1.8     {d1}, [%1],%4                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
+    "ld1        {v1.8b}, [%1],%4               \n"
+    "usubl      v0.8h, v0.8b, v1.8b            \n"
     MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
     MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%4                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
+    "ld1        {v3.8b}, [%1],%4               \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
     MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%5                  \n"  // right
+    "ld1        {v2.8b}, [%0],%5               \n"  // right
     MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%5                  \n"
+    "ld1        {v3.8b}, [%1],%5               \n"
     "subs       %3, %3, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "abs        v0.8h, v0.8h                   \n"
+    "uqxtn      v0.8b, v0.8h                   \n"
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
     "bgt        1b                             \n"
   : "+r"(src_y0),      // %0
     "+r"(src_y1),      // %1
@@ -3311,7 +3316,7 @@
     "+r"(width)        // %3
   : "r"(1),            // %4
     "r"(6)             // %5
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_SOBELYROW_NEON
diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc
index 8eb8889..d79c353 100644
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc
@@ -10,7 +10,7 @@
 
 #include "libyuv/row.h"
 
-#if defined (_M_X64)
+#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
 #include <emmintrin.h>
 #include <tmmintrin.h>  // For _mm_maddubs_epi16
 #endif
@@ -21,7 +21,8 @@
 #endif
 
 // This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+    (defined(_M_IX86) || defined(_M_X64))
 
 #define YG 74  /* (int8)(1.164 * 64 + 0.5) */
 
@@ -78,7 +79,6 @@
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
-
   __m128i xmm0, xmm1, xmm2, xmm3;
   const __m128i xmm5 = _mm_set1_epi8(-1);
   const __m128i xmm4 = _mm_setzero_si128();
@@ -132,7 +132,6 @@
                                    const uint8* v_buf,
                                    uint8* dst_argb,
                                    int width) {
-
   __m128i xmm0, xmm1, xmm2, xmm3;
   const __m128i xmm5 = _mm_set1_epi8(-1);
   const __m128i xmm4 = _mm_setzero_si128();
diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc
new file mode 100644
index 0000000..64c7d10
--- /dev/null
+++ b/third_party/libyuv/source/scale_neon64.cc
@@ -0,0 +1,790 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#ifdef HAS_SCALEROWDOWN2_NEON
+// Read 32x1 pixels, throw away the even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1"              // Clobber List
+  );
+}
+#endif  // HAS_SCALEROWDOWN2_NEON
+
+#ifdef HAS_SCALEROWDOWN2_NEON
+// Read 32x2 pixels, average down, and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+    MEMACCESS(1)
+    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
+    "vpaddl.u8  q1, q1                         \n"
+    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
+    "vpadal.u8  q1, q3                         \n"
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "q0", "q1", "q2", "q3"     // Clobber List
+  );
+}
+#endif  // HAS_SCALEROWDOWN2_NEON
+
+#ifdef HAS_SCALEROWDOWN4_NEON
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
+    "subs       %2, %2, #8                     \n" // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {d2}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1", "memory", "cc"
+  );
+}
+#endif  // HAS_SCALEROWDOWN4_NEON
+
+#ifdef HAS_SCALEROWDOWN4_NEON
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
+    MEMACCESS(3)
+    "vld1.8     {q1}, [%3]!                    \n"
+    MEMACCESS(4)
+    "vld1.8     {q2}, [%4]!                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q3}, [%5]!                    \n"
+    "subs       %2, %2, #4                     \n"
+    "vpaddl.u8  q0, q0                         \n"
+    "vpadal.u8  q0, q1                         \n"
+    "vpadal.u8  q0, q2                         \n"
+    "vpadal.u8  q0, q3                         \n"
+    "vpaddl.u16 q0, q0                         \n"
+    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
+    "vmovn.u16  d0, q0                         \n"
+    MEMACCESS(1)
+    "vst1.32    {d0[0]}, [%1]!                 \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width), // %2
+    "+r"(src_ptr1),  // %3
+    "+r"(src_ptr2),  // %4
+    "+r"(src_ptr3)   // %5
+  :
+  : "q0", "q1", "q2", "q3", "memory", "cc"
+  );
+}
+#endif  // HAS_SCALEROWDOWN4_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+// Down scale from 4 to 3 pixels. Use the Neon multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    "subs       %2, %2, #24                  \n"
+    "vmov       d2, d3                       \n" // order d0, d1, d2
+    MEMACCESS(1)
+    "vst3.8     {d0, d1, d2}, [%1]!          \n"
+    "bgt        1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "d0", "d1", "d2", "d3", "memory", "cc"
+  );
+}
+#endif  // HAS_SCALEROWDOWN34_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vmov.u8    d24, #3                        \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n"
+
+    // filter src line 0 with src line 1
+    // expand chars to shorts to allow for room
+    // when adding lines together
+    "vmovl.u8     q8, d4                       \n"
+    "vmovl.u8     q9, d5                       \n"
+    "vmovl.u8     q10, d6                      \n"
+    "vmovl.u8     q11, d7                      \n"
+
+    // 3 * line_0 + line_1
+    "vmlal.u8     q8, d0, d24                  \n"
+    "vmlal.u8     q9, d1, d24                  \n"
+    "vmlal.u8     q10, d2, d24                 \n"
+    "vmlal.u8     q11, d3, d24                 \n"
+
+    // (3 * line_0 + line_1) >> 2
+    "vqrshrn.u16  d0, q8, #2                   \n"
+    "vqrshrn.u16  d1, q9, #2                   \n"
+    "vqrshrn.u16  d2, q10, #2                  \n"
+    "vqrshrn.u16  d3, q11, #2                  \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q8, d1                       \n"
+    "vmlal.u8     q8, d0, d24                  \n"
+    "vqrshrn.u16  d0, q8, #2                   \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2                   \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q8, d2                       \n"
+    "vmlal.u8     q8, d3, d24                  \n"
+    "vqrshrn.u16  d2, q8, #2                   \n"
+
+    MEMACCESS(1)
+    "vst3.8       {d0, d1, d2}, [%1]!          \n"
+
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+  );
+}
+#endif  // HAS_SCALEROWDOWN34_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vmov.u8    d24, #3                        \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n"
+    // average src line 0 with src line 1
+    "vrhadd.u8    q0, q0, q2                   \n"
+    "vrhadd.u8    q1, q1, q3                   \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q3, d1                       \n"
+    "vmlal.u8     q3, d0, d24                  \n"
+    "vqrshrn.u16  d0, q3, #2                   \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2                   \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q3, d2                       \n"
+    "vmlal.u8     q3, d3, d24                  \n"
+    "vqrshrn.u16  d2, q3, #2                   \n"
+
+    MEMACCESS(1)
+    "vst3.8       {d0, d1, d2}, [%1]!          \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+  );
+}
+#endif  // HAS_SCALEROWDOWN34_NEON
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+static uvec8 kShuf38 =
+  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =
+  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =
+  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =
+  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
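+
+// vqrdmulh.s16 returns (2 * a * b + 0x8000) >> 16, so multiplying the 16-bit
+// sums by kMult38_Div6 (65536 / 12) is a rounded divide by 6, and by
+// kMult38_Div9 (65536 / 18) a rounded divide by 9.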
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q3}, [%3]                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
+    "subs       %2, %2, #12                    \n"
+    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
+    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
+    MEMACCESS(1)
+    "vst1.8     {d4}, [%1]!                    \n"
+    MEMACCESS(1)
+    "vst1.32    {d5[0]}, [%1]!                 \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(&kShuf38)           // %3
+  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+  );
+}
+
+#endif  // HAS_SCALEROWDOWN38_NEON
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.16    {q13}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {q14}, [%6]                    \n"
+    MEMACCESS(7)
+    "vld1.8     {q15}, [%7]                    \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    MEMACCESS(4)
+    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
+    "subs         %2, %2, #12                  \n"
+
+    // Shuffle the input data around to align it so adjacent
+    //  data can be added: 0,1 - 2,3 - 4,5 - 6,7
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8      d0, d1                       \n"
+    "vtrn.u8      d4, d5                       \n"
+    "vtrn.u8      d16, d17                     \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8      d2, d3                       \n"
+    "vtrn.u8      d6, d7                       \n"
+    "vtrn.u8      d18, d19                     \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d2 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8    q0, q0                       \n"
+    "vpaddl.u8    q2, q2                       \n"
+    "vpaddl.u8    q8, q8                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8    d3, d3                       \n"
+    "vpaddl.u8    d7, d7                       \n"
+    "vpaddl.u8    d19, d19                     \n"
+
+    // combine source lines
+    "vadd.u16     q0, q2                       \n"
+    "vadd.u16     q0, q8                       \n"
+    "vadd.u16     d4, d3, d7                   \n"
+    "vadd.u16     d4, d19                      \n"
+
+    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+    //             + s[6 + st * 1] + s[7 + st * 1]
+    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+    "vqrdmulh.s16 q2, q2, q13                  \n"
+    "vmovn.u16    d4, q2                       \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8     q1, d2                       \n"
+    "vmovl.u8     q3, d6                       \n"
+    "vmovl.u8     q9, d18                      \n"
+
+    // combine source lines
+    "vadd.u16     q1, q3                       \n"
+    "vadd.u16     q1, q9                       \n"
+
+    // d4 = xx 20 xx 30 xx 22 xx 32
+    // d5 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32     d2, d3                       \n"
+
+    // d4 = xx 20 xx 21 xx 22 xx 23
+    // d5 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16     d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16     q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "vqrdmulh.s16 q0, q0, q15                  \n"
+
+    // Align for table lookup; vtbl requires registers to
+    //  be adjacent.
+    "vmov.u8      d2, d4                       \n"
+
+    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+
+    MEMACCESS(1)
+    "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
+    "vst1.32      {d4[0]}, [%1]!               \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_ptr1)          // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
+  );
+}
+#endif  // HAS_SCALEROWDOWN38_NEON
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(4)
+    "vld1.16    {q13}, [%4]                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q14}, [%5]                    \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    "subs         %2, %2, #12                  \n"
+
+    // Shuffle the input data around to align it so adjacent
+    //  data can be added: 0,1 - 2,3 - 4,5 - 6,7
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8      d0, d1                       \n"
+    "vtrn.u8      d4, d5                       \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8      d2, d3                       \n"
+    "vtrn.u8      d6, d7                       \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d2 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8    q0, q0                       \n"
+    "vpaddl.u8    q2, q2                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8    d3, d3                       \n"
+    "vpaddl.u8    d7, d7                       \n"
+
+    // combine source lines
+    "vadd.u16     q0, q2                       \n"
+    "vadd.u16     d4, d3, d7                   \n"
+
+    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+    "vqrshrn.u16  d4, q2, #2                   \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8     q1, d2                       \n"
+    "vmovl.u8     q3, d6                       \n"
+
+    // combine source lines
+    "vadd.u16     q1, q3                       \n"
+
+    // d4 = xx 20 xx 30 xx 22 xx 32
+    // d5 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32     d2, d3                       \n"
+
+    // d4 = xx 20 xx 21 xx 22 xx 23
+    // d5 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16     d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16     q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "vqrdmulh.s16 q0, q0, q13                  \n"
+
+    // Align for table lookup; vtbl requires registers to
+    //  be adjacent.
+    "vmov.u8      d2, d4                       \n"
+
+    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+
+    MEMACCESS(1)
+    "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
+    "vst1.32      {d4[0]}, [%1]!               \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),       // %0
+    "+r"(dst_ptr),       // %1
+    "+r"(dst_width),     // %2
+    "+r"(src_stride)     // %3
+  : "r"(&kMult38_Div6),  // %4
+    "r"(&kShuf38_2)      // %5
+  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+  );
+}
+#endif  // HAS_SCALEROWDOWN38_NEON
+
+#if 0
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr, ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction) {
+  asm volatile (
+    "cmp          %4, #0                       \n"
+    "beq          100f                         \n"
+    "add          %2, %1                       \n"
+    "cmp          %4, #64                      \n"
+    "beq          75f                          \n"
+    "cmp          %4, #128                     \n"
+    "beq          50f                          \n"
+    "cmp          %4, #192                     \n"
+    "beq          25f                          \n"
+
+    "vdup.8       d5, %4                       \n"
+    "rsb          %4, #256                     \n"
+    "vdup.8       d4, %4                       \n"
+    // General purpose row blend.
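+    // dst = (row0 * (256 - f) + row1 * f + 128) >> 8, f = source_y_fraction.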
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vmull.u8     q13, d0, d4                  \n"
+    "vmull.u8     q14, d1, d4                  \n"
+    "vmlal.u8     q13, d2, d5                  \n"
+    "vmlal.u8     q14, d3, d5                  \n"
+    "vrshrn.u16   d0, q13, #8                  \n"
+    "vrshrn.u16   d1, q14, #8                  \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          1b                           \n"
+    "b            99f                          \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          25b                          \n"
+    "b            99f                          \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q1}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q0}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          100b                         \n"
+
+  "99:                                         \n"
+    MEMACCESS(0)
+    "vst1.8       {d1[7]}, [%0]                \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction) // %4
+  :
+  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+  );
+}
+#endif  // 0
+
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    MEMACCESS(0)
+    "vld2.32    {q0, q1}, [%0]!                \n"
+    MEMACCESS(0)
+    "vld2.32    {q2, q3}, [%0]!                \n"
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    MEMACCESS(1)
+    "vst1.8     {q3}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+#endif  // HAS_SCALEARGBROWDOWN2_NEON
+
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
+    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    "vrshrn.u16 d2, q2, #2                     \n"
+    "vrshrn.u16 d3, q3, #2                     \n"
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+  );
+}
+#endif  // HAS_SCALEARGBROWDOWN2_NEON
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %3, lsl #2                \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.32    {d0[0]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d0[1]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d1[0]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d1[1]}, [%0], r12             \n"
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"(src_stepx)     // %3
+  : "memory", "cc", "r12", "q0"
+  );
+}
+#endif  // HAS_SCALEARGBROWDOWNEVEN_NEON
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %4, lsl #2                \n"
+    "add        %1, %1, %0                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d4}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d5}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d6}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d7}, [%1], r12                \n"
+    "vaddl.u8   q0, d0, d1                     \n"
+    "vaddl.u8   q1, d2, d3                     \n"
+    "vaddl.u8   q2, d4, d5                     \n"
+    "vaddl.u8   q3, d6, d7                     \n"
+    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
+    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
+    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
+    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
+    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
+    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
+    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"(src_stepx)     // %4
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+  );
+}
+#endif  // HAS_SCALEARGBROWDOWNEVEN_NEON
+#endif  // __aarch64__
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
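For reference, the 2x2 box average that ScaleARGBRowDown2Box_NEON vectorizes above (vpaddl/vpadal accumulation followed by a rounded narrowing shift) reduces to the scalar form below. This is an illustrative sketch, not part of the patch; the function name is hypothetical and it assumes 4-byte ARGB pixels.

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the 2x2 ARGB box filter: each output pixel is the
 * rounded per-channel mean of a 2x2 input block. The "+ 2" applies
 * round-to-nearest before the >> 2, matching vrshrn.u16 #2 above. */
static void ScaleARGBRowDown2Box_C_sketch(const uint8_t* src,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst, int dst_width) {
  const uint8_t* row1 = src + src_stride;
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  /* B, G, R, A */
      dst[c] = (uint8_t)((src[c] + src[c + 4] +
                          row1[c] + row1[c + 4] + 2) >> 2);
    }
    src += 8;   /* advance two ARGB input pixels */
    row1 += 8;
    dst += 4;   /* one ARGB output pixel */
  }
}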
diff --git a/tools_common.c b/tools_common.c
index 7cfd066..2ec1711 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -83,7 +83,7 @@
   struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
   int plane = 0;
   int shortread = 0;
-  const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+  const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
 
   for (plane = 0; plane < 3; ++plane) {
     uint8_t *ptr;
@@ -241,7 +241,8 @@
   for (plane = 0; plane < 3; ++plane) {
     unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane);
+    const int w = vpx_img_plane_width(img, plane) *
+        ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
     const int h = vpx_img_plane_height(img, plane);
     int y;
 
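The widened write above boils down to a bytes-per-row computation. A minimal sketch, assuming VPX_IMG_FMT_HIGHBITDEPTH marks images whose samples occupy two bytes and that vpx_img_plane_width() is the same helper used above (the name plane_row_bytes is illustrative):

/* Bytes to write per row of a plane: the pixel count doubles when the
 * image stores 16-bit (high bit depth) samples. */
static int plane_row_bytes(const vpx_image_t *img, int plane) {
  const int bytes_per_sample =
      (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
  return vpx_img_plane_width(img, plane) * bytes_per_sample;
}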
diff --git a/tools_common.h b/tools_common.h
index 558413e..c1f466b 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -103,17 +103,25 @@
 extern "C" {
 #endif
 
+#if defined(__GNUC__)
+#define VPX_NO_RETURN __attribute__((noreturn))
+#else
+#define VPX_NO_RETURN
+#endif
+
 /* Sets a stdio stream into binary mode */
 FILE *set_binary_mode(FILE *stream);
 
-void die(const char *fmt, ...);
-void fatal(const char *fmt, ...);
+void die(const char *fmt, ...) VPX_NO_RETURN;
+void fatal(const char *fmt, ...) VPX_NO_RETURN;
 void warn(const char *fmt, ...);
 
-void die_codec(vpx_codec_ctx_t *ctx, const char *s);
+void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;
 
 /* The tool including this file must define usage_exit() */
-void usage_exit();
+void usage_exit() VPX_NO_RETURN;
+
+#undef VPX_NO_RETURN
 
 int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
 
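Annotating die(), fatal(), die_codec() and usage_exit() as noreturn lets the compiler prune impossible paths and silence spurious missing-return warnings at call sites. A hedged sketch of the effect (parse_positive is a hypothetical caller, not part of the tools):

#include <stdlib.h>
#include "tools_common.h"

/* With die() declared VPX_NO_RETURN, the compiler knows the error branch
 * never falls through, so the final return is the only live exit path and
 * no "control reaches end of non-void function" warning is issued. */
static int parse_positive(const char *arg) {
  const int v = atoi(arg);
  if (v <= 0)
    die("expected a positive integer, got '%s'\n", arg);  /* never returns */
  return v;
}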
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index f37ca63..5840c2b 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -25,22 +25,18 @@
 extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
 #endif
 
-#if HAVE_NEON_ASM || HAVE_NEON
+#if HAVE_NEON
 typedef void loopfilter_y_neon(unsigned char *src, int pitch,
         unsigned char blimit, unsigned char limit, unsigned char thresh);
 typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
         unsigned char blimit, unsigned char limit, unsigned char thresh,
         unsigned char *v);
-#endif
 
-#if HAVE_NEON_ASM
 extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
 extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
 extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
 extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
-#endif
 
-#if HAVE_NEON
 extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
 extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
 extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
@@ -150,9 +146,7 @@
     if (u_ptr)
         vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
 }
-#endif
 
-#if HAVE_NEON_ASM
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
diff --git a/vp8/common/arm/neon/bilinearpredict_neon.c b/vp8/common/arm/neon/bilinearpredict_neon.c
index d77f2ba..9824a31 100644
--- a/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -10,7 +10,7 @@
 
 #include <arm_neon.h>
 
-static const uint16_t bifilter4_coeff[8][2] = {
+static const uint8_t bifilter4_coeff[8][2] = {
     {128,   0},
     {112,  16},
     { 96,  32},
@@ -64,8 +64,8 @@
         q1u8 = vcombine_u8(d2u8, d3u8);
         q2u8 = vcombine_u8(d4u8, d5u8);
 
-        d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
 
         q4u64  = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
         q5u64  = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
@@ -155,8 +155,8 @@
         q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
         q5u8 = vld1q_u8(src_ptr);
 
-        d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
 
         q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
         q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
@@ -245,8 +245,8 @@
         q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
         q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
 
-        d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
 
         q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
         q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
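Each bifilter4_coeff pair sums to 128, so the two-tap filter above is a fixed-point weighted average renormalized by a rounded 7-bit shift. A scalar sketch of a single tap (illustrative; assumes <stdint.h> and the bifilter4_coeff table from this file are in scope):

/* One 2-tap bilinear sample: c0 = 128 - 16*xoffset, c1 = 16*xoffset,
 * so c0 + c1 == 128 and the sum is renormalized by a rounded >> 7. */
static uint8_t bilinear_tap(uint8_t a, uint8_t b, int xoffset) {
  const uint8_t c0 = bifilter4_coeff[xoffset][0];
  const uint8_t c1 = bifilter4_coeff[xoffset][1];
  return (uint8_t)((a * c0 + b * c1 + 64) >> 7);
}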
diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
deleted file mode 100644
index a8730aa..0000000
--- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ /dev/null
@@ -1,595 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
-    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char *y_buffer
-; r1    unsigned char *ypred_ptr
-; r2    int y_stride
-; r3    int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_neon_func| PROC
-    push            {r4-r8, lr}
-    vpush           {d8-d15}
-
-    cmp             r3, #0
-    beq             case_dc_pred
-    cmp             r3, #1
-    beq             case_v_pred
-    cmp             r3, #2
-    beq             case_h_pred
-    cmp             r3, #3
-    beq             case_tm_pred
-
-case_dc_pred
-    ldr             r4, [sp, #88]       ; Up
-    ldr             r5, [sp, #92]       ; Left
-
-    ; Default the DC average to 128
-    mov             r12, #128
-    vdup.u8         q0, r12
-
-    ; Zero out running sum
-    mov             r12, #0
-
-    ; compute shift and jump
-    adds            r7, r4, r5
-    beq             skip_dc_pred_up_left
-
-    ; Load above row, if it exists
-    cmp             r4, #0
-    beq             skip_dc_pred_up
-
-    sub             r6, r0, r2
-    vld1.8          {q1}, [r6]
-    vpaddl.u8       q2, q1
-    vpaddl.u16      q3, q2
-    vpaddl.u32      q4, q3
-
-    vmov.32         r4, d8[0]
-    vmov.32         r6, d9[0]
-
-    add             r12, r4, r6
-
-    ; Move back to integer registers
-
-skip_dc_pred_up
-
-    cmp             r5, #0
-    beq             skip_dc_pred_left
-
-    sub             r0, r0, #1
-
-    ; Load left row, if it exists
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0]
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-skip_dc_pred_left
-    add             r7, r7, #3          ; Shift
-    sub             r4, r7, #1
-    mov             r5, #1
-    add             r12, r12, r5, lsl r4
-    mov             r5, r12, lsr r7     ; expected_dc
-
-    vdup.u8         q0, r5
-
-skip_dc_pred_up_left
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-case_v_pred
-    ; Copy down above row
-    sub             r6, r0, r2
-    vld1.8          {q0}, [r6]
-
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-case_h_pred
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-case_tm_pred
-    ; Load yabove_row
-    sub             r3, r0, r2
-    vld1.8          {q8}, [r3]
-
-    ; Load ytop_left
-    sub             r3, r3, #1
-    ldrb            r7, [r3]
-
-    vdup.u16        q7, r7
-
-    ; Compute yabove_row - ytop_left
-    mov             r3, #1
-    vdup.u8         q0, r3
-
-    vmull.u8        q4, d16, d0
-    vmull.u8        q5, d17, d0
-
-    vsub.s16        q4, q4, q7
-    vsub.s16        q5, q5, q7
-
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-    mov             r12, #4
-
-case_tm_pred_loop
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u16        q0, r3
-    vdup.u16        q1, r4
-    vdup.u16        q2, r5
-    vdup.u16        q3, r6
-
-    vqadd.s16       q8, q0, q4
-    vqadd.s16       q9, q0, q5
-
-    vqadd.s16       q10, q1, q4
-    vqadd.s16       q11, q1, q5
-
-    vqadd.s16       q12, q2, q4
-    vqadd.s16       q13, q2, q5
-
-    vqadd.s16       q14, q3, q4
-    vqadd.s16       q15, q3, q5
-
-    vqshrun.s16     d0, q8, #0
-    vqshrun.s16     d1, q9, #0
-
-    vqshrun.s16     d2, q10, #0
-    vqshrun.s16     d3, q11, #0
-
-    vqshrun.s16     d4, q12, #0
-    vqshrun.s16     d5, q13, #0
-
-    vqshrun.s16     d6, q14, #0
-    vqshrun.s16     d7, q15, #0
-
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    subs            r12, r12, #1
-    bne             case_tm_pred_loop
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-    ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; r0    unsigned char *y_buffer
-; r1    unsigned char *ypred_ptr
-; r2    int y_stride
-; r3    int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_s_neon_func| PROC
-    push            {r4-r8, lr}
-    vpush           {d8-d15}
-
-    mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
-
-    cmp             r3, #0
-    beq             case_dc_pred_s
-    cmp             r3, #1
-    beq             case_v_pred_s
-    cmp             r3, #2
-    beq             case_h_pred_s
-    cmp             r3, #3
-    beq             case_tm_pred_s
-
-case_dc_pred_s
-    ldr             r4, [sp, #88]       ; Up
-    ldr             r5, [sp, #92]       ; Left
-
-    ; Default the DC average to 128
-    mov             r12, #128
-    vdup.u8         q0, r12
-
-    ; Zero out running sum
-    mov             r12, #0
-
-    ; compute shift and jump
-    adds            r7, r4, r5
-    beq             skip_dc_pred_up_left_s
-
-    ; Load above row, if it exists
-    cmp             r4, #0
-    beq             skip_dc_pred_up_s
-
-    sub             r6, r0, r2
-    vld1.8          {q1}, [r6]
-    vpaddl.u8       q2, q1
-    vpaddl.u16      q3, q2
-    vpaddl.u32      q4, q3
-
-    vmov.32         r4, d8[0]
-    vmov.32         r6, d9[0]
-
-    add             r12, r4, r6
-
-    ; Move back to integer registers
-
-skip_dc_pred_up_s
-
-    cmp             r5, #0
-    beq             skip_dc_pred_left_s
-
-    sub             r0, r0, #1
-
-    ; Load left row, if it exists
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0]
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-skip_dc_pred_left_s
-    add             r7, r7, #3          ; Shift
-    sub             r4, r7, #1
-    mov             r5, #1
-    add             r12, r12, r5, lsl r4
-    mov             r5, r12, lsr r7     ; expected_dc
-
-    vdup.u8         q0, r5
-
-skip_dc_pred_up_left_s
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-case_v_pred_s
-    ; Copy down above row
-    sub             r6, r0, r2
-    vld1.8          {q0}, [r6]
-
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-case_h_pred_s
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-case_tm_pred_s
-    ; Load yabove_row
-    sub             r3, r0, r2
-    vld1.8          {q8}, [r3]
-
-    ; Load ytop_left
-    sub             r3, r3, #1
-    ldrb            r7, [r3]
-
-    vdup.u16        q7, r7
-
-    ; Compute yabove_row - ytop_left
-    mov             r3, #1
-    vdup.u8         q0, r3
-
-    vmull.u8        q4, d16, d0
-    vmull.u8        q5, d17, d0
-
-    vsub.s16        q4, q4, q7
-    vsub.s16        q5, q5, q7
-
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-    mov             r12, #4
-
-case_tm_pred_loop_s
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u16        q0, r3
-    vdup.u16        q1, r4
-    vdup.u16        q2, r5
-    vdup.u16        q3, r6
-
-    vqadd.s16       q8, q0, q4
-    vqadd.s16       q9, q0, q5
-
-    vqadd.s16       q10, q1, q4
-    vqadd.s16       q11, q1, q5
-
-    vqadd.s16       q12, q2, q4
-    vqadd.s16       q13, q2, q5
-
-    vqadd.s16       q14, q3, q4
-    vqadd.s16       q15, q3, q5
-
-    vqshrun.s16     d0, q8, #0
-    vqshrun.s16     d1, q9, #0
-
-    vqshrun.s16     d2, q10, #0
-    vqshrun.s16     d3, q11, #0
-
-    vqshrun.s16     d4, q12, #0
-    vqshrun.s16     d5, q13, #0
-
-    vqshrun.s16     d6, q14, #0
-    vqshrun.s16     d7, q15, #0
-
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    subs            r12, r12, #1
-    bne             case_tm_pred_loop_s
-
-    vpop            {d8-d15}
-    pop             {r4-r8,pc}
-
-    ENDP
-
-
-    END
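The DC path in the deleted routine averages whichever borders are available and rounds to nearest, falling back to 128 when neither border exists. A scalar reconstruction of that arithmetic (dc_predict_16x16 is an illustrative name):

/* sum     = total of the available border samples (16 per available edge)
 * up/left = 0/1 availability flags, as passed on the stack above */
static int dc_predict_16x16(int sum, int up, int left) {
  int shift;
  if (!up && !left)
    return 128;                     /* no neighbors: default DC of 128 */
  shift = 3 + up + left;            /* divide by 16 or 32 samples */
  return (sum + (1 << (shift - 1))) >> shift;  /* round to nearest */
}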
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
deleted file mode 100644
index 3a39210..0000000
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ /dev/null
@@ -1,81 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_0_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_0_2x_neon(short *q, short dq,
-;                            unsigned char *dst, int stride);
-; r0   *q
-; r1   dq
-; r2   *dst
-; r3   stride
-|idct_dequant_0_2x_neon| PROC
-    push            {r4, r5}
-    vpush           {d8-d15}
-
-    add             r12, r2, #4
-    vld1.32         {d2[0]}, [r2], r3
-    vld1.32         {d8[0]}, [r12], r3
-    vld1.32         {d2[1]}, [r2], r3
-    vld1.32         {d8[1]}, [r12], r3
-    vld1.32         {d4[0]}, [r2], r3
-    vld1.32         {d10[0]}, [r12], r3
-    vld1.32         {d4[1]}, [r2], r3
-    vld1.32         {d10[1]}, [r12], r3
-
-    ldrh            r12, [r0]               ; lo q
-    ldrh            r4, [r0, #32]           ; hi q
-    mov             r5, #0
-    strh            r5, [r0]
-    strh            r5, [r0, #32]
-
-    sxth            r12, r12                ; lo
-    mul             r0, r12, r1
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q0, r0
-    sxth            r4, r4                  ; hi
-    mul             r0, r4, r1
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q3, r0
-
-    vaddw.u8        q1, q0, d2              ; lo
-    vaddw.u8        q2, q0, d4
-    vaddw.u8        q4, q3, d8              ; hi
-    vaddw.u8        q5, q3, d10
-
-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride
-    add             r0, r2, #4
-
-    vqmovun.s16     d2, q1                  ; lo
-    vqmovun.s16     d4, q2
-    vqmovun.s16     d8, q4                  ; hi
-    vqmovun.s16     d10, q5
-
-    vst1.32         {d2[0]}, [r2], r3       ; lo
-    vst1.32         {d8[0]}, [r0], r3       ; hi
-    vst1.32         {d2[1]}, [r2], r3
-    vst1.32         {d8[1]}, [r0], r3
-    vst1.32         {d4[0]}, [r2], r3
-    vst1.32         {d10[0]}, [r0], r3
-    vst1.32         {d4[1]}, [r2]
-    vst1.32         {d10[1]}, [r0]
-
-    vpop            {d8-d15}
-    pop             {r4, r5}
-    bx              lr
-
-    ENDP            ; |idct_dequant_0_2x_neon|
-    END
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
new file mode 100644
index 0000000..967c322
--- /dev/null
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void idct_dequant_0_2x_neon(
+        int16_t *q,
+        int16_t dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0;
+    int i, a0, a1;
+    int16x8x2_t q2Add;
+    // Initialize before the vld1_lane_s32() calls below; lane loads read
+    // the destination vector, so start from zero (as the full version does).
+    int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);
+    uint8x8_t d2u8, d4u8;
+    uint16x8_t q1u16, q2u16;
+
+    a0 = ((q[0] * dq) + 4) >> 3;
+    a1 = ((q[16] * dq) + 4) >> 3;
+    q[0] = q[16] = 0;
+    q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+    q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+    for (i = 0; i < 2; i++, dst += 4) {
+        dst0 = dst;
+        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+        dst0 += stride;
+        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+        dst0 += stride;
+        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+        dst0 += stride;
+        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                         vreinterpret_u8_s32(d2s32));
+        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                         vreinterpret_u8_s32(d4s32));
+
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+        d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+        d2s32 = vreinterpret_s32_u8(d2u8);
+        d4s32 = vreinterpret_s32_u8(d4u8);
+
+        dst0 = dst;
+        vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+    }
+    return;
+}
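With only the DC coefficient nonzero, the inverse transform collapses to adding one rounded constant per block. A scalar reference for a single 4x4 block (the NEON version above handles two adjacent blocks at once; the function name is illustrative):

#include <stdint.h>

static void idct_dequant_0_c_sketch(int16_t *q, int16_t dq,
                                    unsigned char *dst, int stride) {
  int r, c;
  const int a = ((q[0] * dq) + 4) >> 3;  /* dequantize, then rounded >> 3 */
  q[0] = 0;                              /* clear the coefficient, as above */
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = dst[c] + a;
      dst[c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);  /* saturate */
    }
    dst += stride;
  }
}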
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
deleted file mode 100644
index 8da0fa0..0000000
--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
+++ /dev/null
@@ -1,199 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_full_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_full_2x_neon(short *q, short *dq,
-;                               unsigned char *dst, int stride);
-; r0    *q,
-; r1    *dq,
-; r2    *dst
-; r3    stride
-|idct_dequant_full_2x_neon| PROC
-    vpush           {d8-d15}
-
-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
-    vld1.16         {q2, q3}, [r0]          ; l q
-    add             r0, r0, #32
-    vld1.16         {q4, q5}, [r0]          ; r q
-    add             r12, r2, #4
-
-    ; interleave the predictors
-    vld1.32         {d28[0]}, [r2],  r3     ; l pre
-    vld1.32         {d28[1]}, [r12], r3     ; r pre
-    vld1.32         {d29[0]}, [r2],  r3
-    vld1.32         {d29[1]}, [r12], r3
-    vld1.32         {d30[0]}, [r2],  r3
-    vld1.32         {d30[1]}, [r12], r3
-    vld1.32         {d31[0]}, [r2],  r3
-    vld1.32         {d31[1]}, [r12]
-
-    adr             r1, cospi8sqrt2minus1   ; pointer to the first constant
-
-    ; dequant: q[i] = q[i] * dq[i]
-    vmul.i16        q2, q2, q0
-    vmul.i16        q3, q3, q1
-    vmul.i16        q4, q4, q0
-    vmul.i16        q5, q5, q1
-
-    vld1.16         {d0}, [r1]
-
-    ; q2: l0r0  q3: l8r8
-    ; q4: l4r4  q5: l12r12
-    vswp            d5, d8
-    vswp            d7, d10
-
-    ; _CONSTANTS_ * 4,12 >> 16
-    ; q6:  4 * sinpi : c1/temp1
-    ; q7: 12 * sinpi : d1/temp2
-    ; q8:  4 * cospi
-    ; q9: 12 * cospi
-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q7, q5, d0[2]
-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
-    vqdmulh.s16     q9, q5, d0[0]
-
-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
-
-    ; vqdmulh only accepts signed values. this was a problem because
-    ; our constant had the high bit set, and was treated as a negative value.
-    ; vqdmulh also doubles the value before it shifts by 16. we need to
-    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
-    ; so we can shift the constant without losing precision. this avoids
-; shifting again afterward, but also avoids the sign issue. win win!
-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
-    ; pre-shift it
-    vshr.s16        q8, q8, #1
-    vshr.s16        q9, q9, #1
-
-    ; q4:  4 +  4 * cospi : d1/temp1
-    ; q5: 12 + 12 * cospi : c1/temp2
-    vqadd.s16       q4, q4, q8
-    vqadd.s16       q5, q5, q9
-
-    ; c1 = temp1 - temp2
-    ; d1 = temp1 + temp2
-    vqsub.s16       q2, q6, q5
-    vqadd.s16       q3, q4, q7
-
-    ; [0]: a1+d1
-    ; [1]: b1+c1
-    ; [2]: b1-c1
-    ; [3]: a1-d1
-    vqadd.s16       q4, q10, q3
-    vqadd.s16       q5, q11, q2
-    vqsub.s16       q6, q11, q2
-    vqsub.s16       q7, q10, q3
-
-    ; rotate
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-    ; idct loop 2
-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
-    ; q6: l 2, 6,10,14 r 2, 6,10,14
-    ; q7: l 3, 7,11,15 r 3, 7,11,15
-
-    ; q8:  1 * sinpi : c1/temp1
-    ; q9:  3 * sinpi : d1/temp2
-    ; q10: 1 * cospi
-    ; q11: 3 * cospi
-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q9, q7, d0[2]
-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
-    vqdmulh.s16     q11, q7, d0[0]
-
-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
-
-    ; see note on shifting above
-    vshr.s16        q10, q10, #1
-    vshr.s16        q11, q11, #1
-
-    ; q10: 1 + 1 * cospi : d1/temp1
-    ; q11: 3 + 3 * cospi : c1/temp2
-    vqadd.s16       q10, q5, q10
-    vqadd.s16       q11, q7, q11
-
-    ; q8: c1 = temp1 - temp2
-    ; q9: d1 = temp1 + temp2
-    vqsub.s16       q8, q8, q11
-    vqadd.s16       q9, q10, q9
-
-    ; a1+d1
-    ; b1+c1
-    ; b1-c1
-    ; a1-d1
-    vqadd.s16       q4, q2, q9
-    vqadd.s16       q5, q3, q8
-    vqsub.s16       q6, q3, q8
-    vqsub.s16       q7, q2, q9
-
-    ; +4 >> 3 (rounding)
-    vrshr.s16       q4, q4, #3              ; lo
-    vrshr.s16       q5, q5, #3
-    vrshr.s16       q6, q6, #3              ; hi
-    vrshr.s16       q7, q7, #3
-
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-
-    ; adding pre
-    ; input is still packed. pre was read interleaved
-    vaddw.u8        q4, q4, d28
-    vaddw.u8        q5, q5, d29
-    vaddw.u8        q6, q6, d30
-    vaddw.u8        q7, q7, d31
-
-    vmov.i16        q14, #0
-    vmov            q15, q14
-    vst1.16         {q14, q15}, [r0]        ; write over high input
-    sub             r0, r0, #32
-    vst1.16         {q14, q15}, [r0]        ; write over low input
-
-    sub             r2, r2, r3, lsl #2      ; dst - 4*stride
-    add             r1, r2, #4              ; hi
-
-    ;saturate and narrow
-    vqmovun.s16     d0, q4                  ; lo
-    vqmovun.s16     d1, q5
-    vqmovun.s16     d2, q6                  ; hi
-    vqmovun.s16     d3, q7
-
-    vst1.32         {d0[0]}, [r2], r3       ; lo
-    vst1.32         {d0[1]}, [r1], r3       ; hi
-    vst1.32         {d1[0]}, [r2], r3
-    vst1.32         {d1[1]}, [r1], r3
-    vst1.32         {d2[0]}, [r2], r3
-    vst1.32         {d2[1]}, [r1], r3
-    vst1.32         {d3[0]}, [r2]
-    vst1.32         {d3[1]}, [r1]
-
-    vpop            {d8-d15}
-    bx             lr
-
-    ENDP           ; |idct_dequant_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2       DCD 0x4546
-
-    END
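The vqdmulh commentary in the deleted file is worth a worked example: vqdmulh(x, k) computes (2*x*k) >> 16 with saturation, which is why one constant can be stored pre-halved while the other must be corrected afterward. A sketch (vqdmulh_ref is an illustrative scalar model that ignores saturation):

#include <stdint.h>

/* Scalar model of vqdmulh.s16: doubling multiply, keep the high half. */
static int16_t vqdmulh_ref(int16_t x, int16_t k) {
  return (int16_t)(((int32_t)x * k * 2) >> 16);
}

/* sinpi8sqrt2's true value 0x8a8c (35468) has its low bit clear, so it is
 * stored pre-halved as 0x4546 (17734); the doubling inside vqdmulh
 * restores it exactly:
 *     vqdmulh_ref(x, 17734)       == (x * 35468) >> 16
 * cospi8sqrt2minus1 (0x4e7b = 20091) has its low bit set, so pre-halving
 * would lose precision; the result is halved afterward instead, matching
 * the "vshr.s16 ..., #1" that follows those multiplies:
 *     vqdmulh_ref(x, 20091) >> 1  ~= (x * 20091) >> 16   (up to truncation)
 */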
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
new file mode 100644
index 0000000..a60ed46
--- /dev/null
+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
@@ -0,0 +1,185 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+// sinpi8sqrt2's true value is 0x8a8c; because its lowest bit is 0, it is
+// stored pre-shifted (halved) to keep it positive as a signed 16-bit value.
+static const int16_t sinpi8sqrt2       = 17734;
+
+void idct_dequant_full_2x_neon(
+        int16_t *q,
+        int16_t *dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0, *dst1;
+    int32x2_t d28, d29, d30, d31;
+    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x4x2_t q2tmp0, q2tmp1;
+    int16x8x2_t q2tmp2, q2tmp3;
+    int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
+    d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+    // load dq
+    q0 = vld1q_s16(dq);
+    dq += 8;
+    q1 = vld1q_s16(dq);
+
+    // load q
+    q2 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q3 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q4 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q5 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+
+    // load src from dst
+    dst0 = dst;
+    dst1 = dst + 4;
+    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+    q2 = vmulq_s16(q2, q0);
+    q3 = vmulq_s16(q3, q1);
+    q4 = vmulq_s16(q4, q0);
+    q5 = vmulq_s16(q5, q1);
+
+    // vswp
+    dLow0 = vget_low_s16(q2);
+    dHigh0 = vget_high_s16(q2);
+    dLow1 = vget_low_s16(q4);
+    dHigh1 = vget_high_s16(q4);
+    q2 = vcombine_s16(dLow0, dLow1);
+    q4 = vcombine_s16(dHigh0, dHigh1);
+
+    dLow0 = vget_low_s16(q3);
+    dHigh0 = vget_high_s16(q3);
+    dLow1 = vget_low_s16(q5);
+    dHigh1 = vget_high_s16(q5);
+    q3 = vcombine_s16(dLow0, dLow1);
+    q5 = vcombine_s16(dHigh0, dHigh1);
+
+    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+    q10 = vqaddq_s16(q2, q3);
+    q11 = vqsubq_s16(q2, q3);
+
+    q8 = vshrq_n_s16(q8, 1);
+    q9 = vshrq_n_s16(q9, 1);
+
+    q4 = vqaddq_s16(q4, q8);
+    q5 = vqaddq_s16(q5, q9);
+
+    q2 = vqsubq_s16(q6, q5);
+    q3 = vqaddq_s16(q7, q4);
+
+    q4 = vqaddq_s16(q10, q3);
+    q5 = vqaddq_s16(q11, q2);
+    q6 = vqsubq_s16(q11, q2);
+    q7 = vqsubq_s16(q10, q3);
+
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+    // loop 2
+    q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+    q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+    q10 = vshrq_n_s16(q10, 1);
+    q11 = vshrq_n_s16(q11, 1);
+
+    q10 = vqaddq_s16(q2tmp2.val[1], q10);
+    q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+    q8 = vqsubq_s16(q8, q11);
+    q9 = vqaddq_s16(q9, q10);
+
+    q4 = vqaddq_s16(q2, q9);
+    q5 = vqaddq_s16(q3, q8);
+    q6 = vqsubq_s16(q3, q8);
+    q7 = vqsubq_s16(q2, q9);
+
+    q4 = vrshrq_n_s16(q4, 3);
+    q5 = vrshrq_n_s16(q5, 3);
+    q6 = vrshrq_n_s16(q6, 3);
+    q7 = vrshrq_n_s16(q7, 3);
+
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+    q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
+                                          vreinterpret_u8_s32(d28)));
+    q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
+                                          vreinterpret_u8_s32(d29)));
+    q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
+                                          vreinterpret_u8_s32(d30)));
+    q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
+                                          vreinterpret_u8_s32(d31)));
+
+    d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+    d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+    d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+    d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+    dst0 = dst;
+    dst1 = dst + 4;
+    vst1_lane_s32((int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    vst1_lane_s32((int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    vst1_lane_s32((int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    vst1_lane_s32((int32_t *)dst0, d31, 0);
+    vst1_lane_s32((int32_t *)dst1, d31, 1);
+    return;
+}
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
deleted file mode 100644
index c4f09c7..0000000
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ /dev/null
@@ -1,409 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_loop_filter_horizontal_edge_y_neon|
-    EXPORT  |vp8_loop_filter_horizontal_edge_uv_neon|
-    EXPORT  |vp8_loop_filter_vertical_edge_y_neon|
-    EXPORT  |vp8_loop_filter_vertical_edge_uv_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src
-; r1    int pitch
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp8_loop_filter_horizontal_edge_y_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    vdup.u8     q0, r2                     ; duplicate blimit
-    vdup.u8     q1, r3                     ; duplicate limit
-    sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
-    ldr         r3, [sp, #68]              ; load thresh
-    add         r12, r2, r1
-    add         r1, r1, r1
-
-    vdup.u8     q2, r3                     ; duplicate thresh
-
-    vld1.u8     {q3}, [r2@128], r1              ; p3
-    vld1.u8     {q4}, [r12@128], r1             ; p2
-    vld1.u8     {q5}, [r2@128], r1              ; p1
-    vld1.u8     {q6}, [r12@128], r1             ; p0
-    vld1.u8     {q7}, [r2@128], r1              ; q0
-    vld1.u8     {q8}, [r12@128], r1             ; q1
-    vld1.u8     {q9}, [r2@128]                  ; q2
-    vld1.u8     {q10}, [r12@128]                ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r12, r12, r1, lsl #1
-
-    bl          vp8_loop_filter_neon
-
-    vst1.u8     {q5}, [r2@128], r1              ; store op1
-    vst1.u8     {q6}, [r12@128], r1             ; store op0
-    vst1.u8     {q7}, [r2@128], r1              ; store oq0
-    vst1.u8     {q8}, [r12@128], r1             ; store oq1
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
-
-
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-|vp8_loop_filter_horizontal_edge_uv_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    vdup.u8     q0, r2                      ; duplicate blimit
-    vdup.u8     q1, r3                      ; duplicate limit
-    ldr         r12, [sp, #68]              ; load thresh
-    ldr         r2, [sp, #72]               ; load v ptr
-    vdup.u8     q2, r12                     ; duplicate thresh
-
-    sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
-    sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
-
-    vld1.u8     {d6}, [r3@64], r1              ; p3
-    vld1.u8     {d7}, [r12@64], r1             ; p3
-    vld1.u8     {d8}, [r3@64], r1              ; p2
-    vld1.u8     {d9}, [r12@64], r1             ; p2
-    vld1.u8     {d10}, [r3@64], r1             ; p1
-    vld1.u8     {d11}, [r12@64], r1            ; p1
-    vld1.u8     {d12}, [r3@64], r1             ; p0
-    vld1.u8     {d13}, [r12@64], r1            ; p0
-    vld1.u8     {d14}, [r3@64], r1             ; q0
-    vld1.u8     {d15}, [r12@64], r1            ; q0
-    vld1.u8     {d16}, [r3@64], r1             ; q1
-    vld1.u8     {d17}, [r12@64], r1            ; q1
-    vld1.u8     {d18}, [r3@64], r1             ; q2
-    vld1.u8     {d19}, [r12@64], r1            ; q2
-    vld1.u8     {d20}, [r3@64]                 ; q3
-    vld1.u8     {d21}, [r12@64]                ; q3
-
-    bl          vp8_loop_filter_neon
-
-    sub         r0, r0, r1, lsl #1
-    sub         r2, r2, r1, lsl #1
-
-    vst1.u8     {d10}, [r0@64], r1             ; store u op1
-    vst1.u8     {d11}, [r2@64], r1             ; store v op1
-    vst1.u8     {d12}, [r0@64], r1             ; store u op0
-    vst1.u8     {d13}, [r2@64], r1             ; store v op0
-    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
-    vst1.u8     {d15}, [r2@64], r1             ; store v oq0
-    vst1.u8     {d16}, [r0@64]                 ; store u oq1
-    vst1.u8     {d17}, [r2@64]                 ; store v oq1
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_loop_filter_horizontal_edge_uv_neon|
-
-; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                           const signed char *flimit,
-;                                           const signed char *limit,
-;                                           const signed char *thresh,
-;                                           int count)
-; r0    unsigned char *src
-; r1    int pitch
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-
-|vp8_loop_filter_vertical_edge_y_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    vdup.u8     q0, r2                     ; duplicate blimit
-    vdup.u8     q1, r3                     ; duplicate limit
-    sub         r2, r0, #4                 ; src ptr down by 4 columns
-    add         r1, r1, r1
-    ldr         r3, [sp, #68]              ; load thresh
-    add         r12, r2, r1, asr #1
-
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d10}, [r2], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d14}, [r2], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d18}, [r2], r1
-    vld1.u8     {d20}, [r12], r1
-
-    vld1.u8     {d7}, [r2], r1              ; load second 8-line src data
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d11}, [r2], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d15}, [r2], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d19}, [r2]
-    vld1.u8     {d21}, [r12]
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vdup.u8     q2, r3                     ; duplicate thresh
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    bl          vp8_loop_filter_neon
-
-    vswp        d12, d11
-    vswp        d16, d13
-
-    sub         r0, r0, #2                 ; dst ptr
-
-    vswp        d14, d12
-    vswp        d16, d15
-
-    add         r12, r0, r1, asr #1
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
-    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
-    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
-    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
-
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
-
-; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
-;                                            const signed char *flimit,
-;                                            const signed char *limit,
-;                                            const signed char *thresh,
-;                                            unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-|vp8_loop_filter_vertical_edge_uv_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    vdup.u8     q0, r2                      ; duplicate blimit
-    sub         r12, r0, #4                 ; move u pointer down by 4 columns
-    ldr         r2, [sp, #72]               ; load v ptr
-    vdup.u8     q1, r3                      ; duplicate limit
-    sub         r3, r2, #4                  ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r12], r1             ;load u data
-    vld1.u8     {d7}, [r3], r1              ;load v data
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d9}, [r3], r1
-    vld1.u8     {d10}, [r12], r1
-    vld1.u8     {d11}, [r3], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d13}, [r3], r1
-    vld1.u8     {d14}, [r12], r1
-    vld1.u8     {d15}, [r3], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d17}, [r3], r1
-    vld1.u8     {d18}, [r12], r1
-    vld1.u8     {d19}, [r3], r1
-    vld1.u8     {d20}, [r12]
-    vld1.u8     {d21}, [r3]
-
-    ldr        r12, [sp, #68]              ; load thresh
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vdup.u8     q2, r12                     ; duplicate thresh
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    bl          vp8_loop_filter_neon
-
-    vswp        d12, d11
-    vswp        d16, d13
-    vswp        d14, d12
-    vswp        d16, d15
-
-    sub         r0, r0, #2
-    sub         r2, r2, #2
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
-    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
-    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
-    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
-
-; void vp8_loop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store.
-
-; r0-r3 PRESERVE
-; q0    flimit
-; q1    limit
-; q2    thresh
-; q3    p3
-; q4    p2
-; q5    p1
-; q6    p0
-; q7    q0
-; q8    q1
-; q9    q2
-; q10   q3
-|vp8_loop_filter_neon| PROC
-
-    ; vp8_filter_mask
-    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
-    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
-    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
-    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
-    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
-
-    vmax.u8     q11, q11, q12
-    vmax.u8     q12, q13, q14
-    vmax.u8     q3, q3, q4
-    vmax.u8     q15, q11, q12
-
-    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
-
-    ; vp8_hevmask
-    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     q15, q15, q3
-
-    vmov.u8     q10, #0x80                   ; 0x80
-
-    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
-    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
-
-    vcge.u8     q15, q1, q15
-
-    ; vp8_filter() function
-    ; convert to signed
-    veor        q7, q7, q10                 ; qs0
-    vshr.u8     q2, q2, #1                  ; a = a / 2
-    veor        q6, q6, q10                 ; ps0
-
-    veor        q5, q5, q10                 ; ps1
-    vqadd.u8    q9, q9, q2                  ; a = b + a
-
-    veor        q8, q8, q10                 ; qs1
-
-    vmov.u8     q10, #3                     ; #3
-
-    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
-    vsubl.s8    q11, d15, d13
-
-    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
-
-    vmovl.u8    q4, d20
-
-    vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
-    vorr        q14, q13, q14               ; vp8_hevmask
-
-    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
-    vmul.i16    q11, q11, q4
-
-    vand        q1, q1, q14                 ; vp8_filter &= hev
-    vand        q15, q15, q9                ; vp8_filter_mask
-
-    vaddw.s8    q2, q2, d2
-    vaddw.s8    q11, q11, d3
-
-    vmov.u8     q9, #4                      ; #4
-
-    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d2, q2
-    vqmovn.s16  d3, q11
-    vand        q1, q1, q15                 ; vp8_filter &= mask
-
-    vqadd.s8    q2, q1, q10                 ; Filter2 = clamp(vp8_filter+3)
-    vqadd.s8    q1, q1, q9                  ; Filter1 = clamp(vp8_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
-
-
-    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
-    vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)
-
-    ; outer tap adjustments: ++vp8_filter >> 1
-    vrshr.s8    q1, q1, #1
-    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
-    vmov.u8     q0, #0x80                   ; 0x80
-    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp8_filter)
-    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp8_filter)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-    veor        q5, q13, q0                 ; *op1 = u^0x80
-    veor        q8, q12, q0                 ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |vp8_loop_filter_neon|
-
-;-----------------
-
-    END
diff --git a/vp8/common/arm/neon/loopfilter_neon.c b/vp8/common/arm/neon/loopfilter_neon.c
new file mode 100644
index 0000000..0bec7fb
--- /dev/null
+++ b/vp8/common/arm/neon/loopfilter_neon.c
@@ -0,0 +1,549 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
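+// C port of the ARM assembly VP8 loop filter. The helper below computes the
+// filter and high-edge-variance masks and adjusts p1, p0, q0 and q1 across
+// the edge; the callers handle the loads, stores and any transposes.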
+static INLINE void vp8_loop_filter_neon(
+        uint8x16_t qblimit,  // flimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r) {   // q1
+    uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q2s16, q11s16;
+    uint16x8_t q4u16;
+    int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+    int8x8_t d2s8, d3s8;
+
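+    // vp8_filter_mask: take the largest difference between neighboring
+    // pixels on each side of the edge; the filter only runs where this
+    // stays within limit (and the edge itself within blimit, below).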
+    q11u8 = vabdq_u8(q3, q4);
+    q12u8 = vabdq_u8(q4, q5);
+    q13u8 = vabdq_u8(q5, q6);
+    q14u8 = vabdq_u8(q8, q7);
+    q3    = vabdq_u8(q9, q8);
+    q4    = vabdq_u8(q10, q9);
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q3    = vmaxq_u8(q3, q4);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q9 = vabdq_u8(q6, q7);
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);
+    q14u8 = vcgtq_u8(q14u8, qthresh);
+    q15u8 = vmaxq_u8(q15u8, q3);
+
+    q2u8 = vabdq_u8(q5, q8);
+    q9 = vqaddq_u8(q9, q9);
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);
+
+    // vp8_filter() function
+    // convert to signed
+    q10 = vdupq_n_u8(0x80);
+    q8 = veorq_u8(q8, q10);
+    q7 = veorq_u8(q7, q10);
+    q6 = veorq_u8(q6, q10);
+    q5 = veorq_u8(q5, q10);
+
+    q2u8 = vshrq_n_u8(q2u8, 1);
+    q9 = vqaddq_u8(q9, q2u8);
+
+    q10 = vdupq_n_u8(3);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
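+    // q9 = abs(p0 - q0) * 2 + abs(p1 - q1) / 2; keep only the positions
+    // where it does not exceed the block limit.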
+    q9 = vcgeq_u8(qblimit, q9);
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                    vreinterpretq_s8_u8(q8));
+
+    q14u8 = vorrq_u8(q13u8, q14u8);
+
+    q4u16 = vmovl_u8(vget_low_u8(q10));
+    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+    q15u8 = vandq_u8(q15u8, q9);
+
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+    q9 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2s8 = vqmovn_s16(q2s16);
+    d3s8 = vqmovn_s16(q11s16);
+    q1s8 = vcombine_s8(d2s8, d3s8);
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+
+    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
+    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q1s8 = vshrq_n_s8(q1s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
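+    // outer tap adjustment: vp8_filter = (Filter1 + 1) >> 1, cleared where hev is set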
+    q1s8 = vrshrq_n_s8(q1s8, 1);
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+    q0u8 = vdupq_n_u8(0x80);
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
+    return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+    src -= (pitch << 2);
+
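+    // load the eight rows p3..p0 and q0..q3 that straddle the horizontal edge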
+    q3 = vld1q_u8(src);
+    src += pitch;
+    q4 = vld1q_u8(src);
+    src += pitch;
+    q5 = vld1q_u8(src);
+    src += pitch;
+    q6 = vld1q_u8(src);
+    src += pitch;
+    q7 = vld1q_u8(src);
+    src += pitch;
+    q8 = vld1q_u8(src);
+    src += pitch;
+    q9 = vld1q_u8(src);
+    src += pitch;
+    q10 = vld1q_u8(src);
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
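+    // only p1, p0, q0 and q1 are modified; rewind and store those four rows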
+    src -= (pitch * 5);
+    vst1q_u8(src, q5);
+    src += pitch;
+    vst1q_u8(src, q6);
+    src += pitch;
+    vst1q_u8(src, q7);
+    src += pitch;
+    vst1q_u8(src, q8);
+    return;
+}
+
+void vp8_loop_filter_horizontal_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    u -= (pitch << 2);
+    v -= (pitch << 2);
+
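+    // interleave the planes: u rows fill the low halves and v rows the high
+    // halves of each q register, so one 16-lane pass filters both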
+    d6  = vld1_u8(u);
+    u += pitch;
+    d7  = vld1_u8(v);
+    v += pitch;
+    d8  = vld1_u8(u);
+    u += pitch;
+    d9  = vld1_u8(v);
+    v += pitch;
+    d10 = vld1_u8(u);
+    u += pitch;
+    d11 = vld1_u8(v);
+    v += pitch;
+    d12 = vld1_u8(u);
+    u += pitch;
+    d13 = vld1_u8(v);
+    v += pitch;
+    d14 = vld1_u8(u);
+    u += pitch;
+    d15 = vld1_u8(v);
+    v += pitch;
+    d16 = vld1_u8(u);
+    u += pitch;
+    d17 = vld1_u8(v);
+    v += pitch;
+    d18 = vld1_u8(u);
+    u += pitch;
+    d19 = vld1_u8(v);
+    v += pitch;
+    d20 = vld1_u8(u);
+    d21 = vld1_u8(v);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    u -= (pitch * 5);
+    vst1_u8(u, vget_low_u8(q5));
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q6));
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q7));
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q8));
+
+    v -= (pitch * 5);
+    vst1_u8(v, vget_high_u8(q5));
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q6));
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q7));
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q8));
+    return;
+}
+
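+// Store a 4-wide, 8-tall block of output pixels. On GCC 4.7+ the
+// vst4_lane_u8 intrinsic is used directly; older toolchains take the manual
+// transpose below (the version guard presumably works around weaker
+// lane-intrinsic support in earlier compilers).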
+static INLINE void write_4x8(unsigned char *dst, int pitch,
+                             const uint8x8x4_t result) {
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+    vst4_lane_u8(dst, result, 0);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 1);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 2);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 3);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 4);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 5);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 6);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 7);
+#else
+    /*
+     * uint8x8x4_t result
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    20 21 22 23 | 24 25 26 27
+    30 31 32 33 | 34 35 36 37
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 04 05 24 25
+    02 03 22 23 | 06 07 26 27
+    10 11 30 31 | 14 15 34 35
+    12 13 32 33 | 16 17 36 37
+    ---
+    * after vtrn_u8
+    00 10 20 30 | 04 14 24 34
+    01 11 21 31 | 05 15 25 35
+    02 12 22 32 | 06 16 26 36
+    03 13 23 33 | 07 17 27 37
+    */
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+                                          vreinterpret_u16_u8(result.val[2]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+                                          vreinterpret_u16_u8(result.val[3]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
+    const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
+    const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
+    const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#endif
+}
+
+void vp8_loop_filter_vertical_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    unsigned char *s, *d;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+    uint8x8x4_t q4ResultH, q4ResultL;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
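+    // read 16 rows of 8 pixels starting 4 columns left of the edge; the
+    // first 8 rows land in the low halves and the last 8 in the high halves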
+    s = src - 4;
+    d6  = vld1_u8(s);
+    s += pitch;
+    d8  = vld1_u8(s);
+    s += pitch;
+    d10 = vld1_u8(s);
+    s += pitch;
+    d12 = vld1_u8(s);
+    s += pitch;
+    d14 = vld1_u8(s);
+    s += pitch;
+    d16 = vld1_u8(s);
+    s += pitch;
+    d18 = vld1_u8(s);
+    s += pitch;
+    d20 = vld1_u8(s);
+    s += pitch;
+    d7  = vld1_u8(s);
+    s += pitch;
+    d9  = vld1_u8(s);
+    s += pitch;
+    d11 = vld1_u8(s);
+    s += pitch;
+    d13 = vld1_u8(s);
+    s += pitch;
+    d15 = vld1_u8(s);
+    s += pitch;
+    d17 = vld1_u8(s);
+    s += pitch;
+    d19 = vld1_u8(s);
+    s += pitch;
+    d21 = vld1_u8(s);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
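+    // transpose the 16x16 block: vtrn at 32-, 16- and 8-bit granularity
+    // turns the loaded rows into the p3..q3 columns the filter expects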
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    q4ResultL.val[0] = vget_low_u8(q5);   // d10
+    q4ResultL.val[1] = vget_low_u8(q6);   // d12
+    q4ResultL.val[2] = vget_low_u8(q7);   // d14
+    q4ResultL.val[3] = vget_low_u8(q8);   // d16
+    q4ResultH.val[0] = vget_high_u8(q5);  // d11
+    q4ResultH.val[1] = vget_high_u8(q6);  // d13
+    q4ResultH.val[2] = vget_high_u8(q7);  // d15
+    q4ResultH.val[3] = vget_high_u8(q8);  // d17
+
+    d = src - 2;
+    write_4x8(d, pitch, q4ResultL);
+    d += pitch * 8;
+    write_4x8(d, pitch, q4ResultH);
+}
+
+void vp8_loop_filter_vertical_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    unsigned char *us, *ud;
+    unsigned char *vs, *vd;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+    uint8x8x4_t q4ResultH, q4ResultL;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    us = u - 4;
+    d6 = vld1_u8(us);
+    us += pitch;
+    d8 = vld1_u8(us);
+    us += pitch;
+    d10 = vld1_u8(us);
+    us += pitch;
+    d12 = vld1_u8(us);
+    us += pitch;
+    d14 = vld1_u8(us);
+    us += pitch;
+    d16 = vld1_u8(us);
+    us += pitch;
+    d18 = vld1_u8(us);
+    us += pitch;
+    d20 = vld1_u8(us);
+
+    vs = v - 4;
+    d7 = vld1_u8(vs);
+    vs += pitch;
+    d9 = vld1_u8(vs);
+    vs += pitch;
+    d11 = vld1_u8(vs);
+    vs += pitch;
+    d13 = vld1_u8(vs);
+    vs += pitch;
+    d15 = vld1_u8(vs);
+    vs += pitch;
+    d17 = vld1_u8(vs);
+    vs += pitch;
+    d19 = vld1_u8(vs);
+    vs += pitch;
+    d21 = vld1_u8(vs);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
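+    // same three-stage vtrn transpose as in the luma vertical filter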
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    q4ResultL.val[0] = vget_low_u8(q5);   // d10
+    q4ResultL.val[1] = vget_low_u8(q6);   // d12
+    q4ResultL.val[2] = vget_low_u8(q7);   // d14
+    q4ResultL.val[3] = vget_low_u8(q8);   // d16
+    ud = u - 2;
+    write_4x8(ud, pitch, q4ResultL);
+
+    q4ResultH.val[0] = vget_high_u8(q5);  // d11
+    q4ResultH.val[1] = vget_high_u8(q6);  // d13
+    q4ResultH.val[2] = vget_high_u8(q7);  // d15
+    q4ResultH.val[3] = vget_high_u8(q8);  // d17
+    vd = v - 2;
+    write_4x8(vd, pitch, q4ResultH);
+}
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
deleted file mode 100644
index 78d13c8..0000000
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ /dev/null
@@ -1,156 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_loop_filter_bvs_neon|
-    EXPORT |vp8_loop_filter_mbvs_neon|
-    ARM
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *s, PRESERVE
-; r1    int p, PRESERVE
-; q1    limit, PRESERVE
-
-|vp8_loop_filter_simple_vertical_edge_neon| PROC
-    vpush       {d8-d15}
-
-    sub         r0, r0, #2                  ; move src pointer down by 2 columns
-    add         r12, r1, r1
-    add         r3, r0, r1
-
-    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
-    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
-    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
-    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
-    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
-    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
-    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
-    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
-
-    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
-    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
-    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
-    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
-    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
-    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
-    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
-    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r3]
-
-    vswp        d7, d10
-    vswp        d12, d9
-
-    ;vp8_filter_mask() function
-    ;vp8_hevmask() function
-    sub         r0, r0, r1, lsl #4
-    vabd.u8     q15, q5, q4                 ; abs(p0 - q0)
-    vabd.u8     q14, q3, q6                 ; abs(p1 - q1)
-
-    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
-    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
-    vmov.u8     q0, #0x80                   ; 0x80
-    vmov.s16    q11, #3
-    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
-    veor        q4, q4, q0                  ; qs0: q0 offset to convert to a signed value
-    veor        q5, q5, q0                  ; ps0: p0 offset to convert to a signed value
-    veor        q3, q3, q0                  ; ps1: p1 offset to convert to a signed value
-    veor        q6, q6, q0                  ; qs1: q1 offset to convert to a signed value
-
-    vcge.u8     q15, q1, q15                ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
-
-    vsubl.s8    q2, d8, d10                 ; ( qs0 - ps0)
-    vsubl.s8    q13, d9, d11
-
-    vqsub.s8    q14, q3, q6                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
-
-    vmul.s16    q2, q2, q11                 ;  3 * ( qs0 - ps0)
-    vmul.s16    q13, q13, q11
-
-    vmov.u8     q11, #0x03                  ; 0x03
-    vmov.u8     q12, #0x04                  ; 0x04
-
-    vaddw.s8    q2, q2, d28                  ; vp8_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d29
-
-    vqmovn.s16  d28, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d29, q13
-
-    add         r0, r0, #1
-    add         r3, r0, r1
-
-    vand        q14, q14, q15                 ; vp8_filter &= mask
-
-    vqadd.s8    q2, q14, q11                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
-    vqadd.s8    q3, q14, q12                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q14, q3, #3                  ; Filter1 >>= 3
-
-    ;calculate output
-    vqadd.s8    q11, q5, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
-    vqsub.s8    q10, q4, q14                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-    add         r12, r1, r1
-    vswp        d13, d14
-
-    ;store op1, op0, oq0, oq1
-    vst2.8      {d12[0], d13[0]}, [r0], r12
-    vst2.8      {d12[1], d13[1]}, [r3], r12
-    vst2.8      {d12[2], d13[2]}, [r0], r12
-    vst2.8      {d12[3], d13[3]}, [r3], r12
-    vst2.8      {d12[4], d13[4]}, [r0], r12
-    vst2.8      {d12[5], d13[5]}, [r3], r12
-    vst2.8      {d12[6], d13[6]}, [r0], r12
-    vst2.8      {d12[7], d13[7]}, [r3], r12
-    vst2.8      {d14[0], d15[0]}, [r0], r12
-    vst2.8      {d14[1], d15[1]}, [r3], r12
-    vst2.8      {d14[2], d15[2]}, [r0], r12
-    vst2.8      {d14[3], d15[3]}, [r3], r12
-    vst2.8      {d14[4], d15[4]}, [r0], r12
-    vst2.8      {d14[5], d15[5]}, [r3], r12
-    vst2.8      {d14[6], d15[6]}, [r0], r12
-    vst2.8      {d14[7], d15[7]}, [r3]
-
-    vpop        {d8-d15}
-    bx          lr
-    ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp8_loop_filter_bvs_neon| PROC
-    push        {r4, lr}
-    ldrb        r3, [r2]                   ; load blim from mem
-    mov         r4, r0
-    add         r0, r0, #4
-    vdup.s8     q1, r3                     ; duplicate blim
-    bl          vp8_loop_filter_simple_vertical_edge_neon
-    ; vp8_loop_filter_simple_vertical_edge_neon preserves  r1 and q1
-    add         r0, r4, #8
-    bl          vp8_loop_filter_simple_vertical_edge_neon
-    add         r0, r4, #12
-    pop         {r4, lr}
-    b           vp8_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp8_loop_filter_bvs_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp8_loop_filter_mbvs_neon| PROC
-    ldrb        r3, [r2]                   ; load mblim from mem
-    vdup.s8     q1, r3                     ; duplicate mblim
-    b           vp8_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp8_loop_filter_mbvs_neon|
-    END
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
new file mode 100644
index 0000000..d5178bb
--- /dev/null
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -0,0 +1,279 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
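+// As in loopfilter_neon.c, the GCC version check selects between the
+// vld4/vst2 lane intrinsics and manual-transpose fallbacks; the guard
+// presumably works around lane-intrinsic issues in older compilers.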
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+  vst2_lane_u8(dst, result, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 7);
+  dst += pitch;
+
+  vst2_lane_u8(dst, result2, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 7);
+}
+#else
+static INLINE void write_2x4(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result) {
+    /*
+     * uint8x8x2_t result
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    ---
+    * after vtrn_u8
+    00 10 02 12 | 04 14 06 16
+    01 11 03 13 | 05 15 07 17
+    */
+    const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
+                                       result.val[1]);
+    const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
+    const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
+}
+
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+  write_2x4(dst, pitch, result);
+  dst += pitch * 8;
+  write_2x4(dst, pitch, result2);
+}
+#endif
+
+
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+    x = vld4_lane_u8(src, x, 0);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 1);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 2);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 3);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 4);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 5);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 6);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 7);
+    return x;
+}
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+    const uint8x8_t a = vld1_u8(src);
+    const uint8x8_t b = vld1_u8(src + pitch * 1);
+    const uint8x8_t c = vld1_u8(src + pitch * 2);
+    const uint8x8_t d = vld1_u8(src + pitch * 3);
+    const uint8x8_t e = vld1_u8(src + pitch * 4);
+    const uint8x8_t f = vld1_u8(src + pitch * 5);
+    const uint8x8_t g = vld1_u8(src + pitch * 6);
+    const uint8x8_t h = vld1_u8(src + pitch * 7);
+    const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+                                          vreinterpret_u32_u8(e));
+    const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+                                          vreinterpret_u32_u8(f));
+    const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+                                          vreinterpret_u32_u8(g));
+    const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+                                          vreinterpret_u32_u8(h));
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+                                          vreinterpret_u16_u32(r26_u32.val[0]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+                                          vreinterpret_u16_u32(r37_u32.val[0]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    /*
+     * after vtrn_u32
+    00 01 02 03 | 40 41 42 43
+    10 11 12 13 | 50 51 52 53
+    20 21 22 23 | 60 61 62 63
+    30 31 32 33 | 70 71 72 73
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 40 41 60 61
+    02 03 22 23 | 42 43 62 63
+    10 11 30 31 | 50 51 70 71
+    12 13 32 33 | 52 53 72 73
+
+    00 01 20 21 | 40 41 60 61
+    10 11 30 31 | 50 51 70 71
+    02 03 22 23 | 42 43 62 63
+    12 13 32 33 | 52 53 72 73
+    ---
+    * after vtrn_u8
+    00 10 20 30 | 40 50 60 70
+    01 11 21 31 | 41 51 61 71
+    02 12 22 32 | 42 52 62 72
+    03 13 23 33 | 43 53 63 73
+    */
+    x.val[0] = r01_u8.val[0];
+    x.val[1] = r01_u8.val[1];
+    x.val[2] = r23_u8.val[0];
+    x.val[3] = r23_u8.val[1];
+
+    return x;
+}
+#endif
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
+        unsigned char *s,
+        int p,
+        const unsigned char *blimit) {
+    unsigned char *src1;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+    int16x8_t q2s16, q13s16, q11s16;
+    int8x8_t d28s8, d29s8;
+    int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+    uint8x8x4_t d0u8x4;  // d6, d7, d8, d9
+    uint8x8x4_t d1u8x4;  // d10, d11, d12, d13
+    uint8x8x2_t d2u8x2;  // d12, d13
+    uint8x8x2_t d3u8x2;  // d14, d15
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    src1 = s - 2;
+    d0u8x4 = read_4x8(src1, p, d0u8x4);
+    src1 += p * 8;
+    d1u8x4 = read_4x8(src1, p, d1u8x4);
+
+    q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]);  // d6 d10
+    q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]);  // d8 d12
+    q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]);  // d7 d11
+    q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]);  // d9 d13
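+    // column order after the transpose: q3u8 = p1, q5u8 = p0, q4u8 = q0, q6u8 = q1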
+
+    q15u8 = vabdq_u8(q5u8, q4u8);
+    q14u8 = vabdq_u8(q3u8, q6u8);
+
+    q15u8 = vqaddq_u8(q15u8, q15u8);
+    q14u8 = vshrq_n_u8(q14u8, 1);
+    q0u8 = vdupq_n_u8(0x80);
+    q11s16 = vdupq_n_s16(3);
+    q15u8 = vqaddq_u8(q15u8, q14u8);
+
+    q3u8 = veorq_u8(q3u8, q0u8);
+    q4u8 = veorq_u8(q4u8, q0u8);
+    q5u8 = veorq_u8(q5u8, q0u8);
+    q6u8 = veorq_u8(q6u8, q0u8);
+
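+    // filter only where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit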
+    q15u8 = vcgeq_u8(qblimit, q15u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
+                     vget_low_s8(vreinterpretq_s8_u8(q5u8)));
+    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
+                      vget_high_s8(vreinterpretq_s8_u8(q5u8)));
+
+    q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
+                      vreinterpretq_s8_u8(q6u8));
+
+    q2s16 = vmulq_s16(q2s16, q11s16);
+    q13s16 = vmulq_s16(q13s16, q11s16);
+
+    q11u8 = vdupq_n_u8(3);
+    q12u8 = vdupq_n_u8(4);
+
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
+    q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
+
+    d28s8 = vqmovn_s16(q2s16);
+    d29s8 = vqmovn_s16(q13s16);
+    q14s8 = vcombine_s8(d28s8, d29s8);
+
+    q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
+
+    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
+    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q14s8 = vshrq_n_s8(q3s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
+
+    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+    d2u8x2.val[0] = vget_low_u8(q6u8);   // d12
+    d2u8x2.val[1] = vget_low_u8(q7u8);   // d14
+    d3u8x2.val[0] = vget_high_u8(q6u8);  // d13
+    d3u8x2.val[1] = vget_high_u8(q7u8);  // d15
+
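+    // the simple filter modifies only p0 and q0; write those two columns back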
+    src1 = s - 1;
+    write_2x8(src1, p, d2u8x2, d3u8x2);
+}
+
+void vp8_loop_filter_bvs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
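+    // filter the three vertical block edges inside the macroblock
+    // (columns 4, 8 and 12)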
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
+
+void vp8_loop_filter_mbvs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
diff --git a/vp8/common/arm/neon/reconintra_neon.c b/vp8/common/arm/neon/reconintra_neon.c
new file mode 100644
index 0000000..af52cd5
--- /dev/null
+++ b/vp8/common/arm/neon/reconintra_neon.c
@@ -0,0 +1,210 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
+                                           unsigned char * yabove_row,
+                                           unsigned char * yleft,
+                                           int left_stride,
+                                           unsigned char * ypred_ptr,
+                                           int y_stride) {
+  const int mode = x->mode_info_context->mbmi.mode;
+  int i;
+
+  switch (mode) {
+    case DC_PRED:
+    {
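+      // DC prediction: fill the block with the rounded average of the
+      // available above row and left column, or 128 if neither exists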
+      int shift = x->up_available + x->left_available;
+      uint8x16_t v_expected_dc = vdupq_n_u8(128);
+
+      if (shift) {
+        unsigned int average = 0;
+        int expected_dc;
+        if (x->up_available) {
+          const uint8x16_t v_above = vld1q_u8(yabove_row);
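+          // a tree of pairwise-widening adds reduces the 16 above pixels
+          // to a single sum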
+          const uint16x8_t a = vpaddlq_u8(v_above);
+          const uint32x4_t b = vpaddlq_u16(a);
+          const uint64x2_t c = vpaddlq_u32(b);
+          const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                                        vreinterpret_u32_u64(vget_high_u64(c)));
+          average = vget_lane_u32(d, 0);
+        }
+        if (x->left_available) {
+          for (i = 0; i < 16; ++i) {
+              average += yleft[0];
+              yleft += left_stride;
+          }
+        }
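+        // each available side contributes 16 pixels, so round and divide
+        // by 1 << (shift + 3)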
+        shift += 3;
+        expected_dc = (average + (1 << (shift - 1))) >> shift;
+        v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
+      }
+      for (i = 0; i < 16; ++i) {
+        vst1q_u8(ypred_ptr, v_expected_dc);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case V_PRED:
+    {
+      const uint8x16_t v_above = vld1q_u8(yabove_row);
+      for (i = 0; i < 16; ++i) {
+        vst1q_u8(ypred_ptr, v_above);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case H_PRED:
+    {
+      for (i = 0; i < 16; ++i) {
+        const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
+        yleft += left_stride;
+        vst1q_u8(ypred_ptr, v_yleft);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case TM_PRED:
+    {
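+      // TrueMotion: pred(row, col) = clamp(above[col] + left[row] - top_left)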
+      const uint16x8_t v_ytop_left = vmovq_n_u16((uint16_t)yabove_row[-1]);
+      const uint8x16_t v_above = vld1q_u8(yabove_row);
+      for (i = 0; i < 16; ++i) {
+        const uint8x8_t v_yleft = vmov_n_u8((uint8_t)yleft[0]);
+        const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
+        const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
+        const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
+                                         vreinterpretq_s16_u16(v_ytop_left));
+        const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
+                                         vreinterpretq_s16_u16(v_ytop_left));
+        const uint8x8_t pred_lo = vqmovun_s16(b_lo);
+        const uint8x8_t pred_hi = vqmovun_s16(b_hi);
+
+        vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
+        ypred_ptr += y_stride;
+        yleft += left_stride;
+      }
+    }
+    break;
+  }
+}
+
+void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
+                                            unsigned char * uabove_row,
+                                            unsigned char * vabove_row,
+                                            unsigned char * uleft,
+                                            unsigned char * vleft,
+                                            int left_stride,
+                                            unsigned char * upred_ptr,
+                                            unsigned char * vpred_ptr,
+                                            int pred_stride) {
+  const int mode = x->mode_info_context->mbmi.uv_mode;
+  int i;
+
+  switch (mode) {
+    case DC_PRED:
+    {
+      int shift = x->up_available + x->left_available;
+      uint8x8_t v_expected_udc = vdup_n_u8(128);
+      uint8x8_t v_expected_vdc = vdup_n_u8(128);
+
+      if (shift) {
+        unsigned int average_u = 0;
+        unsigned int average_v = 0;
+        int expected_udc;
+        int expected_vdc;
+        if (x->up_available) {
+          const uint8x8_t v_uabove = vld1_u8(uabove_row);
+          const uint8x8_t v_vabove = vld1_u8(vabove_row);
+          const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
+          const uint32x4_t b = vpaddlq_u16(a);
+          const uint64x2_t c = vpaddlq_u32(b);
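+          // the low 64 bits of c hold the u sum and the high 64 bits the
+          // v sum; lanes 0 and 2 of the u32 view pick them out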
+          average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
+          average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
+        }
+        if (x->left_available) {
+          for (i = 0; i < 8; ++i) {
+              average_u += uleft[0];
+              uleft += left_stride;
+              average_v += vleft[0];
+              vleft += left_stride;
+          }
+        }
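+        // each available side contributes 8 pixels per plane, so round and
+        // divide by 1 << (shift + 2)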
+        shift += 2;
+        expected_udc = (average_u + (1 << (shift - 1))) >> shift;
+        expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
+        v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
+        v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
+      }
+      for (i = 0; i < 8; ++i) {
+        vst1_u8(upred_ptr, v_expected_udc);
+        upred_ptr += pred_stride;
+        vst1_u8(vpred_ptr, v_expected_vdc);
+        vpred_ptr += pred_stride;
+      }
+    }
+    break;
+    case V_PRED:
+    {
+      const uint8x8_t v_uabove = vld1_u8(uabove_row);
+      const uint8x8_t v_vabove = vld1_u8(vabove_row);
+      for (i = 0; i < 8; ++i) {
+        vst1_u8(upred_ptr, v_uabove);
+        upred_ptr += pred_stride;
+        vst1_u8(vpred_ptr, v_vabove);
+        vpred_ptr += pred_stride;
+      }
+    }
+    break;
+    case H_PRED:
+    {
+      for (i = 0; i < 8; ++i) {
+        const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
+        const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
+        uleft += left_stride;
+        vleft += left_stride;
+        vst1_u8(upred_ptr, v_uleft);
+        upred_ptr += pred_stride;
+        vst1_u8(vpred_ptr, v_vleft);
+        vpred_ptr += pred_stride;
+      }
+    }
+    break;
+    case TM_PRED:
+    {
+      const uint16x8_t v_utop_left = vmovq_n_u16((uint16_t)uabove_row[-1]);
+      const uint16x8_t v_vtop_left = vmovq_n_u16((uint16_t)vabove_row[-1]);
+      const uint8x8_t v_uabove = vld1_u8(uabove_row);
+      const uint8x8_t v_vabove = vld1_u8(vabove_row);
+      for (i = 0; i < 8; ++i) {
+        const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
+        const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
+        const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
+        const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
+        const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
+                                        vreinterpretq_s16_u16(v_utop_left));
+        const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
+                                        vreinterpretq_s16_u16(v_vtop_left));
+        const uint8x8_t pred_u = vqmovun_s16(b_u);
+        const uint8x8_t pred_v = vqmovun_s16(b_v);
+
+        vst1_u8(upred_ptr, pred_u);
+        vst1_u8(vpred_ptr, pred_v);
+        upred_ptr += pred_stride;
+        vpred_ptr += pred_stride;
+        uleft += left_stride;
+        vleft += left_stride;
+      }
+    }
+    break;
+  }
+}
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
deleted file mode 100644
index adc5b7e..0000000
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ /dev/null
@@ -1,425 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-;-----------------
-
-    EXPORT  |vp8_sub_pixel_variance16x16_neon_func|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
-
-bilinear_taps_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-|vp8_sub_pixel_variance16x16_neon_func| PROC
-    push            {r4-r6, lr}
-    vpush           {d8-d15}
-
-    adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #80]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #84]           ;load dst_pixels_per_line from stack
-    ldr             r6, [sp, #88]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_bfilter16x16_only
-
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {d31}, [r2]             ;load first_pass filter
-
-    beq             firstpass_bfilter16x16_only
-
-    sub             sp, sp, #272            ;reserve space on stack for temporary storage
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    mov             lr, sp
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    mov             r2, #3                  ;loop counter
-    vld1.u8         {d8, d9, d10}, [r0], r1
-
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    vdup.8          d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16_loop_neon
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vqrshrn.u16    d21, q14, #7
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vst1.u8         {d18, d19, d20, d21}, [lr]!
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    bne             vp8e_filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for the remaining 5 lines
-    vld1.u8         {d14, d15, d16}, [r0], r1
-
-    vmull.u8        q9, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q10, d3, d0
-    vmull.u8        q11, d5, d0
-    vmull.u8        q12, d6, d0
-    vmull.u8        q13, d8, d0
-    vmull.u8        q14, d9, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-
-    vmlal.u8        q9, d2, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q11, d5, d1
-    vmlal.u8        q13, d8, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-
-    vmlal.u8        q10, d3, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q12, d6, d1
-    vmlal.u8        q14, d9, d1
-
-    vmull.u8        q1, d11, d0
-    vmull.u8        q2, d12, d0
-    vmull.u8        q3, d14, d0
-    vmull.u8        q4, d15, d0
-
-    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
-    vext.8          d14, d14, d15, #1
-
-    vmlal.u8        q1, d11, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q3, d14, d1
-
-    vext.8          d12, d12, d13, #1
-    vext.8          d15, d15, d16, #1
-
-    vmlal.u8        q2, d12, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q4, d15, d1
-
-    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d11, q10, #7
-    vqrshrn.u16    d12, q11, #7
-    vqrshrn.u16    d13, q12, #7
-    vqrshrn.u16    d14, q13, #7
-    vqrshrn.u16    d15, q14, #7
-    vqrshrn.u16    d16, q1, #7
-    vqrshrn.u16    d17, q2, #7
-    vqrshrn.u16    d18, q3, #7
-    vqrshrn.u16    d19, q4, #7
-
-    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
-    vst1.u8         {d14, d15, d16, d17}, [lr]!
-    vst1.u8         {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
-    add             r3, r12, r3, lsl #3
-    sub             lr, lr, #272
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-
-    sub             sp, sp, #256
-    mov             r3, sp
-
-    vld1.u8         {d22, d23}, [lr]!       ;load src data
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-    mov             r12, #4                 ;loop counter
-
-vp8e_filt_blk2d_sp16x16_loop_neon
-    vld1.u8         {d24, d25}, [lr]!
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vld1.u8         {d26, d27}, [lr]!
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [lr]!
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [lr]!
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    subs            r12, r12, #1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r3]!         ;store result
-    vst1.u8         {d4, d5}, [r3]!
-    vst1.u8         {d6, d7}, [r3]!
-    vmov            q11, q15
-    vst1.u8         {d8, d9}, [r3]!
-
-    bne             vp8e_filt_blk2d_sp16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;--------------------
-firstpass_bfilter16x16_only
-    mov             r2, #4                      ;loop counter
-    sub             sp, sp, #528            ;reserve space on stack for temporary storage
-    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
-    vdup.8          d1, d31[4]
-    mov             r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16_loop_neon
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vld1.u8         {d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-    vst1.u8         {d14, d15}, [r3]!       ;store result
-    vqrshrn.u16    d21, q14, #7
-
-    vst1.u8         {d16, d17}, [r3]!
-    vst1.u8         {d18, d19}, [r3]!
-    vst1.u8         {d20, d21}, [r3]!
-
-    bne             vp8e_filt_blk2d_fpo16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
-    sub             sp, sp, #528            ;reserve space on stack for temporary storage
-    add             r3, r12, r3, lsl #3
-    mov             r12, #4                     ;loop counter
-    vld1.u32        {d31}, [r3]                 ;load second_pass filter
-    vld1.u8         {d22, d23}, [r0], r1        ;load src data
-    mov             r3, sp
-
-    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-vp8e_filt_blk2d_spo16x16_loop_neon
-    vld1.u8         {d24, d25}, [r0], r1
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vld1.u8         {d26, d27}, [r0], r1
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [r0], r1
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [r0], r1
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r3]!         ;store result
-    subs            r12, r12, #1
-    vst1.u8         {d4, d5}, [r3]!
-    vmov            q11, q15
-    vst1.u8         {d6, d7}, [r3]!
-    vst1.u8         {d8, d9}, [r3]!
-
-    bne             vp8e_filt_blk2d_spo16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    sub             r3, r3, #256
-    mov             r12, #8
-
-sub_pixel_variance16x16_neon_loop
-    vld1.8          {q0}, [r3]!                 ;Load up source and reference
-    vld1.8          {q2}, [r4], r5
-    vld1.8          {q1}, [r3]!
-    vld1.8          {q3}, [r4], r5
-
-    vsubl.u8        q11, d0, d4                 ;diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             sub_pixel_variance16x16_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r6]               ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    add             sp, sp, #528
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {r4-r6,pc}
-
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
deleted file mode 100644
index b0829af..0000000
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ /dev/null
@@ -1,583 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_variance_halfpixvar16x16_h_neon|
-    EXPORT  |vp8_variance_halfpixvar16x16_v_neon|
-    EXPORT  |vp8_variance_halfpixvar16x16_hv_neon|
-    EXPORT  |vp8_sub_pixel_variance16x16s_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_h_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_h_neon| PROC
-    push            {lr}
-    vpush           {d8-d15}
-
-    mov             r12, #4                  ;loop counter
-    ldr             lr, [sp, #68]            ;load *sse from stack
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8_filt_fpo16x16s_4_0_loop_neon
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    vld1.8          {q11}, [r2], r3
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.8          {q12}, [r2], r3
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.8          {q13}, [r2], r3
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    vext.8          q3, q2, q3, #1
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-
-    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-    vld1.8          {q14}, [r2], r3
-    vrhadd.u8       q1, q2, q3
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-
-    vsubl.u8        q4, d0, d22                 ;diff
-    vsubl.u8        q5, d1, d23
-    vsubl.u8        q6, d2, d24
-    vsubl.u8        q7, d3, d25
-    vsubl.u8        q0, d4, d26
-    vsubl.u8        q1, d5, d27
-    vsubl.u8        q2, d6, d28
-    vsubl.u8        q3, d7, d29
-
-    vpadal.s16      q8, q4                     ;sum
-    vmlal.s16       q9, d8, d8                ;sse
-    vmlal.s16       q10, d9, d9
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q5
-    vmlal.s16       q9, d10, d10
-    vmlal.s16       q10, d11, d11
-    vpadal.s16      q8, q6
-    vmlal.s16       q9, d12, d12
-    vmlal.s16       q10, d13, d13
-    vpadal.s16      q8, q7
-    vmlal.s16       q9, d14, d14
-    vmlal.s16       q10, d15, d15
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             vp8_filt_fpo16x16s_4_0_loop_neon
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {pc}
-    ENDP
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_v_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_v_neon| PROC
-    push            {lr}
-    vpush           {d8-d15}
-
-    mov             r12, #4                     ;loop counter
-
-    vld1.u8         {q0}, [r0], r1              ;load src data
-    ldr             lr, [sp, #68]               ;load *sse from stack
-
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-vp8_filt_spo16x16s_0_4_loop_neon
-    vld1.u8         {q2}, [r0], r1
-    vld1.8          {q1}, [r2], r3
-    vld1.u8         {q4}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-    vld1.u8         {q6}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-    vld1.u8         {q15}, [r0], r1
-
-    vrhadd.u8       q0, q0, q2
-    vld1.8          {q7}, [r2], r3
-    vrhadd.u8       q2, q2, q4
-    vrhadd.u8       q4, q4, q6
-    vrhadd.u8       q6, q6, q15
-
-    vsubl.u8        q11, d0, d2                 ;diff
-    vsubl.u8        q12, d1, d3
-    vsubl.u8        q13, d4, d6
-    vsubl.u8        q14, d5, d7
-    vsubl.u8        q0, d8, d10
-    vsubl.u8        q1, d9, d11
-    vsubl.u8        q2, d12, d14
-    vsubl.u8        q3, d13, d15
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                 ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-
-    vmov            q0, q15
-
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             vp8_filt_spo16x16s_0_4_loop_neon
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {pc}
-    ENDP
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_hv_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_hv_neon| PROC
-    push            {lr}
-    vpush           {d8-d15}
-
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-
-    ldr             lr, [sp, #68]           ;load *sse from stack
-    vmov.i8         q13, #0                      ;q13 - sum
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-
-    vmov.i8         q14, #0                      ;q14, q15 - sse
-    vmov.i8         q15, #0
-
-    mov             r12, #4                  ;loop counter
-    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8_filt16x16s_4_4_loop_neon
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-    vext.8          q9, q8, q9, #1
-
-    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-
-    vld1.8          {q5}, [r2], r3
-    vrhadd.u8       q0, q0, q1
-    vld1.8          {q6}, [r2], r3
-    vrhadd.u8       q1, q1, q2
-    vld1.8          {q7}, [r2], r3
-    vrhadd.u8       q2, q2, q3
-    vld1.8          {q8}, [r2], r3
-    vrhadd.u8       q3, q3, q4
-
-    vsubl.u8        q9, d0, d10                 ;diff
-    vsubl.u8        q10, d1, d11
-    vsubl.u8        q11, d2, d12
-    vsubl.u8        q12, d3, d13
-
-    vsubl.u8        q0, d4, d14                 ;diff
-    vsubl.u8        q1, d5, d15
-    vsubl.u8        q5, d6, d16
-    vsubl.u8        q6, d7, d17
-
-    vpadal.s16      q13, q9                     ;sum
-    vmlal.s16       q14, d18, d18                ;sse
-    vmlal.s16       q15, d19, d19
-
-    vpadal.s16      q13, q10                     ;sum
-    vmlal.s16       q14, d20, d20                ;sse
-    vmlal.s16       q15, d21, d21
-
-    vpadal.s16      q13, q11                     ;sum
-    vmlal.s16       q14, d22, d22                ;sse
-    vmlal.s16       q15, d23, d23
-
-    vpadal.s16      q13, q12                     ;sum
-    vmlal.s16       q14, d24, d24                ;sse
-    vmlal.s16       q15, d25, d25
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q13, q0                     ;sum
-    vmlal.s16       q14, d0, d0                ;sse
-    vmlal.s16       q15, d1, d1
-
-    vpadal.s16      q13, q1                     ;sum
-    vmlal.s16       q14, d2, d2                ;sse
-    vmlal.s16       q15, d3, d3
-
-    vpadal.s16      q13, q5                     ;sum
-    vmlal.s16       q14, d10, d10                ;sse
-    vmlal.s16       q15, d11, d11
-
-    vmov            q0, q4
-
-    vpadal.s16      q13, q6                     ;sum
-    vmlal.s16       q14, d12, d12                ;sse
-    vmlal.s16       q15, d13, d13
-
-    bne             vp8_filt16x16s_4_4_loop_neon
-
-    vadd.u32        q15, q14, q15                ;accumulate sse
-    vpaddl.s32      q0, q13                      ;accumulate sum
-
-    vpaddl.u32      q1, q15
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {pc}
-    ENDP
-
-;==============================
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pixels_per_line,
-; stack unsigned int *sse
-;note: in vp8_find_best_half_pixel_step() (called when 8 < Speed < 15) and in the first call of
-;vp8_find_best_sub_pixel_step() (called when Speed <= 8), xoffset/yoffset can only be 4 or 0,
-;meaning the filter is either bypassed or its coefficients are {64, 64}. This simplified routine only handles those cases.
-;note: it can happen that both xoffset and yoffset are zero; that case is handled in C code.
-
-|vp8_sub_pixel_variance16x16s_neon| PROC
-    push            {r4, lr}
-    vpush           {d8-d15}
-
-    ldr             r4, [sp, #72]           ;load *dst_ptr from stack
-    ldr             r12, [sp, #76]          ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #80]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_bfilter16x16s_only
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    beq             firstpass_bfilter16x16s_only
-
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    mov             r3, sp
-    mov             r2, #4                  ;loop counter
-    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16s_loop_neon
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-    vext.8          q9, q8, q9, #1
-
-    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-
-    vrhadd.u8       q0, q0, q1
-    vrhadd.u8       q1, q1, q2
-    vrhadd.u8       q2, q2, q3
-    vrhadd.u8       q3, q3, q4
-
-    subs            r2, r2, #1
-    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
-    vmov            q0, q4
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-
-    bne             vp8e_filt_blk2d_fp16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;--------------------
-firstpass_bfilter16x16s_only
-    mov             r2, #2                  ;loop counter
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-    mov             r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16s_loop_neon
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-    vext.8          q3, q2, q3, #1
-    vld1.u8         {d20, d21, d22, d23}, [r0], r1
-    vext.8          q5, q4, q5, #1
-    vld1.u8         {d24, d25, d26, d27}, [r0], r1
-    vext.8          q7, q6, q7, #1
-    vld1.u8         {d28, d29, d30, d31}, [r0], r1
-    vext.8          q9, q8, q9, #1
-    vext.8          q11, q10, q11, #1
-    vext.8          q13, q12, q13, #1
-    vext.8          q15, q14, q15, #1
-
-    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-    vrhadd.u8       q1, q2, q3
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-    vrhadd.u8       q5, q10, q11
-    vrhadd.u8       q6, q12, q13
-    vrhadd.u8       q7, q14, q15
-
-    subs            r2, r2, #1
-
-    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-    vst1.u8         {d8, d9, d10, d11}, [r3]!
-    vst1.u8         {d12, d13, d14, d15}, [r3]!
-
-    bne             vp8e_filt_blk2d_fpo16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;---------------------
-secondpass_bfilter16x16s_only
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-
-    mov             r2, #2                  ;loop counter
-    vld1.u8         {d0, d1}, [r0], r1      ;load src data
-    mov             r3, sp
-
-vp8e_filt_blk2d_spo16x16s_loop_neon
-    vld1.u8         {d2, d3}, [r0], r1
-    vld1.u8         {d4, d5}, [r0], r1
-    vld1.u8         {d6, d7}, [r0], r1
-    vld1.u8         {d8, d9}, [r0], r1
-
-    vrhadd.u8       q0, q0, q1
-    vld1.u8         {d10, d11}, [r0], r1
-    vrhadd.u8       q1, q1, q2
-    vld1.u8         {d12, d13}, [r0], r1
-    vrhadd.u8       q2, q2, q3
-    vld1.u8         {d14, d15}, [r0], r1
-    vrhadd.u8       q3, q3, q4
-    vld1.u8         {d16, d17}, [r0], r1
-    vrhadd.u8       q4, q4, q5
-    vrhadd.u8       q5, q5, q6
-    vrhadd.u8       q6, q6, q7
-    vrhadd.u8       q7, q7, q8
-
-    subs            r2, r2, #1
-
-    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
-    vmov            q0, q8
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-    vst1.u8         {d8, d9, d10, d11}, [r3]!           ;store result
-    vst1.u8         {d12, d13, d14, d15}, [r3]!
-
-    bne             vp8e_filt_blk2d_spo16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16s_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    sub             r3, r3, #256
-    mov             r2, #4
-
-sub_pixel_variance16x16s_neon_loop
-    vld1.8          {q0}, [r3]!                 ;Load up source and reference
-    vld1.8          {q1}, [r4], r12
-    vld1.8          {q2}, [r3]!
-    vld1.8          {q3}, [r4], r12
-    vld1.8          {q4}, [r3]!
-    vld1.8          {q5}, [r4], r12
-    vld1.8          {q6}, [r3]!
-    vld1.8          {q7}, [r4], r12
-
-    vsubl.u8        q11, d0, d2                 ;diff
-    vsubl.u8        q12, d1, d3
-    vsubl.u8        q13, d4, d6
-    vsubl.u8        q14, d5, d7
-    vsubl.u8        q0, d8, d10
-    vsubl.u8        q1, d9, d11
-    vsubl.u8        q2, d12, d14
-    vsubl.u8        q3, d13, d15
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r2, r2, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             sub_pixel_variance16x16s_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.u32        d10, d10, #8
-    vsub.u32        d0, d1, d10
-
-    add             sp, sp, #256
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {r4, pc}
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
deleted file mode 100644
index 9d9f9e0..0000000
--- a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sub_pixel_variance8x8_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
-
-|vp8_sub_pixel_variance8x8_neon| PROC
-    push            {r4-r5, lr}
-    vpush           {d8-d15}
-
-    adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #76]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #80]           ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #84]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vld1.u32        {d31}, [r2]             ;load first_pass filter
-    vld1.u8         {q2}, [r0], r1
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {q3}, [r0], r1
-    vdup.8          d1, d31[4]
-    vld1.u8         {q4}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
-    vld1.u8         {q2}, [r0], r1
-    vqrshrn.u16    d23, q7, #7
-    vld1.u8         {q3}, [r0], r1
-    vqrshrn.u16    d24, q8, #7
-    vld1.u8         {q4}, [r0], r1
-    vqrshrn.u16    d25, q9, #7
-
-    ;first_pass filtering on the remaining 5 lines of data
-    vld1.u8         {q5}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-    vext.8          d11, d10, d11, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-    vmlal.u8        q10, d11, d1
-
-    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d27, q7, #7
-    vqrshrn.u16    d28, q8, #7
-    vqrshrn.u16    d29, q9, #7
-    vqrshrn.u16    d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    ;skip_secondpass_filter
-    beq             sub_pixel_variance8x8_neon
-
-    add             r3, r12, r3, lsl #3
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vmull.u8        q2, d23, d0
-    vmull.u8        q3, d24, d0
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d24, d1
-    vmlal.u8        q3, d25, d1
-    vmlal.u8        q4, d26, d1
-    vmlal.u8        q5, d27, d1
-    vmlal.u8        q6, d28, d1
-    vmlal.u8        q7, d29, d1
-    vmlal.u8        q8, d30, d1
-
-    vqrshrn.u16    d22, q1, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d23, q2, #7
-    vqrshrn.u16    d24, q3, #7
-    vqrshrn.u16    d25, q4, #7
-    vqrshrn.u16    d26, q5, #7
-    vqrshrn.u16    d27, q6, #7
-    vqrshrn.u16    d28, q7, #7
-    vqrshrn.u16    d29, q8, #7
-
-    b               sub_pixel_variance8x8_neon
-
-;--------------------
-skip_firstpass_filter
-    vld1.u8         {d22}, [r0], r1         ;load src data
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-    vld1.u8         {d27}, [r0], r1
-    vld1.u8         {d28}, [r0], r1
-    vld1.u8         {d29}, [r0], r1
-    vld1.u8         {d30}, [r0], r1
-
-    b               secondpass_filter
-
-;----------------------
-;vp8_variance8x8_neon
-sub_pixel_variance8x8_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #2
-
-sub_pixel_variance8x8_neon_loop
-    vld1.8          {d0}, [r4], r5              ;load dst data
-    subs            r12, r12, #1
-    vld1.8          {d1}, [r4], r5
-    vld1.8          {d2}, [r4], r5
-    vsubl.u8        q4, d22, d0                 ;calculate diff
-    vld1.8          {d3}, [r4], r5
-
-    vsubl.u8        q5, d23, d1
-    vsubl.u8        q6, d24, d2
-
-    vpadal.s16      q8, q4                      ;sum
-    vmlal.s16       q9, d8, d8                  ;sse
-    vmlal.s16       q10, d9, d9
-
-    vsubl.u8        q7, d25, d3
-
-    vpadal.s16      q8, q5
-    vmlal.s16       q9, d10, d10
-    vmlal.s16       q10, d11, d11
-
-    vmov            q11, q13
-
-    vpadal.s16      q8, q6
-    vmlal.s16       q9, d12, d12
-    vmlal.s16       q10, d13, d13
-
-    vmov            q12, q14
-
-    vpadal.s16      q8, q7
-    vmlal.s16       q9, d14, d14
-    vmlal.s16       q10, d15, d15
-
-    bne             sub_pixel_variance8x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.u32        d10, d10, #6
-    vsub.u32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-
-    vpop            {d8-d15}
-    pop             {r4-r5, pc}
-
-    ENDP
-
-;-----------------
-
-bilinear_taps_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
new file mode 100644
index 0000000..6405bf2
--- /dev/null
+++ b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
@@ -0,0 +1,1028 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef _MSC_VER
+#define __builtin_prefetch(x)
+#endif
+
+static const uint16_t bilinear_taps_coeff[8][2] = {
+    {128,   0},
+    {112,  16},
+    { 96,  32},
+    { 80,  48},
+    { 64,  64},
+    { 48,  80},
+    { 32,  96},
+    { 16, 112}
+};
+
+unsigned int vp8_sub_pixel_variance16x16_neon_func(
+        const unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        const unsigned char *dst_ptr,
+        int dst_pixels_per_line,
+        unsigned int *sse) {
+    int i;
+    DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
+    unsigned char *tmpp;
+    unsigned char *tmpp2;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+    uint8x8_t d19u8, d20u8, d21u8;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64, d2s64, d3s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
+    uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    tmpp2 = tmp + 272;
+    tmpp = tmp;
+    if (xoffset == 0) {  // secondpass_bfilter16x16_only
+        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
+        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
+
+        q11u8 = vld1q_u8(src_ptr);
+        src_ptr += src_pixels_per_line;
+        for (i = 4; i > 0; i--) {
+            q12u8 = vld1q_u8(src_ptr);
+            src_ptr += src_pixels_per_line;
+            q13u8 = vld1q_u8(src_ptr);
+            src_ptr += src_pixels_per_line;
+            q14u8 = vld1q_u8(src_ptr);
+            src_ptr += src_pixels_per_line;
+            q15u8 = vld1q_u8(src_ptr);
+            src_ptr += src_pixels_per_line;
+
+            __builtin_prefetch(src_ptr);
+            __builtin_prefetch(src_ptr + src_pixels_per_line);
+            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
+
+            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+            d2u8 = vqrshrn_n_u16(q1u16, 7);
+            d3u8 = vqrshrn_n_u16(q2u16, 7);
+            d4u8 = vqrshrn_n_u16(q3u16, 7);
+            d5u8 = vqrshrn_n_u16(q4u16, 7);
+            d6u8 = vqrshrn_n_u16(q5u16, 7);
+            d7u8 = vqrshrn_n_u16(q6u16, 7);
+            d8u8 = vqrshrn_n_u16(q7u16, 7);
+            d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+            q1u8 = vcombine_u8(d2u8, d3u8);
+            q2u8 = vcombine_u8(d4u8, d5u8);
+            q3u8 = vcombine_u8(d6u8, d7u8);
+            q4u8 = vcombine_u8(d8u8, d9u8);
+
+            q11u8 = q15u8;
+
+            vst1q_u8((uint8_t *)tmpp2, q1u8);
+            tmpp2 += 16;
+            vst1q_u8((uint8_t *)tmpp2, q2u8);
+            tmpp2 += 16;
+            vst1q_u8((uint8_t *)tmpp2, q3u8);
+            tmpp2 += 16;
+            vst1q_u8((uint8_t *)tmpp2, q4u8);
+            tmpp2 += 16;
+        }
+    } else if (yoffset == 0) {  // firstpass_bfilter16x16_only
+        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
+
+        for (i = 4; i > 0; i--) {
+            d2u8 = vld1_u8(src_ptr);
+            d3u8 = vld1_u8(src_ptr + 8);
+            d4u8 = vld1_u8(src_ptr + 16);
+            src_ptr += src_pixels_per_line;
+            d5u8 = vld1_u8(src_ptr);
+            d6u8 = vld1_u8(src_ptr + 8);
+            d7u8 = vld1_u8(src_ptr + 16);
+            src_ptr += src_pixels_per_line;
+            d8u8 = vld1_u8(src_ptr);
+            d9u8 = vld1_u8(src_ptr + 8);
+            d10u8 = vld1_u8(src_ptr + 16);
+            src_ptr += src_pixels_per_line;
+            d11u8 = vld1_u8(src_ptr);
+            d12u8 = vld1_u8(src_ptr + 8);
+            d13u8 = vld1_u8(src_ptr + 16);
+            src_ptr += src_pixels_per_line;
+
+            __builtin_prefetch(src_ptr);
+            __builtin_prefetch(src_ptr + src_pixels_per_line);
+            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
+
+            q7u16  = vmull_u8(d2u8, d0u8);
+            q8u16  = vmull_u8(d3u8, d0u8);
+            q9u16  = vmull_u8(d5u8, d0u8);
+            q10u16 = vmull_u8(d6u8, d0u8);
+            q11u16 = vmull_u8(d8u8, d0u8);
+            q12u16 = vmull_u8(d9u8, d0u8);
+            q13u16 = vmull_u8(d11u8, d0u8);
+            q14u16 = vmull_u8(d12u8, d0u8);
+
+            d2u8  = vext_u8(d2u8, d3u8, 1);
+            d5u8  = vext_u8(d5u8, d6u8, 1);
+            d8u8  = vext_u8(d8u8, d9u8, 1);
+            d11u8 = vext_u8(d11u8, d12u8, 1);
+
+            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
+            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
+            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+            d3u8  = vext_u8(d3u8, d4u8, 1);
+            d6u8  = vext_u8(d6u8, d7u8, 1);
+            d9u8  = vext_u8(d9u8, d10u8, 1);
+            d12u8 = vext_u8(d12u8, d13u8, 1);
+
+            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
+            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+            d14u8 = vqrshrn_n_u16(q7u16, 7);
+            d15u8 = vqrshrn_n_u16(q8u16, 7);
+            d16u8 = vqrshrn_n_u16(q9u16, 7);
+            d17u8 = vqrshrn_n_u16(q10u16, 7);
+            d18u8 = vqrshrn_n_u16(q11u16, 7);
+            d19u8 = vqrshrn_n_u16(q12u16, 7);
+            d20u8 = vqrshrn_n_u16(q13u16, 7);
+            d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+            q7u8  = vcombine_u8(d14u8, d15u8);
+            q8u8  = vcombine_u8(d16u8, d17u8);
+            q9u8  = vcombine_u8(d18u8, d19u8);
+            q10u8 = vcombine_u8(d20u8, d21u8);
+
+            vst1q_u8((uint8_t *)tmpp2, q7u8);
+            tmpp2 += 16;
+            vst1q_u8((uint8_t *)tmpp2, q8u8);
+            tmpp2 += 16;
+            vst1q_u8((uint8_t *)tmpp2, q9u8);
+            tmpp2 += 16;
+            vst1q_u8((uint8_t *)tmpp2, q10u8);
+            tmpp2 += 16;
+        }
+    } else {
+        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
+
+        d2u8 = vld1_u8(src_ptr);
+        d3u8 = vld1_u8(src_ptr + 8);
+        d4u8 = vld1_u8(src_ptr + 16);
+        src_ptr += src_pixels_per_line;
+        d5u8 = vld1_u8(src_ptr);
+        d6u8 = vld1_u8(src_ptr + 8);
+        d7u8 = vld1_u8(src_ptr + 16);
+        src_ptr += src_pixels_per_line;
+        d8u8 = vld1_u8(src_ptr);
+        d9u8 = vld1_u8(src_ptr + 8);
+        d10u8 = vld1_u8(src_ptr + 16);
+        src_ptr += src_pixels_per_line;
+        d11u8 = vld1_u8(src_ptr);
+        d12u8 = vld1_u8(src_ptr + 8);
+        d13u8 = vld1_u8(src_ptr + 16);
+        src_ptr += src_pixels_per_line;
+
+        // First Pass: output_height lines x output_width columns (17x16)
+        for (i = 3; i > 0; i--) {
+            q7u16  = vmull_u8(d2u8, d0u8);
+            q8u16  = vmull_u8(d3u8, d0u8);
+            q9u16  = vmull_u8(d5u8, d0u8);
+            q10u16 = vmull_u8(d6u8, d0u8);
+            q11u16 = vmull_u8(d8u8, d0u8);
+            q12u16 = vmull_u8(d9u8, d0u8);
+            q13u16 = vmull_u8(d11u8, d0u8);
+            q14u16 = vmull_u8(d12u8, d0u8);
+
+            d2u8  = vext_u8(d2u8, d3u8, 1);
+            d5u8  = vext_u8(d5u8, d6u8, 1);
+            d8u8  = vext_u8(d8u8, d9u8, 1);
+            d11u8 = vext_u8(d11u8, d12u8, 1);
+
+            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
+            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
+            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+            d3u8  = vext_u8(d3u8, d4u8, 1);
+            d6u8  = vext_u8(d6u8, d7u8, 1);
+            d9u8  = vext_u8(d9u8, d10u8, 1);
+            d12u8 = vext_u8(d12u8, d13u8, 1);
+
+            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
+            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+            d14u8 = vqrshrn_n_u16(q7u16, 7);
+            d15u8 = vqrshrn_n_u16(q8u16, 7);
+            d16u8 = vqrshrn_n_u16(q9u16, 7);
+            d17u8 = vqrshrn_n_u16(q10u16, 7);
+            d18u8 = vqrshrn_n_u16(q11u16, 7);
+            d19u8 = vqrshrn_n_u16(q12u16, 7);
+            d20u8 = vqrshrn_n_u16(q13u16, 7);
+            d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+            d2u8 = vld1_u8(src_ptr);
+            d3u8 = vld1_u8(src_ptr + 8);
+            d4u8 = vld1_u8(src_ptr + 16);
+            src_ptr += src_pixels_per_line;
+            d5u8 = vld1_u8(src_ptr);
+            d6u8 = vld1_u8(src_ptr + 8);
+            d7u8 = vld1_u8(src_ptr + 16);
+            src_ptr += src_pixels_per_line;
+            d8u8 = vld1_u8(src_ptr);
+            d9u8 = vld1_u8(src_ptr + 8);
+            d10u8 = vld1_u8(src_ptr + 16);
+            src_ptr += src_pixels_per_line;
+            d11u8 = vld1_u8(src_ptr);
+            d12u8 = vld1_u8(src_ptr + 8);
+            d13u8 = vld1_u8(src_ptr + 16);
+            src_ptr += src_pixels_per_line;
+
+            q7u8 = vcombine_u8(d14u8, d15u8);
+            q8u8 = vcombine_u8(d16u8, d17u8);
+            q9u8 = vcombine_u8(d18u8, d19u8);
+            q10u8 = vcombine_u8(d20u8, d21u8);
+
+            vst1q_u8((uint8_t *)tmpp, q7u8);
+            tmpp += 16;
+            vst1q_u8((uint8_t *)tmpp, q8u8);
+            tmpp += 16;
+            vst1q_u8((uint8_t *)tmpp, q9u8);
+            tmpp += 16;
+            vst1q_u8((uint8_t *)tmpp, q10u8);
+            tmpp += 16;
+        }
+
+        // First-pass filtering on the remaining 5 lines
+        d14u8 = vld1_u8(src_ptr);
+        d15u8 = vld1_u8(src_ptr + 8);
+        d16u8 = vld1_u8(src_ptr + 16);
+        src_ptr += src_pixels_per_line;
+
+        q9u16  = vmull_u8(d2u8, d0u8);
+        q10u16 = vmull_u8(d3u8, d0u8);
+        q11u16 = vmull_u8(d5u8, d0u8);
+        q12u16 = vmull_u8(d6u8, d0u8);
+        q13u16 = vmull_u8(d8u8, d0u8);
+        q14u16 = vmull_u8(d9u8, d0u8);
+
+        d2u8  = vext_u8(d2u8, d3u8, 1);
+        d5u8  = vext_u8(d5u8, d6u8, 1);
+        d8u8  = vext_u8(d8u8, d9u8, 1);
+
+        q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
+        q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
+        q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
+
+        d3u8  = vext_u8(d3u8, d4u8, 1);
+        d6u8  = vext_u8(d6u8, d7u8, 1);
+        d9u8  = vext_u8(d9u8, d10u8, 1);
+
+        q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
+        q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
+        q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
+
+        q1u16 = vmull_u8(d11u8, d0u8);
+        q2u16 = vmull_u8(d12u8, d0u8);
+        q3u16 = vmull_u8(d14u8, d0u8);
+        q4u16 = vmull_u8(d15u8, d0u8);
+
+        d11u8 = vext_u8(d11u8, d12u8, 1);
+        d14u8 = vext_u8(d14u8, d15u8, 1);
+
+        q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
+        q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
+
+        d12u8 = vext_u8(d12u8, d13u8, 1);
+        d15u8 = vext_u8(d15u8, d16u8, 1);
+
+        q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
+        q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
+
+        d10u8 = vqrshrn_n_u16(q9u16, 7);
+        d11u8 = vqrshrn_n_u16(q10u16, 7);
+        d12u8 = vqrshrn_n_u16(q11u16, 7);
+        d13u8 = vqrshrn_n_u16(q12u16, 7);
+        d14u8 = vqrshrn_n_u16(q13u16, 7);
+        d15u8 = vqrshrn_n_u16(q14u16, 7);
+        d16u8 = vqrshrn_n_u16(q1u16, 7);
+        d17u8 = vqrshrn_n_u16(q2u16, 7);
+        d18u8 = vqrshrn_n_u16(q3u16, 7);
+        d19u8 = vqrshrn_n_u16(q4u16, 7);
+
+        q5u8 = vcombine_u8(d10u8, d11u8);
+        q6u8 = vcombine_u8(d12u8, d13u8);
+        q7u8 = vcombine_u8(d14u8, d15u8);
+        q8u8 = vcombine_u8(d16u8, d17u8);
+        q9u8 = vcombine_u8(d18u8, d19u8);
+
+        vst1q_u8((uint8_t *)tmpp, q5u8);
+        tmpp += 16;
+        vst1q_u8((uint8_t *)tmpp, q6u8);
+        tmpp += 16;
+        vst1q_u8((uint8_t *)tmpp, q7u8);
+        tmpp += 16;
+        vst1q_u8((uint8_t *)tmpp, q8u8);
+        tmpp += 16;
+        vst1q_u8((uint8_t *)tmpp, q9u8);
+
+        // secondpass_filter
+        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
+        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
+
+        tmpp = tmp;
+        tmpp2 = tmpp + 272;
+        q11u8 = vld1q_u8(tmpp);
+        tmpp += 16;
+        for (i = 4; i > 0; i--) {
+            q12u8 = vld1q_u8(tmpp);
+            tmpp += 16;
+            q13u8 = vld1q_u8(tmpp);
+            tmpp += 16;
+            q14u8 = vld1q_u8(tmpp);
+            tmpp += 16;
+            q15u8 = vld1q_u8(tmpp);
+            tmpp += 16;
+
+            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+            d2u8 = vqrshrn_n_u16(q1u16, 7);
+            d3u8 = vqrshrn_n_u16(q2u16, 7);
+            d4u8 = vqrshrn_n_u16(q3u16, 7);
+            d5u8 = vqrshrn_n_u16(q4u16, 7);
+            d6u8 = vqrshrn_n_u16(q5u16, 7);
+            d7u8 = vqrshrn_n_u16(q6u16, 7);
+            d8u8 = vqrshrn_n_u16(q7u16, 7);
+            d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+            q1u8 = vcombine_u8(d2u8, d3u8);
+            q2u8 = vcombine_u8(d4u8, d5u8);
+            q3u8 = vcombine_u8(d6u8, d7u8);
+            q4u8 = vcombine_u8(d8u8, d9u8);
+
+            q11u8 = q15u8;
+
+            vst1q_u8((uint8_t *)tmpp2, q1u8);
+            tmpp2 += 16;
+            vst1q_u8((uint8_t *)tmpp2, q2u8);
+            tmpp2 += 16;
+            vst1q_u8((uint8_t *)tmpp2, q3u8);
+            tmpp2 += 16;
+            vst1q_u8((uint8_t *)tmpp2, q4u8);
+            tmpp2 += 16;
+        }
+    }
+
+    // sub_pixel_variance16x16_neon
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    tmpp = tmp + 272;
+    for (i = 0; i < 8; i++) {  // sub_pixel_variance16x16_neon_loop
+        q0u8 = vld1q_u8(tmpp);
+        tmpp += 16;
+        q1u8 = vld1q_u8(tmpp);
+        tmpp += 16;
+        q2u8 = vld1q_u8(dst_ptr);
+        dst_ptr += dst_pixels_per_line;
+        q3u8 = vld1q_u8(dst_ptr);
+        dst_ptr += dst_pixels_per_line;
+
+        d0u8 = vget_low_u8(q0u8);
+        d1u8 = vget_high_u8(q0u8);
+        d2u8 = vget_low_u8(q1u8);
+        d3u8 = vget_high_u8(q1u8);
+
+        q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vget_low_s64(q0s64);
+    d1s64 = vget_high_s64(q0s64);
+    d2s64 = vget_low_s64(q1s64);
+    d3s64 = vget_high_s64(q1s64);
+    d0s64 = vadd_s64(d0s64, d1s64);
+    d1s64 = vadd_s64(d2s64, d3s64);
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_h_neon(
+        const unsigned char *src_ptr,
+        int  source_stride,
+        const unsigned char *ref_ptr,
+        int  recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64, d2s64, d3s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8;
+    uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8;
+    uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
+        q0u8 = vld1q_u8(src_ptr);
+        q1u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+        q2u8 = vld1q_u8(src_ptr);
+        q3u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+        q4u8 = vld1q_u8(src_ptr);
+        q5u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+        q6u8 = vld1q_u8(src_ptr);
+        q7u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+
+        q11u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q12u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q13u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q14u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+
+        q1u8 = vextq_u8(q0u8, q1u8, 1);
+        q3u8 = vextq_u8(q2u8, q3u8, 1);
+        q5u8 = vextq_u8(q4u8, q5u8, 1);
+        q7u8 = vextq_u8(q6u8, q7u8, 1);
+
+        q0u8 = vrhaddq_u8(q0u8, q1u8);
+        q1u8 = vrhaddq_u8(q2u8, q3u8);
+        q2u8 = vrhaddq_u8(q4u8, q5u8);
+        q3u8 = vrhaddq_u8(q6u8, q7u8);
+
+        d0u8 = vget_low_u8(q0u8);
+        d1u8 = vget_high_u8(q0u8);
+        d2u8 = vget_low_u8(q1u8);
+        d3u8 = vget_high_u8(q1u8);
+        d4u8 = vget_low_u8(q2u8);
+        d5u8 = vget_high_u8(q2u8);
+        d6u8 = vget_low_u8(q3u8);
+        d7u8 = vget_high_u8(q3u8);
+
+        q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8));
+        q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8));
+        q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8));
+        q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8));
+        q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8));
+        q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8));
+        q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8));
+        q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8));
+
+        d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16));
+        d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16));
+        q9s32 = vmlal_s16(q9s32, d8s16, d8s16);
+        q10s32 = vmlal_s16(q10s32, d9s16, d9s16);
+        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
+        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16));
+        q9s32 = vmlal_s16(q9s32, d10s16, d10s16);
+        q10s32 = vmlal_s16(q10s32, d11s16, d11s16);
+        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
+        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16));
+        q9s32 = vmlal_s16(q9s32, d12s16, d12s16);
+        q10s32 = vmlal_s16(q10s32, d13s16, d13s16);
+        d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16));
+        d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16));
+        q9s32 = vmlal_s16(q9s32, d14s16, d14s16);
+        q10s32 = vmlal_s16(q10s32, d15s16, d15s16);
+        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
+        q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
+        q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
+        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
+        q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
+        q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
+        d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
+        d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
+        q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
+        q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
+        d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
+        d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
+        q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
+        q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vget_low_s64(q0s64);
+    d1s64 = vget_high_s64(q0s64);
+    d2s64 = vget_low_s64(q1s64);
+    d3s64 = vget_high_s64(q1s64);
+    d0s64 = vadd_s64(d0s64, d1s64);
+    d1s64 = vadd_s64(d2s64, d3s64);
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_v_neon(
+        const unsigned char *src_ptr,
+        int  source_stride,
+        const unsigned char *ref_ptr,
+        int  recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64, d2s64, d3s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8;
+    uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    q0u8 = vld1q_u8(src_ptr);
+    src_ptr += source_stride;
+    for (i = 0; i < 4; i++) {  // vp8_filt_spo16x16s_0_4_loop_neon
+        q2u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q4u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q6u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q15u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+
+        q1u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q5u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q7u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+
+        q0u8 = vrhaddq_u8(q0u8, q2u8);
+        q2u8 = vrhaddq_u8(q2u8, q4u8);
+        q4u8 = vrhaddq_u8(q4u8, q6u8);
+        q6u8 = vrhaddq_u8(q6u8, q15u8);
+
+        d0u8  = vget_low_u8(q0u8);
+        d1u8  = vget_high_u8(q0u8);
+        d4u8  = vget_low_u8(q2u8);
+        d5u8  = vget_high_u8(q2u8);
+        d8u8  = vget_low_u8(q4u8);
+        d9u8  = vget_high_u8(q4u8);
+        d12u8 = vget_low_u8(q6u8);
+        d13u8 = vget_high_u8(q6u8);
+
+        q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8));
+        q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8));
+        q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8));
+        q0u16  = vsubl_u8(d8u8, vget_low_u8(q5u8));
+        q1u16  = vsubl_u8(d9u8, vget_high_u8(q5u8));
+        q2u16  = vsubl_u8(d12u8, vget_low_u8(q7u8));
+        q3u16  = vsubl_u8(d13u8, vget_high_u8(q7u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
+        q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
+        q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
+        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
+        q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
+        q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
+        d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
+        d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
+        q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
+        q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
+        d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
+        d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
+        q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
+        q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
+
+        q0u8 = q15u8;  // carry the last loaded source row into the next iteration
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vget_low_s64(q0s64);
+    d1s64 = vget_high_s64(q0s64);
+    d2s64 = vget_low_s64(q1s64);
+    d3s64 = vget_high_s64(q1s64);
+    d0s64 = vadd_s64(d0s64, d1s64);
+    d1s64 = vadd_s64(d2s64, d3s64);
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_hv_neon(
+        const unsigned char *src_ptr,
+        int  source_stride,
+        const unsigned char *ref_ptr,
+        int  recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
+    int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64, d2s64, d3s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
+    uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
+    int32x4_t q13s32, q14s32, q15s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q13s32 = vdupq_n_s32(0);
+    q14s32 = vdupq_n_s32(0);
+    q15s32 = vdupq_n_s32(0);
+
+    q0u8 = vld1q_u8(src_ptr);
+    q1u8 = vld1q_u8(src_ptr + 16);
+    src_ptr += source_stride;
+    q1u8 = vextq_u8(q0u8, q1u8, 1);
+    q0u8 = vrhaddq_u8(q0u8, q1u8);
+    for (i = 0; i < 4; i++) {  // horizontal + vertical half-pel loop, 4 rows per iteration
+        q2u8 = vld1q_u8(src_ptr);
+        q3u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+        q4u8 = vld1q_u8(src_ptr);
+        q5u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+        q6u8 = vld1q_u8(src_ptr);
+        q7u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+        q8u8 = vld1q_u8(src_ptr);
+        q9u8 = vld1q_u8(src_ptr + 16);
+        src_ptr += source_stride;
+
+        q3u8 = vextq_u8(q2u8, q3u8, 1);
+        q5u8 = vextq_u8(q4u8, q5u8, 1);
+        q7u8 = vextq_u8(q6u8, q7u8, 1);
+        q9u8 = vextq_u8(q8u8, q9u8, 1);
+
+        q1u8 = vrhaddq_u8(q2u8, q3u8);
+        q2u8 = vrhaddq_u8(q4u8, q5u8);
+        q3u8 = vrhaddq_u8(q6u8, q7u8);
+        q4u8 = vrhaddq_u8(q8u8, q9u8);
+        q0u8 = vrhaddq_u8(q0u8, q1u8);
+        q1u8 = vrhaddq_u8(q1u8, q2u8);
+        q2u8 = vrhaddq_u8(q2u8, q3u8);
+        q3u8 = vrhaddq_u8(q3u8, q4u8);
+
+        q5u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q6u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q7u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q8u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+
+        d0u8 = vget_low_u8(q0u8);
+        d1u8 = vget_high_u8(q0u8);
+        d2u8 = vget_low_u8(q1u8);
+        d3u8 = vget_high_u8(q1u8);
+        d4u8 = vget_low_u8(q2u8);
+        d5u8 = vget_high_u8(q2u8);
+        d6u8 = vget_low_u8(q3u8);
+        d7u8 = vget_high_u8(q3u8);
+
+        q9u16  = vsubl_u8(d0u8, vget_low_u8(q5u8));
+        q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
+        q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
+        q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
+        q0u16  = vsubl_u8(d4u8, vget_low_u8(q7u8));
+        q1u16  = vsubl_u8(d5u8, vget_high_u8(q7u8));
+        q5u16  = vsubl_u8(d6u8, vget_low_u8(q8u8));
+        q6u16  = vsubl_u8(d7u8, vget_high_u8(q8u8));
+
+        d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+        d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
+        q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
+        q15s32 = vmlal_s16(q15s32, d19s16, d19s16);
+
+        d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+        d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
+        q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
+        q15s32 = vmlal_s16(q15s32, d21s16, d21s16);
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
+        q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
+        q15s32 = vmlal_s16(q15s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
+        q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
+        q15s32 = vmlal_s16(q15s32, d25s16, d25s16);
+
+        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
+        q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
+        q15s32 = vmlal_s16(q15s32, d1s16, d1s16);
+
+        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
+        q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
+        q15s32 = vmlal_s16(q15s32, d3s16, d3s16);
+
+        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
+        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
+        q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
+        q15s32 = vmlal_s16(q15s32, d11s16, d11s16);
+
+        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
+        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
+        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
+        q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
+        q15s32 = vmlal_s16(q15s32, d13s16, d13s16);
+
+        q0u8 = q4u8;  // carry the last horizontally averaged row into the next iteration
+    }
+
+    q15s32 = vaddq_s32(q14s32, q15s32);
+    q0s64 = vpaddlq_s32(q13s32);
+    q1s64 = vpaddlq_s32(q15s32);
+
+    d0s64 = vget_low_s64(q0s64);
+    d1s64 = vget_high_s64(q0s64);
+    d2s64 = vget_low_s64(q1s64);
+    d3s64 = vget_high_s64(q1s64);
+    d0s64 = vadd_s64(d0s64, d1s64);
+    d1s64 = vadd_s64(d2s64, d3s64);
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
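+
+/* The three halfpix variants build their half-pel predictions from a single
+ * identity (sketch; helper name hypothetical): vrhaddq_u8 is a per-byte
+ * rounding average, so no widening to 16 bits is needed before the filter. */
+static unsigned char half_pel_avg_sketch(unsigned char a, unsigned char b) {
+    return (unsigned char)((a + b + 1) >> 1);
+}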
+
+enum { kWidth8 = 8 };
+enum { kHeight8 = 8 };
+enum { kHeight8PlusOne = 9 };
+enum { kPixelStepOne = 1 };
+enum { kAlign16 = 16 };
+
+#define FILTER_BITS 7
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  const int32x4_t a = vpaddlq_s16(v_16x8);
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t b = vpaddlq_s32(v_32x4);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
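+
+/* Scalar view of the two helpers above (sketch): fold a vector into the sum
+ * of its lanes; the pairwise widening adds keep intermediates overflow-free. */
+static int horizontal_add_s16x8_sketch(const int16_t v[8]) {
+  int64_t t = 0;
+  int i;
+  for (i = 0; i < 8; ++i) t += v[i];
+  return (int)t;
+}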
+
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+                             const uint8_t *b, int b_stride,
+                             int w, int h, unsigned int *sse, int *sum) {
+  int i, j;
+  int16x8_t v_sum = vdupq_n_s16(0);
+  int32x4_t v_sse_lo = vdupq_n_s32(0);
+  int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 8) {
+      const uint8x8_t v_a = vld1_u8(&a[j]);
+      const uint8x8_t v_b = vld1_u8(&b[j]);
+      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+      v_sum = vaddq_s16(v_sum, sv_diff);
+      v_sse_lo = vmlal_s16(v_sse_lo,
+                           vget_low_s16(sv_diff),
+                           vget_low_s16(sv_diff));
+      v_sse_hi = vmlal_s16(v_sse_hi,
+                           vget_high_s16(sv_diff),
+                           vget_high_s16(sv_diff));
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+
+  *sum = horizontal_add_s16x8(v_sum);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
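+
+/* Reference sketch of variance_neon_w8 (hypothetical helper; like the vector
+ * loop it assumes w is a multiple of 8): */
+static void variance_w8_ref_sketch(const uint8_t *a, int a_stride,
+                                   const uint8_t *b, int b_stride,
+                                   int w, int h, unsigned int *sse, int *sum) {
+  int i, j, s = 0;
+  int64_t ss = 0;
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      const int d = a[j] - b[j];
+      s += d;
+      ss += d * d;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+  *sum = s;
+  *sse = (unsigned int)ss;
+}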
+
+static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
+                                     const uint8_t *b, int b_stride,
+                                     unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
+  return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
+}
+
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+                                      uint8_t *output_ptr,
+                                      unsigned int src_pixels_per_line,
+                                      int pixel_step,
+                                      unsigned int output_height,
+                                      unsigned int output_width,
+                                      const uint16_t *vpx_filter) {
+  const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);
+  const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);
+  unsigned int i;
+  for (i = 0; i < output_height; ++i) {
+    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+    const uint16x8_t a = vmull_u8(src_0, f0);
+    const uint16x8_t b = vmlal_u8(a, src_1, f1);
+    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+    vst1_u8(&output_ptr[0], out);
+    // Next row...
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
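+
+/* Per-pixel form of the kernel above (sketch; helper name hypothetical):
+ * the two taps sum to 1 << FILTER_BITS, and vrshrn_n_u16(b, FILTER_BITS)
+ * is the rounding divide by 128. */
+static uint8_t bilinear_tap_sketch(const uint8_t *src, int pixel_step,
+                                   uint16_t f0, uint16_t f1) {
+  return (uint8_t)((src[0] * f0 + src[pixel_step] * f1 + 64) >> 7);
+}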
+
+unsigned int vp8_sub_pixel_variance8x8_neon(
+        const unsigned char *src,
+        int src_stride,
+        int xoffset,
+        int yoffset,
+        const unsigned char *dst,
+        int dst_stride,
+        unsigned int *sse) {
+  // Sized for the 9-row intermediate the yoffset == 0 path writes below.
+  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8PlusOne * kWidth8);
+  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
+  if (xoffset == 0) {
+    // Vertical-only filtering: the second tap is one source row below, so
+    // the pixel step must be the source stride, not the block width.
+    var_filter_block2d_bil_w8(src, temp2, src_stride, src_stride, kHeight8,
+                              kWidth8, bilinear_taps_coeff[yoffset]);
+  } else if (yoffset == 0) {
+    var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne,
+                              kHeight8PlusOne, kWidth8,
+                              bilinear_taps_coeff[xoffset]);
+  } else {
+    var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
+                              kHeight8PlusOne, kWidth8,
+                              bilinear_taps_coeff[xoffset]);
+    var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
+                              kWidth8, bilinear_taps_coeff[yoffset]);
+  }
+  return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
+}
+
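+/* For reference, a sketch of the two-tap table the dispatch above indexes.
+ * The real bilinear_taps_coeff is defined elsewhere in this file; the values
+ * below are vp8's standard bilinear coefficients, each row summing to
+ * 1 << FILTER_BITS. */
+static const uint16_t bilinear_taps_coeff_sketch[8][2] = {
+    {128,   0}, {112,  16}, {96,  32}, {80,  48},
+    { 64,  64}, { 48,  80}, {32,  96}, {16, 112}
+};
+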
diff --git a/vp8/common/arm/reconintra_arm.c b/vp8/common/arm/reconintra_arm.c
deleted file mode 100644
index e55a33c..0000000
--- a/vp8/common/arm/reconintra_arm.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vp8/common/blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-#if HAVE_NEON_ASM
-extern void vp8_build_intra_predictors_mby_neon_func(
-    unsigned char *y_buffer,
-    unsigned char *ypred_ptr,
-    int y_stride,
-    int mode,
-    int Up,
-    int Left);
-
-void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
-{
-    unsigned char *y_buffer = x->dst.y_buffer;
-    unsigned char *ypred_ptr = x->predictor;
-    int y_stride = x->dst.y_stride;
-    int mode = x->mode_info_context->mbmi.mode;
-    int Up = x->up_available;
-    int Left = x->left_available;
-
-    vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
-}
-
-extern void vp8_build_intra_predictors_mby_s_neon_func(
-    unsigned char *y_buffer,
-    unsigned char *ypred_ptr,
-    int y_stride,
-    int mode,
-    int Up,
-    int Left);
-
-void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
-{
-    unsigned char *y_buffer = x->dst.y_buffer;
-    unsigned char *ypred_ptr = x->predictor;
-    int y_stride = x->dst.y_stride;
-    int mode = x->mode_info_context->mbmi.mode;
-    int Up = x->up_available;
-    int Left = x->left_available;
-
-    vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
-}
-
-#endif
diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c
index e3f7083..467a509 100644
--- a/vp8/common/arm/variance_arm.c
+++ b/vp8/common/arm/variance_arm.c
@@ -95,7 +95,7 @@
 #endif /* HAVE_MEDIA */
 
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON
 
 extern unsigned int vp8_sub_pixel_variance16x16_neon_func
 (
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index a46fbfb..d48c4fe 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -108,7 +108,8 @@
          * For temporal denoiser: noise_sensitivity = 0 means off,
          * noise_sensitivity = 1 means temporal denoiser on for Y channel only,
          * noise_sensitivity = 2 means temporal denoiser on for all channels.
-         * noise_sensitivity >= 3 means aggressive denoising mode.
+         * noise_sensitivity = 3 means aggressive denoising mode.
+         * noise_sensitivity >= 4 means adaptive denoising mode.
          * Temporal denoiser is enabled via the configuration option:
          * CONFIG_TEMPORAL_DENOISING.
          * For spatial denoiser: noise_sensitivity controls the amount of
@@ -223,7 +224,7 @@
         int arnr_strength;
         int arnr_type;
 
-        struct vpx_fixed_buf        two_pass_stats_in;
+        vpx_fixed_buf_t        two_pass_stats_in;
         struct vpx_codec_pkt_list  *output_pkt_list;
 
         vp8e_tuning tuning;
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index fd9afd2..0070c28 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -38,15 +38,13 @@
 $vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
 
 add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2/;
 $vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
-$vp8_dequant_idct_add_y_block_neon_asm=vp8_dequant_idct_add_y_block_neon;
 $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
 
 add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2/;
 $vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
-$vp8_dequant_idct_add_uv_block_neon_asm=vp8_dequant_idct_add_uv_block_neon;
 $vp8_dequant_idct_add_uv_block_dspr2=vp8_dequant_idct_add_uv_block_dspr2;
 
 #
@@ -58,9 +56,8 @@
 $vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
 
 add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2/;
 $vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
-$vp8_loop_filter_bv_neon_asm=vp8_loop_filter_bv_neon;
 $vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
 
 add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
@@ -69,19 +66,18 @@
 $vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
 
 add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2/;
 $vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
-$vp8_loop_filter_bh_neon_asm=vp8_loop_filter_bh_neon;
 $vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
 
 
 add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon_asm/;
+specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon/;
 $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
 $vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
 $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
 $vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
-$vp8_loop_filter_simple_mbv_neon_asm=vp8_loop_filter_mbvs_neon;
+$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
 
 add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
 specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/;
@@ -92,12 +88,12 @@
 $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
 
 add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon_asm/;
+specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon/;
 $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
 $vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
 $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
 $vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
-$vp8_loop_filter_simple_bv_neon_asm=vp8_loop_filter_bvs_neon;
+$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
 
 add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
 specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/;
@@ -153,11 +149,10 @@
 $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
 
 add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride";
-specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3/;
-#TODO: fix assembly for neon
+specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon/;
 
 add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride";
-specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3/;
+specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon/;
 
 add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left";
 specialize qw/vp8_intra4x4_predict media/;
@@ -294,22 +289,19 @@
 $vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;
 
 add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon_asm/;
+specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/;
 $vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
 $vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
-$vp8_variance_halfpixvar16x16_h_neon_asm=vp8_variance_halfpixvar16x16_h_neon;
 
 add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon_asm/;
+specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/;
 $vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
 $vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
-$vp8_variance_halfpixvar16x16_v_neon_asm=vp8_variance_halfpixvar16x16_v_neon;
 
 add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon_asm/;
+specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
 $vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
 $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
-$vp8_variance_halfpixvar16x16_hv_neon_asm=vp8_variance_halfpixvar16x16_hv_neon;
 
 #
 # Single block SAD
@@ -446,19 +438,16 @@
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon/;
 $vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
-$vp8_short_fdct4x4_neon_asm=vp8_short_fdct4x4_neon;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/;
 $vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
-$vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 media neon_asm/;
+specialize qw/vp8_short_walsh4x4 sse2 media neon/;
 $vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
-$vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
 
 #
 # Quantizer
@@ -503,19 +492,16 @@
 $vp8_mbuverror_sse2=vp8_mbuverror_xmm;
 
 add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
-specialize qw/vp8_subtract_b mmx sse2 media neon_asm/;
+specialize qw/vp8_subtract_b mmx sse2 media neon/;
 $vp8_subtract_b_media=vp8_subtract_b_armv6;
-$vp8_subtract_b_neon_asm=vp8_subtract_b_neon;
 
 add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
-specialize qw/vp8_subtract_mby mmx sse2 media neon_asm/;
+specialize qw/vp8_subtract_mby mmx sse2 media neon/;
 $vp8_subtract_mby_media=vp8_subtract_mby_armv6;
-$vp8_subtract_mby_neon_asm=vp8_subtract_mby_neon;
 
 add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
-specialize qw/vp8_subtract_mbuv mmx sse2 media neon_asm/;
+specialize qw/vp8_subtract_mbuv mmx sse2 media neon/;
 $vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6;
-$vp8_subtract_mbuv_neon_asm=vp8_subtract_mbuv_neon;
 
 #
 # Motion search
@@ -541,13 +527,6 @@
 }
 
 #
-# Pick Loopfilter
-#
-add_proto qw/void vp8_yv12_copy_partial_frame/, "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vp8_yv12_copy_partial_frame neon_asm/;
-$vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
-
-#
 # Denoiser filter
 #
 if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
@@ -555,7 +534,6 @@
     specialize qw/vp8_denoiser_filter sse2 neon/;
     add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
     specialize qw/vp8_denoiser_filter_uv sse2 neon/;
-
 }
 
 # End of encoder only functions
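
For reference, rtcd.pl expands each specialize line above into run-time
dispatch in the generated vp8_rtcd.h. A sketch of the generated shape (not
verbatim tool output) for one function switched from neon_asm to neon:

RTCD_EXTERN void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);

static void setup_rtcd_internal(void)
{
    int flags = arm_cpu_caps();

    vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
#if HAVE_NEON
    if (flags & HAS_NEON) vp8_short_fdct4x4 = vp8_short_fdct4x4_neon;
#endif
}

Dropping the neon_asm suffix works because vp8_short_fdct4x4_neon is now
provided by the intrinsics file rather than the deleted .asm source.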
diff --git a/vp8/encoder/arm/neon/picklpf_arm.c b/vp8/encoder/arm/neon/picklpf_arm.c
deleted file mode 100644
index ec8071e..0000000
--- a/vp8/encoder/arm/neon/picklpf_arm.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/loopfilter.h"
-#include "vpx_scale/yv12config.h"
-
-extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr,
-                                    unsigned char *src_ptr,
-                                    int sz);
-
-
-void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
-                                      YV12_BUFFER_CONFIG *dst_ybc)
-{
-    unsigned char *src_y, *dst_y;
-    int yheight;
-    int ystride;
-    int yoffset;
-    int linestocopy;
-
-    yheight  = src_ybc->y_height;
-    ystride  = src_ybc->y_stride;
-
-    /* number of MB rows to use in partial filtering */
-    linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
-    linestocopy = linestocopy ? linestocopy << 4 : 16;     /* 16 lines per MB */
-
-    /* Copy extra 4 so that full filter context is available if filtering done
-     * on the copied partial frame and not original. Partial filter does mb
-     * filtering for top row also, which can modify 3 pixels above.
-     */
-    linestocopy += 4;
-    /* partial image starts at ~middle of frame (macroblock border) */
-    yoffset  = ystride * (((yheight >> 5) * 16) - 4);
-    src_y = src_ybc->y_buffer + yoffset;
-    dst_y = dst_ybc->y_buffer + yoffset;
-
-    vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy);
-}
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.asm b/vp8/encoder/arm/neon/shortfdct_neon.asm
deleted file mode 100644
index 5ea8dd8..0000000
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ /dev/null
@@ -1,221 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_fdct4x4_neon|
-    EXPORT  |vp8_short_fdct8x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=4
-
-
-    ALIGN 16    ; enable use of @128 bit aligned loads
-coeff
-    DCW      5352,  5352,  5352, 5352
-    DCW      2217,  2217,  2217, 2217
-    DCD     14500, 14500, 14500, 14500
-    DCD      7500,  7500,  7500, 7500
-    DCD     12000, 12000, 12000, 12000
-    DCD     51000, 51000, 51000, 51000
-
-;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_neon| PROC
-
-    ; Part one
-    vld1.16         {d0}, [r0@64], r2
-    adr             r12, coeff
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {d2}, [r0@64], r2
-    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
-    vld1.16         {d3}, [r0@64], r2
-
-    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
-    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
-    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
-    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
-    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
-
-    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
-    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
-    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12
-
-
-    ; Part two
-
-    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vmov.s16        d26, #7
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
-    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
-    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
-    vadd.s16        d4, d4, d26     ; a1 + 7
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
-    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
-
-    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
-    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
-
-    vceq.s16        d4, d7, #0
-
-    vshr.s16        d0, d0, #4
-    vshr.s16        d2, d2, #4
-
-    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
-
-    vmvn            d4, d4
-    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
-    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct8x4_neon| PROC
-
-    ; Part one
-
-    vld1.16         {q0}, [r0@128], r2
-    adr             r12, coeff
-    vld1.16         {q1}, [r0@128], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {q2}, [r0@128], r2
-    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
-    vld1.16         {q3}, [r0@128], r2
-
-    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
-    vtrn.32         q0, q2          ; [A0|B0]
-    vtrn.32         q1, q3          ; [A1|B1]
-    vtrn.16         q0, q1          ; [A2|B2]
-    vtrn.16         q2, q3          ; [A3|B3]
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
-    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
-    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q11, q11, #3    ; a1 << 3
-    vshl.s16        q12, q12, #3    ; b1 << 3
-    vshl.s16        q13, q13, #3    ; c1 << 3
-    vshl.s16        q14, q14, #3    ; d1 << 3
-
-    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
-    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
-
-    vmov.s16        q11, q9         ; 14500
-    vmov.s16        q12, q10        ; 7500
-
-    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
-    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
-    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
-    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
-
-    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
-    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
-    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12
-
-
-    ; Part two
-    vld1.32         {q9,q10}, [r12@128]    ; q9=12000, q10=51000
-
-    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
-    vtrn.32         q0, q2          ; q0=[A0 | B0]
-    vtrn.32         q1, q3          ; q1=[A4 | B4]
-    vtrn.16         q0, q1          ; q2=[A8 | B8]
-    vtrn.16         q2, q3          ; q3=[A12|B12]
-
-    vmov.s16        q15, #7
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
-    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
-    vadd.s16        q11, q11, q15   ; a1 + 7
-    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
-
-    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
-    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
-
-    vmov.s16        q11, q9         ; 12000
-    vmov.s16        q12, q10        ; 51000
-
-    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d2, d2, #4      ; A[8] = (a1 + b1 + 7)>>4
-    vshr.s16        d6, d3, #4      ; B[8] = (a1 + b1 + 7)>>4
-
-
-    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
-    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
-    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
-    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
-
-    vceq.s16        q14, q14, #0
-
-    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
-    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
-
-    vmvn            q14, q14
-
-    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)
-
-    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)
-
-    vst1.16         {q0, q1}, [r1@128]! ; block A
-    vst1.16         {q2, q3}, [r1@128]! ; block B
-
-    bx              lr
-
-    ENDP
-
-    END
-
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.c b/vp8/encoder/arm/neon/shortfdct_neon.c
new file mode 100644
index 0000000..391e5f9
--- /dev/null
+++ b/vp8/encoder/arm/neon/shortfdct_neon.c
@@ -0,0 +1,269 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_fdct4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, dEmptys16;
+    uint16x4_t d4u16;
+    int16x8_t q0s16, q1s16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+    q11s32 = vdupq_n_s32(12000);
+    q12s32 = vdupq_n_s32(51000);
+
+    // Part one
+    pitch >>= 1;
+    d0s16 = vld1_s16(input);
+    input += pitch;
+    d1s16 = vld1_s16(input);
+    input += pitch;
+    d2s16 = vld1_s16(input);
+    input += pitch;
+    d3s16 = vld1_s16(input);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d4s16 = vshl_n_s16(d4s16, 3);
+    d5s16 = vshl_n_s16(d5s16, 3);
+    d6s16 = vshl_n_s16(d6s16, 3);
+    d7s16 = vshl_n_s16(d7s16, 3);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
+    q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 12);
+    d3s16 = vshrn_n_s32(q10s32, 12);
+
+    // Part two
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d26s16 = vdup_n_s16(7);
+    d4s16 = vadd_s16(d4s16, d26s16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
+
+    dEmptys16 = vdup_n_s16(0);
+    d4u16 = vceq_s16(d7s16, dEmptys16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+
+    q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
+
+    d4u16 = vmvn_u16(d4u16);
+    d1s16 = vshrn_n_s32(q11s32, 16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
+    d3s16 = vshrn_n_s32(q12s32, 16);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
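+
+/* For comparison, a close paraphrase of the scalar transform this mirrors
+ * (see vp8_short_fdct4x4_c in vp8/encoder/dct.c for the canonical version;
+ * the _ref_sketch name exists only for this illustration): */
+static void vp8_short_fdct4x4_ref_sketch(int16_t *input, int16_t *output,
+                                         int pitch) {
+    int i;
+    int a1, b1, c1, d1;
+    int16_t *ip = input;
+    int16_t *op = output;
+
+    for (i = 0; i < 4; i++) {  // pitch is in bytes; rows are pitch/2 shorts
+        a1 = (ip[0] + ip[3]) * 8;
+        b1 = (ip[1] + ip[2]) * 8;
+        c1 = (ip[1] - ip[2]) * 8;
+        d1 = (ip[0] - ip[3]) * 8;
+
+        op[0] = (int16_t)(a1 + b1);
+        op[2] = (int16_t)(a1 - b1);
+        op[1] = (int16_t)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
+        op[3] = (int16_t)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
+
+        ip += pitch / 2;
+        op += 4;
+    }
+    ip = output;
+    op = output;
+    for (i = 0; i < 4; i++) {  // second pass walks columns of the output
+        a1 = ip[0] + ip[12];
+        b1 = ip[4] + ip[8];
+        c1 = ip[4] - ip[8];
+        d1 = ip[0] - ip[12];
+
+        op[0] = (int16_t)((a1 + b1 + 7) >> 4);
+        op[8] = (int16_t)((a1 - b1 + 7) >> 4);
+        op[4] = (int16_t)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
+        op[12] = (int16_t)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
+
+        ip++;
+        op++;
+    }
+}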
+
+void vp8_short_fdct8x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
+    uint16x4_t d28u16, d29u16;
+    uint16x8_t q14u16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16;
+    int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x8x2_t v2tmp0, v2tmp1;
+    int32x4x2_t v2tmp2, v2tmp3;
+
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+
+    // Part one
+    pitch >>= 1;
+    q0s16 = vld1q_s16(input);
+    input += pitch;
+    q1s16 = vld1q_s16(input);
+    input += pitch;
+    q2s16 = vld1q_s16(input);
+    input += pitch;
+    q3s16 = vld1q_s16(input);
+
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    q11s16 = vshlq_n_s16(q11s16, 3);
+    q12s16 = vshlq_n_s16(q12s16, 3);
+    q13s16 = vshlq_n_s16(q13s16, 3);
+    q14s16 = vshlq_n_s16(q14s16, 3);
+
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q2s16 = vsubq_s16(q11s16, q12s16);
+
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    d2s16 = vshrn_n_s32(q9s32, 12);
+    d6s16 = vshrn_n_s32(q10s32, 12);
+    d3s16 = vshrn_n_s32(q11s32, 12);
+    d7s16 = vshrn_n_s32(q12s32, 12);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    // Part two
+    q9s32 = vdupq_n_s32(12000);
+    q10s32 = vdupq_n_s32(51000);
+
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    q15s16 = vdupq_n_s16(7);
+    q11s16 = vaddq_s16(q11s16, q15s16);
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q1s16 = vsubq_s16(q11s16, q12s16);
+
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d0s16 = vget_low_s16(q0s16);
+    d1s16 = vget_high_s16(q0s16);
+    d2s16 = vget_low_s16(q1s16);
+    d3s16 = vget_high_s16(q1s16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d4s16 = vshr_n_s16(d1s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+    d6s16 = vshr_n_s16(d3s16, 4);
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 16);
+    d3s16 = vshrn_n_s32(q10s32, 16);
+    d5s16 = vshrn_n_s32(q11s32, 16);
+    d7s16 = vshrn_n_s32(q12s32, 16);
+
+    qEmptys16 = vdupq_n_s16(0);
+    q14u16 = vceqq_s16(q14s16, qEmptys16);
+    q14u16 = vmvnq_u16(q14u16);
+
+    d28u16 = vget_low_u16(q14u16);
+    d29u16 = vget_high_u16(q14u16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
+    d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    vst1q_s16(output + 16, q2s16);
+    vst1q_s16(output + 24, q3s16);
+    return;
+}
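+
+/* Sketch: the 8x4 transform is two 4x4 transforms side by side, as in the
+ * scalar vp8_short_fdct8x4_c (left half to output[0..15], right half to
+ * output[16..31]): */
+static void vp8_short_fdct8x4_ref_sketch(int16_t *input, int16_t *output,
+                                         int pitch) {
+    vp8_short_fdct4x4_ref_sketch(input, output, pitch);
+    vp8_short_fdct4x4_ref_sketch(input + 4, output + 16, pitch);
+}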
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
deleted file mode 100644
index 840cb33..0000000
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ /dev/null
@@ -1,205 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_subtract_b_neon|
-    EXPORT |vp8_subtract_mby_neon|
-    EXPORT |vp8_subtract_mbuv_neon|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
-|vp8_subtract_b_neon| PROC
-
-    stmfd   sp!, {r4-r7}
-
-    ldr     r3, [r0, #vp8_block_base_src]
-    ldr     r4, [r0, #vp8_block_src]
-    ldr     r5, [r0, #vp8_block_src_diff]
-    ldr     r3, [r3]
-    ldr     r6, [r0, #vp8_block_src_stride]
-    add     r3, r3, r4                      ; src = *base_src + src
-    ldr     r7, [r1, #vp8_blockd_predictor]
-
-    vld1.8          {d0}, [r3], r6          ;load src
-    vld1.8          {d1}, [r7], r2          ;load pred
-    vld1.8          {d2}, [r3], r6
-    vld1.8          {d3}, [r7], r2
-    vld1.8          {d4}, [r3], r6
-    vld1.8          {d5}, [r7], r2
-    vld1.8          {d6}, [r3], r6
-    vld1.8          {d7}, [r7], r2
-
-    vsubl.u8        q10, d0, d1
-    vsubl.u8        q11, d2, d3
-    vsubl.u8        q12, d4, d5
-    vsubl.u8        q13, d6, d7
-
-    mov             r2, r2, lsl #1
-
-    vst1.16         {d20}, [r5], r2         ;store diff
-    vst1.16         {d22}, [r5], r2
-    vst1.16         {d24}, [r5], r2
-    vst1.16         {d26}, [r5], r2
-
-    ldmfd   sp!, {r4-r7}
-    bx              lr
-
-    ENDP
-
-
-;==========================================
-;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride
-;                           unsigned char *pred, int pred_stride)
-|vp8_subtract_mby_neon| PROC
-    push            {r4-r7}
-    vpush           {d8-d15}
-
-    mov             r12, #4
-    ldr             r4, [sp, #80]           ; pred_stride
-    mov             r6, #32                 ; "diff" stride x2
-    add             r5, r0, #16             ; second diff pointer
-
-subtract_mby_loop
-    vld1.8          {q0}, [r1], r2          ;load src
-    vld1.8          {q1}, [r3], r4          ;load pred
-    vld1.8          {q2}, [r1], r2
-    vld1.8          {q3}, [r3], r4
-    vld1.8          {q4}, [r1], r2
-    vld1.8          {q5}, [r3], r4
-    vld1.8          {q6}, [r1], r2
-    vld1.8          {q7}, [r3], r4
-
-    vsubl.u8        q8, d0, d2
-    vsubl.u8        q9, d1, d3
-    vsubl.u8        q10, d4, d6
-    vsubl.u8        q11, d5, d7
-    vsubl.u8        q12, d8, d10
-    vsubl.u8        q13, d9, d11
-    vsubl.u8        q14, d12, d14
-    vsubl.u8        q15, d13, d15
-
-    vst1.16         {q8}, [r0], r6          ;store diff
-    vst1.16         {q9}, [r5], r6
-    vst1.16         {q10}, [r0], r6
-    vst1.16         {q11}, [r5], r6
-    vst1.16         {q12}, [r0], r6
-    vst1.16         {q13}, [r5], r6
-    vst1.16         {q14}, [r0], r6
-    vst1.16         {q15}, [r5], r6
-
-    subs            r12, r12, #1
-    bne             subtract_mby_loop
-
-    vpop            {d8-d15}
-    pop             {r4-r7}
-    bx              lr
-    ENDP
-
-;=================================
-;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
-;                         int src_stride, unsigned char *upred,
-;                         unsigned char *vpred, int pred_stride)
-
-|vp8_subtract_mbuv_neon| PROC
-    push            {r4-r7}
-    vpush           {d8-d15}
-
-    ldr             r4, [sp, #80]       ; upred
-    ldr             r5, [sp, #84]       ; vpred
-    ldr             r6, [sp, #88]       ; pred_stride
-    add             r0, r0, #512        ; short *udiff = diff + 256;
-    mov             r12, #32            ; "diff" stride x2
-    add             r7, r0, #16         ; second diff pointer
-
-;u
-    vld1.8          {d0}, [r1], r3      ;load usrc
-    vld1.8          {d1}, [r4], r6      ;load upred
-    vld1.8          {d2}, [r1], r3
-    vld1.8          {d3}, [r4], r6
-    vld1.8          {d4}, [r1], r3
-    vld1.8          {d5}, [r4], r6
-    vld1.8          {d6}, [r1], r3
-    vld1.8          {d7}, [r4], r6
-    vld1.8          {d8}, [r1], r3
-    vld1.8          {d9}, [r4], r6
-    vld1.8          {d10}, [r1], r3
-    vld1.8          {d11}, [r4], r6
-    vld1.8          {d12}, [r1], r3
-    vld1.8          {d13}, [r4], r6
-    vld1.8          {d14}, [r1], r3
-    vld1.8          {d15}, [r4], r6
-
-    vsubl.u8        q8, d0, d1
-    vsubl.u8        q9, d2, d3
-    vsubl.u8        q10, d4, d5
-    vsubl.u8        q11, d6, d7
-    vsubl.u8        q12, d8, d9
-    vsubl.u8        q13, d10, d11
-    vsubl.u8        q14, d12, d13
-    vsubl.u8        q15, d14, d15
-
-    vst1.16         {q8}, [r0], r12     ;store diff
-    vst1.16         {q9}, [r7], r12
-    vst1.16         {q10}, [r0], r12
-    vst1.16         {q11}, [r7], r12
-    vst1.16         {q12}, [r0], r12
-    vst1.16         {q13}, [r7], r12
-    vst1.16         {q14}, [r0], r12
-    vst1.16         {q15}, [r7], r12
-
-;v
-    vld1.8          {d0}, [r2], r3      ;load vsrc
-    vld1.8          {d1}, [r5], r6      ;load vpred
-    vld1.8          {d2}, [r2], r3
-    vld1.8          {d3}, [r5], r6
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d5}, [r5], r6
-    vld1.8          {d6}, [r2], r3
-    vld1.8          {d7}, [r5], r6
-    vld1.8          {d8}, [r2], r3
-    vld1.8          {d9}, [r5], r6
-    vld1.8          {d10}, [r2], r3
-    vld1.8          {d11}, [r5], r6
-    vld1.8          {d12}, [r2], r3
-    vld1.8          {d13}, [r5], r6
-    vld1.8          {d14}, [r2], r3
-    vld1.8          {d15}, [r5], r6
-
-    vsubl.u8        q8, d0, d1
-    vsubl.u8        q9, d2, d3
-    vsubl.u8        q10, d4, d5
-    vsubl.u8        q11, d6, d7
-    vsubl.u8        q12, d8, d9
-    vsubl.u8        q13, d10, d11
-    vsubl.u8        q14, d12, d13
-    vsubl.u8        q15, d14, d15
-
-    vst1.16         {q8}, [r0], r12     ;store diff
-    vst1.16         {q9}, [r7], r12
-    vst1.16         {q10}, [r0], r12
-    vst1.16         {q11}, [r7], r12
-    vst1.16         {q12}, [r0], r12
-    vst1.16         {q13}, [r7], r12
-    vst1.16         {q14}, [r0], r12
-    vst1.16         {q15}, [r7], r12
-
-    vpop            {d8-d15}
-    pop             {r4-r7}
-    bx              lr
-
-    ENDP
-
-    END
diff --git a/vp8/encoder/arm/neon/subtract_neon.c b/vp8/encoder/arm/neon/subtract_neon.c
new file mode 100644
index 0000000..d3ab7b1
--- /dev/null
+++ b/vp8/encoder/arm/neon/subtract_neon.c
@@ -0,0 +1,154 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+
+void vp8_subtract_b_neon(
+        BLOCK *be,
+        BLOCKD *bd,
+        int pitch) {
+    unsigned char *src_ptr, *predictor;
+    int src_stride;
+    int16_t *src_diff;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint16x8_t q10u16, q11u16, q12u16, q13u16;
+
+    src_ptr = *be->base_src + be->src;
+    src_stride = be->src_stride;
+    predictor = bd->predictor;
+
+    d0u8 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d2u8 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d4u8 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d6u8 = vld1_u8(src_ptr);
+
+    d1u8 = vld1_u8(predictor);
+    predictor += pitch;
+    d3u8 = vld1_u8(predictor);
+    predictor += pitch;
+    d5u8 = vld1_u8(predictor);
+    predictor += pitch;
+    d7u8 = vld1_u8(predictor);
+
+    q10u16 = vsubl_u8(d0u8, d1u8);
+    q11u16 = vsubl_u8(d2u8, d3u8);
+    q12u16 = vsubl_u8(d4u8, d5u8);
+    q13u16 = vsubl_u8(d6u8, d7u8);
+
+    src_diff = be->src_diff;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
+    src_diff += pitch;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
+    src_diff += pitch;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
+    src_diff += pitch;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
+    return;
+}
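+
+/* For comparison, a close paraphrase of the scalar version (see
+ * vp8_subtract_b_c in the encoder; the _ref_sketch name exists only for
+ * this illustration). The 16-bit diff rows advance by pitch elements: */
+static void vp8_subtract_b_ref_sketch(BLOCK *be, BLOCKD *bd, int pitch) {
+    unsigned char *src_ptr = *be->base_src + be->src;
+    int16_t *diff_ptr = be->src_diff;
+    unsigned char *pred_ptr = bd->predictor;
+    int src_stride = be->src_stride;
+    int r, c;
+
+    for (r = 0; r < 4; r++) {
+        for (c = 0; c < 4; c++)
+            diff_ptr[c] = (int16_t)(src_ptr[c] - pred_ptr[c]);
+        diff_ptr += pitch;
+        pred_ptr += pitch;
+        src_ptr += src_stride;
+    }
+}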
+
+void vp8_subtract_mby_neon(
+        int16_t *diff,
+        unsigned char *src,
+        int src_stride,
+        unsigned char *pred,
+        int pred_stride) {
+    int i;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    for (i = 0; i < 8; i++) {  // subtract_mby_loop
+        q0u8 = vld1q_u8(src);
+        src += src_stride;
+        q2u8 = vld1q_u8(src);
+        src += src_stride;
+        q1u8 = vld1q_u8(pred);
+        pred += pred_stride;
+        q3u8 = vld1q_u8(pred);
+        pred += pred_stride;
+
+        q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
+        q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
+        q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
+        q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));
+
+        vst1q_u16((uint16_t *)diff, q8u16);
+        diff += 8;
+        vst1q_u16((uint16_t *)diff, q9u16);
+        diff += 8;
+        vst1q_u16((uint16_t *)diff, q10u16);
+        diff += 8;
+        vst1q_u16((uint16_t *)diff, q11u16);
+        diff += 8;
+    }
+    return;
+}
+
+void vp8_subtract_mbuv_neon(
+        int16_t *diff,
+        unsigned char *usrc,
+        unsigned char *vsrc,
+        int src_stride,
+        unsigned char *upred,
+        unsigned char *vpred,
+        int pred_stride) {
+    int i, j;
+    unsigned char *src_ptr, *pred_ptr;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    diff += 256;
+    for (i = 0; i < 2; i++) {
+        if (i == 0) {
+            src_ptr = usrc;
+            pred_ptr = upred;
+        } else {  // i == 1
+            src_ptr = vsrc;
+            pred_ptr = vpred;
+        }
+
+        for (j = 0; j < 2; j++) {
+            d0u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d1u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+            d2u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d3u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+            d4u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d5u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+            d6u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d7u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+
+            q8u16  = vsubl_u8(d0u8, d1u8);
+            q9u16  = vsubl_u8(d2u8, d3u8);
+            q10u16 = vsubl_u8(d4u8, d5u8);
+            q11u16 = vsubl_u8(d6u8, d7u8);
+
+            vst1q_u16((uint16_t *)diff, q8u16);
+            diff += 8;
+            vst1q_u16((uint16_t *)diff, q9u16);
+            diff += 8;
+            vst1q_u16((uint16_t *)diff, q10u16);
+            diff += 8;
+            vst1q_u16((uint16_t *)diff, q11u16);
+            diff += 8;
+        }
+    }
+    return;
+}
diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
deleted file mode 100644
index d219e2d..0000000
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ /dev/null
@@ -1,72 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_memcpy_partial_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;this is not a full memcpy function!!!
-;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
-;                             int sz);
-|vp8_memcpy_partial_neon| PROC
-    vpush               {d8-d15}
-    ;pld                [r1]                        ;preload pred data
-    ;pld                [r1, #128]
-    ;pld                [r1, #256]
-    ;pld                [r1, #384]
-
-    mov             r12, r2, lsr #8                 ;copy 256 bytes data at one time
-
-memcpy_neon_loop
-    vld1.8          {q0, q1}, [r1]!                 ;load src data
-    subs            r12, r12, #1
-    vld1.8          {q2, q3}, [r1]!
-    vst1.8          {q0, q1}, [r0]!                 ;copy to dst_ptr
-    vld1.8          {q4, q5}, [r1]!
-    vst1.8          {q2, q3}, [r0]!
-    vld1.8          {q6, q7}, [r1]!
-    vst1.8          {q4, q5}, [r0]!
-    vld1.8          {q8, q9}, [r1]!
-    vst1.8          {q6, q7}, [r0]!
-    vld1.8          {q10, q11}, [r1]!
-    vst1.8          {q8, q9}, [r0]!
-    vld1.8          {q12, q13}, [r1]!
-    vst1.8          {q10, q11}, [r0]!
-    vld1.8          {q14, q15}, [r1]!
-    vst1.8          {q12, q13}, [r0]!
-    vst1.8          {q14, q15}, [r0]!
-
-    ;pld                [r1]                        ;preload pred data -- need to adjust for real device
-    ;pld                [r1, #128]
-    ;pld                [r1, #256]
-    ;pld                [r1, #384]
-
-    bne             memcpy_neon_loop
-
-    ands            r3, r2, #0xff                   ;extra copy
-    beq             done_copy_neon_loop
-
-extra_copy_neon_loop
-    vld1.8          {q0}, [r1]!                 ;load src data
-    subs            r3, r3, #16
-    vst1.8          {q0}, [r0]!
-    bne             extra_copy_neon_loop
-
-done_copy_neon_loop
-    vpop            {d8-d15}
-    bx              lr
-    ENDP
-
-    END
diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
deleted file mode 100644
index 22266297..0000000
--- a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+++ /dev/null
@@ -1,103 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_walsh4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
-; r0   short *input,
-; r1   short *output,
-; r2   int pitch
-|vp8_short_walsh4x4_neon| PROC
-
-    vld1.16         {d0}, [r0@64], r2   ; load input
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {d2}, [r0@64], r2
-    vld1.16         {d3}, [r0@64]
-
-    ;First for-loop
-    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-
-    vmov.s32        q15, #3             ; add 3 to all values
-
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
-    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
-    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
-    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
-
-    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
-    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
-    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
-    vceq.s16        d16, d4, #0         ; a1 == 0
-    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
-
-    vadd.s16        d0, d4, d5          ; a1 + d1
-    vmvn            d16, d16            ; a1 != 0
-    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
-    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
-    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
-    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0)
-
-    ;Second for-loop
-    ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d1, d3
-    vtrn.32         d0, d2
-    vtrn.16         d2, d3
-    vtrn.16         d0, d1
-
-    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]
-    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]
-    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]
-    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]
-
-    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
-    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
-    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
-    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
-
-    vclt.s32        q8, q0, #0
-    vclt.s32        q9, q1, #0
-    vclt.s32        q10, q2, #0
-    vclt.s32        q11, q3, #0
-
-    ; subtract -1 (or 0)
-    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
-    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
-    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
-    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
-
-    vadd.s32        q8, q0, q15         ; a2 + 3
-    vadd.s32        q9, q1, q15         ; b2 + 3
-    vadd.s32        q10, q2, q15        ; c2 + 3
-    vadd.s32        q11, q3, q15        ; d2 + 3
-
-    ; vrshrn? would add 1 << 3-1 = 2
-    vshrn.s32       d0, q8, #3
-    vshrn.s32       d1, q9, #3
-    vshrn.s32       d2, q10, #3
-    vshrn.s32       d3, q11, #3
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-    END
diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
new file mode 100644
index 0000000..d6b67f8
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
@@ -0,0 +1,118 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_walsh4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    uint16x4_t d16u16;
+    int16x8_t q0s16, q1s16;
+    int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q15s32;
+    uint32x4_t q8u32, q9u32, q10u32, q11u32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    dEmptys16 = vdup_n_s16(0);
+    qEmptys32 = vdupq_n_s32(0);
+    q15s32 = vdupq_n_s32(3);
+
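+    // pitch is in bytes; input points at int16_t, so step by pitch/2
+    // elements per row.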
+    d0s16 = vld1_s16(input);
+    input += pitch/2;
+    d1s16 = vld1_s16(input);
+    input += pitch/2;
+    d2s16 = vld1_s16(input);
+    input += pitch/2;
+    d3s16 = vld1_s16(input);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]);
+
+    d4s16 = vshl_n_s16(d4s16, 2);
+    d5s16 = vshl_n_s16(d5s16, 2);
+    d6s16 = vshl_n_s16(d6s16, 2);
+    d7s16 = vshl_n_s16(d7s16, 2);
+
+    d16u16 = vceq_s16(d4s16, dEmptys16);
+    d16u16 = vmvn_u16(d16u16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d3s16 = vsub_s16(d4s16, d5s16);
+    d1s16 = vadd_s16(d7s16, d6s16);
+    d2s16 = vsub_s16(d7s16, d6s16);
+
+    d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16));
+
+    // Second for-loop
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp2.val[1]));  // d3
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp2.val[0]));  // d1
+
+    q8s32  = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+    q9s32  = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+
+    q0s32 = vaddq_s32(q8s32, q9s32);
+    q1s32 = vaddq_s32(q11s32, q10s32);
+    q2s32 = vsubq_s32(q11s32, q10s32);
+    q3s32 = vsubq_s32(q8s32, q9s32);
+
+    q8u32  = vcltq_s32(q0s32, qEmptys32);
+    q9u32  = vcltq_s32(q1s32, qEmptys32);
+    q10u32 = vcltq_s32(q2s32, qEmptys32);
+    q11u32 = vcltq_s32(q3s32, qEmptys32);
+
+    q8s32  = vreinterpretq_s32_u32(q8u32);
+    q9s32  = vreinterpretq_s32_u32(q9u32);
+    q10s32 = vreinterpretq_s32_u32(q10u32);
+    q11s32 = vreinterpretq_s32_u32(q11u32);
+
+    q0s32 = vsubq_s32(q0s32, q8s32);
+    q1s32 = vsubq_s32(q1s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q3s32 = vsubq_s32(q3s32, q11s32);
+
+    q8s32  = vaddq_s32(q0s32, q15s32);
+    q9s32  = vaddq_s32(q1s32, q15s32);
+    q10s32 = vaddq_s32(q2s32, q15s32);
+    q11s32 = vaddq_s32(q3s32, q15s32);
+
+    d0s16 = vshrn_n_s32(q8s32, 3);
+    d1s16 = vshrn_n_s32(q9s32, 3);
+    d2s16 = vshrn_n_s32(q10s32, 3);
+    d3s16 = vshrn_n_s32(q11s32, 3);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 75401fc..2f33d4a 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -68,6 +68,10 @@
     int adj_val[3] = {3, 4, 6};
     int shift_inc1 = 0;
     int shift_inc2 = 1;
+    int col_sum[16] = {0, 0, 0, 0,
+                       0, 0, 0, 0,
+                       0, 0, 0, 0,
+                       0, 0, 0, 0};
     /* If motion_magnitude is small, making the denoiser more aggressive by
      * increasing the adjustment for each level. Add another increment for
      * blocks that are labeled for increase denoising. */
@@ -98,11 +102,11 @@
             if (absdiff <= 3 + shift_inc1)
             {
                 running_avg_y[c] = mc_running_avg_y[c];
-                sum_diff += diff;
+                col_sum[c] += diff;
             }
             else
             {
-                if (absdiff >= 4 && absdiff <= 7)
+                if (absdiff >= 4 + shift_inc1 && absdiff <= 7)
                     adjustment = adj_val[0];
                 else if (absdiff >= 8 && absdiff <= 15)
                     adjustment = adj_val[1];
@@ -116,7 +120,7 @@
                     else
                         running_avg_y[c] = sig[c] + adjustment;
 
-                    sum_diff += adjustment;
+                    col_sum[c] += adjustment;
                 }
                 else
                 {
@@ -125,7 +129,7 @@
                     else
                         running_avg_y[c] = sig[c] - adjustment;
 
-                    sum_diff -= adjustment;
+                    col_sum[c] -= adjustment;
                 }
             }
         }
@@ -136,6 +140,23 @@
         running_avg_y += avg_y_stride;
     }
 
+    for (c = 0; c < 16; ++c) {
+      // Clip the value the same way the SSE code does. With the
+      // aggressive denoiser the per-pixel adjustment can be at most 8
+      // (the current maximum in the adjustment map). The SSE code sums
+      // adj_val per column, so over 16 rows the sum can reach 128,
+      // while its accumulator only spans -128..127; clamp here so the
+      // C code matches. The UV denoiser needs no clamp: with only 8
+      // rows and adjustments <= 8, a column sum cannot exceed 64.
+      if (col_sum[c] >= 128) {
+        col_sum[c] = 127;
+      }
+      sum_diff += col_sum[c];
+    }
+
     sum_diff_thresh= SUM_DIFF_THRESHOLD;
     if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
     if (abs(sum_diff) > sum_diff_thresh) {
@@ -166,14 +187,14 @@
                 running_avg_y[c] = 0;
               else
                 running_avg_y[c] = running_avg_y[c] - adjustment;
-              sum_diff -= adjustment;
+              col_sum[c] -= adjustment;
             } else if (diff < 0) {
               // Bring denoised signal up.
               if (running_avg_y[c] + adjustment > 255)
                 running_avg_y[c] = 255;
               else
                 running_avg_y[c] = running_avg_y[c] + adjustment;
-              sum_diff += adjustment;
+              col_sum[c] += adjustment;
             }
           }
           // TODO(marpan): Check here if abs(sum_diff) has gone below the
@@ -182,6 +203,15 @@
           mc_running_avg_y += mc_avg_y_stride;
           running_avg_y += avg_y_stride;
         }
+
+        sum_diff = 0;
+        for (c = 0; c < 16; ++c) {
+          if (col_sum[c] >= 128) {
+            col_sum[c] = 127;
+          }
+          sum_diff += col_sum[c];
+        }
+
         if (abs(sum_diff) > sum_diff_thresh)
           return COPY_BLOCK;
       } else {
@@ -341,8 +371,10 @@
     denoiser->denoiser_mode = kDenoiserOnYOnly;
   } else if (mode == 2) {
     denoiser->denoiser_mode = kDenoiserOnYUV;
-  } else {
+  } else if (mode == 3) {
     denoiser->denoiser_mode = kDenoiserOnYUVAggressive;
+  } else {
+    denoiser->denoiser_mode = kDenoiserOnAdaptive;
   }
   if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) {
     denoiser->denoise_pars.scale_sse_thresh = 1;
@@ -397,9 +429,40 @@
     vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
                denoiser->yv12_mc_running_avg.frame_size);
 
+    if (vp8_yv12_alloc_frame_buffer(&denoiser->yv12_last_source, width,
+                                    height, VP8BORDERINPIXELS) < 0) {
+      vp8_denoiser_free(denoiser);
+      return 1;
+    }
+    vpx_memset(denoiser->yv12_last_source.buffer_alloc, 0,
+               denoiser->yv12_last_source.frame_size);
+
     denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
     vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
     vp8_denoiser_set_parameters(denoiser, mode);
+    denoiser->nmse_source_diff = 0;
+    denoiser->nmse_source_diff_count = 0;
+    denoiser->qp_avg = 0;
+    // QP threshold below which we can go up to aggressive mode.
+    denoiser->qp_threshold_up = 80;
+    // QP threshold above which we can go back down to normal mode.
+    // For now keep this second threshold high, so not used currently.
+    denoiser->qp_threshold_down = 128;
+    // Bitrate thresholds and noise metric (nmse) thresholds for switching to
+    // aggressive mode.
+    // TODO(marpan): Adjust thresholds, including effect on resolution.
+    denoiser->bitrate_threshold = 200000;  // (bits/sec).
+    denoiser->threshold_aggressive_mode = 35;
+    // Check the largest resolution class first so each frame size gets
+    // its intended thresholds.
+    if (width * height > 1280 * 720) {
+      denoiser->bitrate_threshold = 2000000;
+      denoiser->threshold_aggressive_mode = 1400;
+    } else if (width * height > 960 * 540) {
+      denoiser->bitrate_threshold = 800000;
+      denoiser->threshold_aggressive_mode = 150;
+    } else if (width * height > 640 * 480) {
+      denoiser->bitrate_threshold = 500000;
+      denoiser->threshold_aggressive_mode = 100;
+    }
     return 0;
 }
 
@@ -414,6 +477,7 @@
         vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
     }
     vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+    vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_last_source);
     vpx_free(denoiser->denoise_state);
 }
 
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index 89832d3..bb0c2ce 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -19,12 +19,12 @@
 #endif
 
 #define SUM_DIFF_THRESHOLD (16 * 16 * 2)
-#define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)
+#define SUM_DIFF_THRESHOLD_HIGH (600)
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)
 
 #define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
 #define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)
-#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 4)
+#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8)
 #define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
 
 enum vp8_denoiser_decision
@@ -43,7 +43,8 @@
   kDenoiserOff,
   kDenoiserOnYOnly,
   kDenoiserOnYUV,
-  kDenoiserOnYUVAggressive
+  kDenoiserOnYUVAggressive,
+  kDenoiserOnAdaptive
 };
 
 typedef struct {
@@ -72,9 +73,18 @@
 {
     YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
     YV12_BUFFER_CONFIG yv12_mc_running_avg;
+    // TODO(marpan): Should remove yv12_last_source and use vp8_lookahead_peak.
+    YV12_BUFFER_CONFIG yv12_last_source;
     unsigned char* denoise_state;
     int num_mb_cols;
     int denoiser_mode;
+    int threshold_aggressive_mode;
+    int nmse_source_diff;
+    int nmse_source_diff_count;
+    int qp_avg;
+    int qp_threshold_up;
+    int qp_threshold_down;
+    int bitrate_threshold;
     denoise_params denoise_pars;
 } VP8_DENOISER;
 
@@ -83,6 +93,8 @@
 
 void vp8_denoiser_free(VP8_DENOISER *denoiser);
 
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode);
+
 void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                              MACROBLOCK *x,
                              unsigned int best_sse,
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 7140f2f..791a485 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -615,19 +615,21 @@
         cpi->cyclic_refresh_mode_index = i;
 
 #if CONFIG_TEMPORAL_DENOISING
-        if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
-            Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
-          // Under aggressive denoising mode, use segmentation to turn off loop
-          // filter below some qp thresh. The loop filter is turned off for all
-          // blocks that have been encoded as ZEROMV LAST x frames in a row,
-          // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
-          // This is to avoid "dot" artifacts that can occur from repeated
-          // loop filtering on noisy input source.
-          cpi->cyclic_refresh_q = Q;
-          lf_adjustment = -MAX_LOOP_FILTER;
-          for (i = 0; i < mbs_in_frame; ++i) {
-            seg_map[i] = (cpi->consec_zero_last[i] >
-                          cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+        if (cpi->oxcf.noise_sensitivity > 0) {
+          if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
+              Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
+            // Under aggressive denoising, use segmentation to turn off loop
+            // filter below some qp thresh. The filter is turned off for all
+            // blocks that have been encoded as ZEROMV LAST x frames in a row,
+            // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
+            // This is to avoid "dot" artifacts that can occur from repeated
+            // loop filtering on noisy input source.
+            cpi->cyclic_refresh_q = Q;
+            lf_adjustment = -MAX_LOOP_FILTER;
+            for (i = 0; i < mbs_in_frame; ++i) {
+              seg_map[i] = (cpi->consec_zero_last[i] >
+                            cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+            }
           }
         }
 #endif
@@ -3150,10 +3152,8 @@
 
         cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
         cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
         cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
-#endif
     }
     else    /* For non key frames */
     {
@@ -3165,9 +3165,7 @@
             cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
             cm->alt_fb_idx = cm->new_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
             cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
-#endif
         }
         else if (cm->copy_buffer_to_arf)
         {
@@ -3181,10 +3179,8 @@
                     yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
                     cm->alt_fb_idx = cm->lst_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
                     cpi->current_ref_frames[ALTREF_FRAME] =
                         cpi->current_ref_frames[LAST_FRAME];
-#endif
                 }
             }
             else /* if (cm->copy_buffer_to_arf == 2) */
@@ -3195,10 +3191,8 @@
                     yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
                     cm->alt_fb_idx = cm->gld_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
                     cpi->current_ref_frames[ALTREF_FRAME] =
                         cpi->current_ref_frames[GOLDEN_FRAME];
-#endif
                 }
             }
         }
@@ -3211,9 +3205,7 @@
             cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
             cm->gld_fb_idx = cm->new_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
             cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
-#endif
         }
         else if (cm->copy_buffer_to_gf)
         {
@@ -3227,10 +3219,8 @@
                     yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
                     cm->gld_fb_idx = cm->lst_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
                     cpi->current_ref_frames[GOLDEN_FRAME] =
                         cpi->current_ref_frames[LAST_FRAME];
-#endif
                 }
             }
             else /* if (cm->copy_buffer_to_gf == 2) */
@@ -3241,10 +3231,8 @@
                     yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
                     cm->gld_fb_idx = cm->alt_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
                     cpi->current_ref_frames[GOLDEN_FRAME] =
                         cpi->current_ref_frames[ALTREF_FRAME];
-#endif
                 }
             }
         }
@@ -3256,9 +3244,7 @@
         cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP8_LAST_FRAME;
         cm->lst_fb_idx = cm->new_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
         cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame;
-#endif
     }
 
 #if CONFIG_TEMPORAL_DENOISING
@@ -3299,12 +3285,133 @@
                         &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
             }
         }
+        if (cpi->oxcf.noise_sensitivity == 4)
+          vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_last_source);
 
     }
 #endif
 
 }
 
+#if CONFIG_TEMPORAL_DENOISING
+static void process_denoiser_mode_change(VP8_COMP *cpi) {
+  const VP8_COMMON *const cm = &cpi->common;
+  int i, j;
+  int total = 0;
+  int num_blocks = 0;
+  // Number of blocks skipped along row/column in computing the
+  // nmse (normalized mean square error) of source.
+  int skip = 2;
+  // Only select blocks for computing nmse that have been encoded
+  // as ZEROMV LAST for min_consec_zero_last frames in a row.
+  // Scale with number of temporal layers.
+  int min_consec_zero_last = 8 / cpi->oxcf.number_of_layers;
+  // The decision to change the denoising mode is tested once every
+  // num_mode_change calls. Since this function is called every ~8
+  // frames, a mode switch is considered roughly every
+  // (8 * num_mode_change) frames.
+  int num_mode_change = 15;
+  // Framerate factor, to compensate for larger mse at lower framerates.
+  // Use ref_framerate, which is full source framerate for temporal layers.
+  // TODO(marpan): Adjust this factor.
+  int fac_framerate = cpi->ref_framerate < 25.0f ? 80 : 100;
+  int tot_num_blocks = cm->mb_rows * cm->mb_cols;
+  int ystride = cpi->Source->y_stride;
+  unsigned char *src = cpi->Source->y_buffer;
+  unsigned char *dst = cpi->denoiser.yv12_last_source.y_buffer;
+  static const unsigned char const_source[16] = {
+      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+      128, 128, 128};
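+  // const_source is a flat mid-gray block: vp8_variance16x16 against it
+  // (with stride 0) measures the contrast of a source block, which is
+  // used below to normalize the mse.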
+
+  // Loop through the Y plane, every skip blocks along rows and columns,
+  // summing the normalized mean square error, only for blocks that have
+  // been encoded as ZEROMV LAST for at least min_consec_zero_last frames in
+  // a row and have small sum difference between current and previous frame.
+  // Normalization here is by the contrast of the current frame block.
+  for (i = 0; i < cm->Height; i += 16 * skip) {
+    int block_index_row = (i >> 4) * cm->mb_cols;
+    for (j = 0; j < cm->Width; j += 16 * skip) {
+      int index = block_index_row + (j >> 4);
+      if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
+        unsigned int sse;
+        const unsigned int mse = vp8_mse16x16(src + j,
+                                              ystride,
+                                              dst + j,
+                                              ystride,
+                                              &sse);
+        const unsigned int var = vp8_variance16x16(src + j,
+                                                   ystride,
+                                                   dst + j,
+                                                   ystride,
+                                                   &sse);
+        // Only consider this block as valid for noise measurement
+        // if the sum_diff average of the current and previous frame
+        // is small (to avoid effects from lighting change).
+        if ((mse - var) < 256) {
+          const unsigned int act = vp8_variance16x16(src + j,
+                                                     ystride,
+                                                     const_source,
+                                                     0,
+                                                     &sse);
+          if (act > 0)
+            total += mse / act;
+          num_blocks++;
+        }
+      }
+    }
+    src += 16 * skip * ystride;
+    dst += 16 * skip * ystride;
+  }
+  total = total * fac_framerate / 100;
+
+  // Only consider this frame a valid sample if we have computed nmse over
+  // at least ~1/16 of the blocks and total > 0 (total == 0 can happen if
+  // the application inputs duplicate frames, or contrast is all zero).
+  if (total > 0 &&
+      (num_blocks > (tot_num_blocks >> 4))) {
+    // Update the recursive mean square source_diff.
+    if (cpi->denoiser.nmse_source_diff_count == 0) {
+      // First sample in new interval.
+      cpi->denoiser.nmse_source_diff = total;
+      cpi->denoiser.qp_avg = cm->base_qindex;
+    } else {
+      // For subsequent samples, use average with weight ~1/4 for new sample.
+      cpi->denoiser.nmse_source_diff = (int)((total >> 2) +
+          3 * (cpi->denoiser.nmse_source_diff >> 2));
+      cpi->denoiser.qp_avg = (int)((cm->base_qindex >> 2) +
+          3 * (cpi->denoiser.qp_avg >> 2));
+    }
+    cpi->denoiser.nmse_source_diff_count++;
+  }
+  // Check for changing the denoiser mode, when we have obtained #samples =
+  // num_mode_change. Condition the change also on the bitrate and QP.
+  if (cpi->denoiser.nmse_source_diff_count == num_mode_change) {
+    // Check for going up: from normal to aggressive mode.
+    if ((cpi->denoiser.denoiser_mode == kDenoiserOnYUV) &&
+        (cpi->denoiser.nmse_source_diff >
+        cpi->denoiser.threshold_aggressive_mode) &&
+        (cpi->denoiser.qp_avg < cpi->denoiser.qp_threshold_up &&
+         cpi->target_bandwidth > cpi->denoiser.bitrate_threshold)) {
+      vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUVAggressive);
+    } else {
+      // Check for going down: from aggressive to normal mode.
+      if (((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
+          (cpi->denoiser.nmse_source_diff <
+          cpi->denoiser.threshold_aggressive_mode)) ||
+          ((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
+          (cpi->denoiser.qp_avg > cpi->denoiser.qp_threshold_down ||
+           cpi->target_bandwidth < cpi->denoiser.bitrate_threshold))) {
+        vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);
+      }
+    }
+    // Reset metric and counter for next interval.
+    cpi->denoiser.nmse_source_diff = 0;
+    cpi->denoiser.qp_avg = 0;
+    cpi->denoiser.nmse_source_diff_count = 0;
+  }
+}
+#endif
+
 void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
 {
     const FRAME_TYPE frame_type = cm->frame_type;
@@ -3461,6 +3568,12 @@
     {
         /* Key frame from VFW/auto-keyframe/first frame */
         cm->frame_type = KEY_FRAME;
+#if CONFIG_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity == 4) {
+          // For adaptive mode, reset denoiser to normal mode on key frame.
+          vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);
+        }
+#endif
     }
 
 #if CONFIG_MULTI_RES_ENCODING
@@ -3494,6 +3607,31 @@
     }
 #endif
 
+    // Find the reference frame closest to the current frame.
+    cpi->closest_reference_frame = LAST_FRAME;
+    if (cm->frame_type != KEY_FRAME) {
+      int i;
+      MV_REFERENCE_FRAME closest_ref = INTRA_FRAME;
+      if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
+        closest_ref = LAST_FRAME;
+      } else if (cpi->ref_frame_flags & VP8_GOLD_FRAME) {
+        closest_ref = GOLDEN_FRAME;
+      } else if (cpi->ref_frame_flags & VP8_ALTR_FRAME) {
+        closest_ref = ALTREF_FRAME;
+      }
+      for (i = 1; i <= 3; i++) {
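+        // Map the reference index to its ref_frame_flags bit:
+        // LAST = 1, GOLDEN = 2, ALTREF = 4.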
+        vpx_ref_frame_type_t ref_frame_type = (vpx_ref_frame_type_t)
+            ((i == 3) ? 4 : i);
+        if (cpi->ref_frame_flags & ref_frame_type) {
+          if ((cm->current_video_frame - cpi->current_ref_frames[i]) <
+            (cm->current_video_frame - cpi->current_ref_frames[closest_ref])) {
+            closest_ref = i;
+          }
+        }
+      }
+      cpi->closest_reference_frame = closest_ref;
+    }
+
     /* Set various flags etc to special state if it is a key frame */
     if (cm->frame_type == KEY_FRAME)
     {
@@ -4420,7 +4558,8 @@
             {
                 for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
                 {
-                    if(tmp->mbmi.mode == ZEROMV)
+                    if (tmp->mbmi.mode == ZEROMV &&
+                       tmp->mbmi.ref_frame == LAST_FRAME)
                         cpi->zeromv_count++;
                     tmp++;
                 }
@@ -4462,6 +4601,21 @@
 
     cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
 
+#if CONFIG_TEMPORAL_DENOISING
+    // For the adaptive denoising mode (noise_sensitivity == 4), sample the mse
+    // of source diff (between current and previous frame), and determine if we
+    // should switch the denoiser mode. Sampling refers to computing the mse for
+    // a sub-sample of the frame (i.e., skip x blocks along row/column), and
+    // only for blocks in that set that have used ZEROMV LAST, along with a
+    // constraint on each block's sum difference between the two frames. This
+    // process is run every
+    // ~8 frames, to further reduce complexity.
+    if (cpi->oxcf.noise_sensitivity == 4 &&
+        cpi->frames_since_key % 8 == 0 &&
+        cm->frame_type != KEY_FRAME) {
+      process_denoiser_mode_change(cpi);
+    }
+#endif
+
 #if CONFIG_MULTITHREAD
     if (cpi->b_multi_threaded)
     {
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 7a8baca..f0424e6 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -684,9 +684,10 @@
     int    mr_low_res_mb_cols;
     /* Indicate if lower-res mv info is available */
     unsigned char  mr_low_res_mv_avail;
+#endif
     /* The frame number of each reference frames */
     unsigned int current_ref_frames[MAX_REF_FRAMES];
-#endif
+    MV_REFERENCE_FRAME closest_reference_frame;
 
     struct rd_costs_struct
     {
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index d0ad721..43f8957 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -487,6 +487,7 @@
     MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
     int_mv mv = x->e_mbd.mode_info_context->mbmi.mv;
     int this_rd;
+    int denoise_aggressive = 0;
     /* Exit early and don't compute the distortion if this macroblock
      * is marked inactive. */
     if (cpi->active_map_enabled && x->active_ptr[0] == 0)
@@ -505,16 +506,18 @@
 
     this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2);
 
-    /* Adjust rd to bias to ZEROMV */
-    if(this_mode == ZEROMV)
-    {
-        /* Bias to ZEROMV on LAST_FRAME reference when it is available. */
-        if ((cpi->ref_frame_flags & VP8_LAST_FRAME &
-            cpi->common.refresh_last_frame)
-            && x->e_mbd.mode_info_context->mbmi.ref_frame != LAST_FRAME)
-            rd_adj = 100;
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0) {
+      denoise_aggressive =
+        (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) ? 1 : 0;
+    }
+#endif
 
-        // rd_adj <= 100
+    // Adjust rd for ZEROMV on the LAST reference, when LAST is the closest
+    // reference frame or aggressive denoising is active.
+    if (this_mode == ZEROMV &&
+        x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
+        (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME))
+    {
         this_rd = ((int64_t)this_rd) * rd_adj / 100;
     }
 
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index 250d04c..f0c8f28 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -23,8 +23,8 @@
 
 extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
 
-void vp8_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
-                                   YV12_BUFFER_CONFIG *dst_ybc)
+static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+                                    YV12_BUFFER_CONFIG *dst_ybc)
 {
     unsigned char *src_y, *dst_y;
     int yheight;
@@ -173,7 +173,7 @@
     /* Get the err using the previous frame's filter value. */
 
     /* Copy the unfiltered / processed recon buffer to the new buffer */
-    vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+    yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
     vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
     best_err = calc_partial_ssl_err(sd, cm->frame_to_show);
@@ -184,7 +184,7 @@
     while (filt_val >= min_filter_level)
     {
         /* Apply the loop filter */
-        vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+        yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
         vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
         /* Get the err for filtered frame */
@@ -214,7 +214,7 @@
         while (filt_val < max_filter_level)
         {
             /* Apply the loop filter */
-            vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+            yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
 
             vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 6db031f..9b11c0d 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -155,30 +155,25 @@
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
 
-# common (neon)
-#VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/loopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-#VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/idct_blk_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM)  += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
-
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_0_2x_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/iwalsh_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfilter_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimpleverticaledge_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/mbloopfilter_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/reconintra_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sad_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/shortidct4x4llm_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/variance_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance_neon.c
 
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index d515fc0..b1b079c 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -14,6 +14,7 @@
 #include "vpx/vpx_codec.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_version.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vp8/encoder/onyx_int.h"
 #include "vpx/vp8cx.h"
 #include "vp8/encoder/firstpass.h"
@@ -39,40 +40,28 @@
 
 };
 
-struct extraconfig_map
-{
-    int                 usage;
-    struct vp8_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] =
-{
-    {
-        0,
-        {
-            NULL,
+static struct vp8_extracfg default_extracfg = {
+  NULL,
 #if !(CONFIG_REALTIME_ONLY)
-            0,                          /* cpu_used      */
+  0,                          /* cpu_used      */
 #else
-            4,                          /* cpu_used      */
+  4,                          /* cpu_used      */
 #endif
-            0,                          /* enable_auto_alt_ref */
-            0,                          /* noise_sensitivity */
-            0,                          /* Sharpness */
-            0,                          /* static_thresh */
+  0,                          /* enable_auto_alt_ref */
+  0,                          /* noise_sensitivity */
+  0,                          /* Sharpness */
+  0,                          /* static_thresh */
 #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
-            VP8_EIGHT_TOKENPARTITION,
+  VP8_EIGHT_TOKENPARTITION,
 #else
-            VP8_ONE_TOKENPARTITION,     /* token_partitions */
+  VP8_ONE_TOKENPARTITION,     /* token_partitions */
 #endif
-            0,                          /* arnr_max_frames */
-            3,                          /* arnr_strength */
-            3,                          /* arnr_type*/
-            0,                          /* tuning*/
-            10,                         /* cq_level */
-            0,                          /* rc_max_intra_bitrate_pct */
-        }
-    }
+  0,                          /* arnr_max_frames */
+  3,                          /* arnr_strength */
+  3,                          /* arnr_type */
+  0,                          /* tuning */
+  10,                         /* cq_level */
+  0,                          /* rc_max_intra_bitrate_pct */
 };
 
 struct vpx_codec_alg_priv
@@ -631,27 +620,21 @@
                                  vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
 {
     vpx_codec_err_t        res = VPX_CODEC_OK;
-    struct vpx_codec_alg_priv *priv;
-    vpx_codec_enc_cfg_t       *cfg;
-    unsigned int               i;
 
-    struct VP8_COMP *optr;
 
     vp8_rtcd();
 
     if (!ctx->priv)
     {
-        priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
+        struct vpx_codec_alg_priv *priv =
+            (struct vpx_codec_alg_priv *)vpx_calloc(1, sizeof(*priv));
 
         if (!priv)
         {
             return VPX_CODEC_MEM_ERROR;
         }
 
-        ctx->priv = &priv->base;
-        ctx->priv->sz = sizeof(*ctx->priv);
-        ctx->priv->iface = ctx->iface;
-        ctx->priv->alg_priv = priv;
+        ctx->priv = (vpx_codec_priv_t *)priv;
         ctx->priv->init_flags = ctx->init_flags;
 
         if (ctx->config.enc)
@@ -659,21 +642,11 @@
             /* Update the reference to the config structure to an
              * internal copy.
              */
-            ctx->priv->alg_priv->cfg = *ctx->config.enc;
-            ctx->config.enc = &ctx->priv->alg_priv->cfg;
+            priv->cfg = *ctx->config.enc;
+            ctx->config.enc = &priv->cfg;
         }
 
-        cfg =  &ctx->priv->alg_priv->cfg;
-
-        /* Select the extra vp8 configuration table based on the current
-         * usage value. If the current usage value isn't found, use the
-         * values for usage case 0.
-         */
-        for (i = 0;
-             extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-             i++);
-
-        priv->vp8_cfg = extracfg_map[i].cfg;
+        priv->vp8_cfg = default_extracfg;
         priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
 
         priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
@@ -696,17 +669,10 @@
 
         if (!res)
         {
-            set_vp8e_config(&ctx->priv->alg_priv->oxcf,
-                             ctx->priv->alg_priv->cfg,
-                             ctx->priv->alg_priv->vp8_cfg,
-                             mr_cfg);
-
-            optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf);
-
-            if (!optr)
+            set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg);
+            priv->cpi = vp8_create_compressor(&priv->oxcf);
+            if (!priv->cpi)
                 res = VPX_CODEC_MEM_ERROR;
-            else
-                ctx->priv->alg_priv->cpi = optr;
         }
     }
 
@@ -727,24 +693,30 @@
 
     free(ctx->cx_data);
     vp8_remove_compressor(&ctx->cpi);
-    free(ctx);
+    vpx_free(ctx);
     return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
                                        YV12_BUFFER_CONFIG  *yv12)
 {
+    const int y_w = img->d_w;
+    const int y_h = img->d_h;
+    const int uv_w = (img->d_w + 1) / 2;
+    const int uv_h = (img->d_h + 1) / 2;
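+    // Chroma planes are subsampled by two in each direction; rounding up
+    // keeps odd luma dimensions fully covered.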
     vpx_codec_err_t        res = VPX_CODEC_OK;
     yv12->y_buffer = img->planes[VPX_PLANE_Y];
     yv12->u_buffer = img->planes[VPX_PLANE_U];
     yv12->v_buffer = img->planes[VPX_PLANE_V];
 
-    yv12->y_crop_width  = img->d_w;
-    yv12->y_crop_height = img->d_h;
-    yv12->y_width  = img->d_w;
-    yv12->y_height = img->d_h;
-    yv12->uv_width = (1 + yv12->y_width) / 2;
-    yv12->uv_height = (1 + yv12->y_height) / 2;
+    yv12->y_crop_width  = y_w;
+    yv12->y_crop_height = y_h;
+    yv12->y_width  = y_w;
+    yv12->y_height = y_h;
+    yv12->uv_crop_width = uv_w;
+    yv12->uv_crop_height = uv_h;
+    yv12->uv_width = uv_w;
+    yv12->uv_height = uv_h;
 
     yv12->y_stride = img->stride[VPX_PLANE_Y];
     yv12->uv_stride = img->stride[VPX_PLANE_U];
@@ -1273,6 +1245,9 @@
 
         320,                /* g_width */
         240,                /* g_height */
+        VPX_BITS_8,         /* g_bit_depth */
+        8,                  /* g_input_bit_depth */
+
         {1, 30},            /* g_timebase */
 
         0,                  /* g_error_resilient */
@@ -1341,10 +1316,10 @@
     vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
     vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
     {
-        NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
-        NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
-        NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
-        NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
+        NULL,    /* vpx_codec_peek_si_fn_t    peek_si; */
+        NULL,    /* vpx_codec_get_si_fn_t     get_si; */
+        NULL,    /* vpx_codec_decode_fn_t     decode; */
+        NULL,    /* vpx_codec_frame_get_fn_t  frame_get; */
     },
     {
         1,                  /* 1 cfg map */
@@ -1352,7 +1327,7 @@
         vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
         vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
         vp8e_set_config,
-        NOT_IMPLEMENTED,
+        NULL,
         vp8e_get_preview,
         vp8e_mr_alloc_mem,
     } /* encoder functions */
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 9a0cdb7..3ab8ed0 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -80,30 +80,30 @@
 
 static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
 {
-    ctx->priv =
-        (vpx_codec_priv_t *)vpx_memalign(8, sizeof(vpx_codec_alg_priv_t));
-    vpx_memset(ctx->priv, 0, sizeof(vpx_codec_alg_priv_t));
-    ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
-    ctx->priv->alg_priv = (vpx_codec_alg_priv_t *)ctx->priv;
-    ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
-    ctx->priv->alg_priv->decrypt_cb = NULL;
-    ctx->priv->alg_priv->decrypt_state = NULL;
-    ctx->priv->alg_priv->flushed = 0;
+    vpx_codec_alg_priv_t *priv =
+        (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv));
+
+    ctx->priv = (vpx_codec_priv_t *)priv;
     ctx->priv->init_flags = ctx->init_flags;
 
+    priv->si.sz = sizeof(priv->si);
+    priv->decrypt_cb = NULL;
+    priv->decrypt_state = NULL;
+    priv->flushed = 0;
+
     if (ctx->config.dec)
     {
         /* Update the reference to the config structure to an internal copy. */
-        ctx->priv->alg_priv->cfg = *ctx->config.dec;
-        ctx->config.dec = &ctx->priv->alg_priv->cfg;
+        priv->cfg = *ctx->config.dec;
+        ctx->config.dec = &priv->cfg;
     }
 }
 
 static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
                                 vpx_codec_priv_enc_mr_cfg_t *data)
 {
-    vpx_codec_err_t        res = VPX_CODEC_OK;
+    vpx_codec_err_t res = VPX_CODEC_OK;
+    vpx_codec_alg_priv_t *priv = NULL;
     (void) data;
 
     vp8_rtcd();
@@ -115,29 +115,30 @@
     if (!ctx->priv)
     {
         vp8_init_ctx(ctx);
+        priv = (vpx_codec_alg_priv_t *)ctx->priv;
 
         /* initialize number of fragments to zero */
-        ctx->priv->alg_priv->fragments.count = 0;
+        priv->fragments.count = 0;
         /* is input fragments enabled? */
-        ctx->priv->alg_priv->fragments.enabled =
-                (ctx->priv->alg_priv->base.init_flags &
-                    VPX_CODEC_USE_INPUT_FRAGMENTS);
+        priv->fragments.enabled =
+            (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
 
         /*post processing level initialized to do nothing */
     }
+    else
+    {
+        priv = (vpx_codec_alg_priv_t *)ctx->priv;
+    }
 
-    ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads =
-            (ctx->priv->alg_priv->base.init_flags &
-                    VPX_CODEC_USE_FRAME_THREADING);
+    priv->yv12_frame_buffers.use_frame_threads =
+        (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING);
 
     /* for now, disable frame threading */
-    ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads = 0;
+    priv->yv12_frame_buffers.use_frame_threads = 0;
 
-    if(ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads &&
-            (( ctx->priv->alg_priv->base.init_flags &
-                            VPX_CODEC_USE_ERROR_CONCEALMENT)
-                    || ( ctx->priv->alg_priv->base.init_flags &
-                            VPX_CODEC_USE_INPUT_FRAGMENTS) ) )
+    if (priv->yv12_frame_buffers.use_frame_threads &&
+        ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) ||
+         (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS)))
     {
         /* row-based threading, error concealment, and input fragments will
          * not be supported when using frame-based threading */
@@ -566,17 +567,23 @@
 static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
                                        YV12_BUFFER_CONFIG  *yv12)
 {
+    const int y_w = img->d_w;
+    const int y_h = img->d_h;
+    const int uv_w = (img->d_w + 1) / 2;
+    const int uv_h = (img->d_h + 1) / 2;
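+    // Round the chroma dimensions up so odd luma sizes remain fully
+    // covered by the subsampled planes.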
     vpx_codec_err_t        res = VPX_CODEC_OK;
     yv12->y_buffer = img->planes[VPX_PLANE_Y];
     yv12->u_buffer = img->planes[VPX_PLANE_U];
     yv12->v_buffer = img->planes[VPX_PLANE_V];
 
-    yv12->y_crop_width  = img->d_w;
-    yv12->y_crop_height = img->d_h;
-    yv12->y_width  = img->d_w;
-    yv12->y_height = img->d_h;
-    yv12->uv_width = yv12->y_width / 2;
-    yv12->uv_height = yv12->y_height / 2;
+    yv12->y_crop_width  = y_w;
+    yv12->y_crop_height = y_h;
+    yv12->y_width  = y_w;
+    yv12->y_height = y_h;
+    yv12->uv_crop_width = uv_w;
+    yv12->uv_crop_height = uv_h;
+    yv12->uv_width = uv_w;
+    yv12->uv_height = uv_h;
 
     yv12->y_stride = img->stride[VPX_PLANE_Y];
     yv12->uv_stride = img->stride[VPX_PLANE_U];
@@ -809,15 +816,15 @@
         vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
         vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
         vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
-        NOT_IMPLEMENTED,
+        NULL,
     },
     { /* encoder functions */
         0,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL
     }
 };
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 5733048..551271e 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -36,11 +36,9 @@
 #File list for neon
 # encoder
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/shortfdct_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/subtract_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
 
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/subtract_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon.c
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index b379656..21ae8d5 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -34,7 +34,7 @@
 
   cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
   cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
-  cm->mi_stride = cm->mi_cols + MI_BLOCK_SIZE;
+  cm->mi_stride = calc_mi_size(cm->mi_cols);
 
   cm->mb_cols = (cm->mi_cols + 1) >> 1;
   cm->mb_rows = (cm->mi_rows + 1) >> 1;
@@ -60,16 +60,18 @@
 
   for (i = 0; i < 2; ++i) {
     cm->mip_array[i] =
-        (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip));
+        (MODE_INFO *)vpx_calloc(mi_size, sizeof(MODE_INFO));
     if (cm->mip_array[i] == NULL)
       return 1;
 
     cm->mi_grid_base_array[i] =
-        (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
+        (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
     if (cm->mi_grid_base_array[i] == NULL)
       return 1;
   }
 
+  cm->mi_alloc_size = mi_size;
+
   // Init the index.
   cm->mi_idx = 0;
   cm->prev_mi_idx = 1;
@@ -131,7 +133,8 @@
   vp9_free_context_buffers(cm);
 
   vp9_set_mb_mi(cm, width, height);
-  if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) goto fail;
+  if (alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows)))
+    goto fail;
 
   cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
   if (!cm->last_frame_seg_map) goto fail;
@@ -174,7 +177,11 @@
   for (i = 0; i < FRAME_BUFFERS; ++i) {
     cm->frame_bufs[i].ref_count = 0;
     if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
-                               ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
+                               ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS) < 0)
       goto fail;
   }
 
@@ -182,6 +189,9 @@
 
 #if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
   if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                             cm->use_highbitdepth,
+#endif
                              VP9_ENC_BORDER_IN_PIXELS) < 0)
     goto fail;
 #endif
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 951e6e0..7d4ed95 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -21,6 +21,7 @@
 #include "vp9/common/vp9_common_data.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_scale.h"
 #include "vp9/common/vp9_seg_common.h"
@@ -176,7 +177,7 @@
 };
 
 struct macroblockd_plane {
-  int16_t *dqcoeff;
+  tran_low_t *dqcoeff;
   PLANE_TYPE plane_type;
   int subsampling_x;
   int subsampling_y;
@@ -223,11 +224,17 @@
   /* mc buffer */
   DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  /* Bit depth: 8, 10, 12 */
+  int bd;
+  DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
+#endif
+
   int lossless;
 
   int corrupted;
 
-  DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
 
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 2788e66..5587192 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -64,6 +64,11 @@
   return num_values > 0 ? get_msb(num_values) + 1 : 0;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
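+// High-bitdepth buffers hold uint16_t samples but travel through generic
+// uint8_t* fields with their address halved; these macros convert between
+// the stored uint8_t* form and the real uint16_t* address.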
+#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1))
+#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1 ))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #if CONFIG_DEBUG
 #define CHECK_MEM_ERROR(cm, lval, expr) do { \
   lval = (expr); \
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index d776313..8817fdb 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -40,12 +40,6 @@
   MAX_PROFILES
 } BITSTREAM_PROFILE;
 
-typedef enum BIT_DEPTH {
-  BITS_8,
-  BITS_10,
-  BITS_12
-} BIT_DEPTH;
-
 typedef enum BLOCK_SIZE {
   BLOCK_4X4,
   BLOCK_4X8,
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 856d41e..b196fc5 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,14 +18,47 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
+#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
+// When CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is 1 the transform performs
+// strict overflow wrapping to match expected hardware implementations.
+// A bd of 8 uses tran_low_t with 16 bits, so remove 16 bits.
+// A bd of 10 uses tran_low_t with 18 bits, so remove 14 bits.
+// A bd of 12 uses tran_low_t with 20 bits, so remove 12 bits.
+// A bd of x uses tran_low_t with 8 + x bits, so remove 24 - x bits.
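+// e.g. with bd == 8, WRAPLOW(0x8000) keeps only the low 16 bits and
+// sign-extends them, yielding -32768.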
+#define WRAPLOW(x) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
+#else
+#define WRAPLOW(x) (x)
+#endif  // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low,
+                                    tran_low_t high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
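+// Wraps the residual, adds it to the predictor, and clamps the sum to the
+// [0, 2^bd - 1] pixel range.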
+static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
+                                            tran_high_t trans, int bd) {
+  trans = WRAPLOW(trans);
+  switch (bd) {
+    case 8:
+    default:
+      return clamp_high(WRAPLOW(dest + trans), 0, 255);
+    case 10:
+      return clamp_high(WRAPLOW(dest + trans), 0, 1023);
+    case 12:
+      return clamp_high(WRAPLOW(dest + trans), 0, 4095);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
    0.5 shifts per pixel. */
   int i;
-  int16_t output[16];
-  int a1, b1, c1, d1, e1;
-  const int16_t *ip = input;
-  int16_t *op = output;
+  tran_low_t output[16];
+  tran_high_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
 
   for (i = 0; i < 4; i++) {
     a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -70,12 +103,12 @@
   }
 }
 
-void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
   int i;
-  int a1, e1;
-  int16_t tmp[4];
-  const int16_t *ip = in;
-  int16_t *op = tmp;
+  tran_high_t a1, e1;
+  tran_low_t tmp[4];
+  const tran_low_t *ip = in;
+  tran_low_t *op = tmp;
 
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
@@ -96,9 +129,9 @@
   }
 }
 
-static void idct4(const int16_t *input, int16_t *output) {
-  int16_t step[4];
-  int temp1, temp2;
+static void idct4(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
@@ -116,11 +149,11 @@
   output[3] = step[0] - step[3];
 }
 
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[4 * 4];
-  int16_t *outptr = out;
+void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
   int i, j;
-  int16_t temp_in[4], temp_out[4];
+  tran_low_t temp_in[4], temp_out[4];
 
   // Rows
   for (i = 0; i < 4; ++i) {
@@ -140,10 +173,11 @@
   }
 }
 
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+                         int dest_stride) {
   int i;
-  int a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  tran_high_t a1;
+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
@@ -156,9 +190,9 @@
   }
 }
 
-static void idct8(const int16_t *input, int16_t *output) {
-  int16_t step1[8], step2[8];
-  int temp1, temp2;
+static void idct8(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
   // stage 1
   step1[0] = input[0];
   step1[2] = input[4];
@@ -201,11 +235,11 @@
   output[7] = step1[0] - step1[7];
 }
 
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[8 * 8];
-  int16_t *outptr = out;
+void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
   int i, j;
-  int16_t temp_in[8], temp_out[8];
+  tran_low_t temp_in[8], temp_out[8];
 
   // First transform rows
   for (i = 0; i < 8; ++i) {
@@ -225,10 +259,10 @@
   }
 }
 
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
-  int a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  tran_high_t a1;
+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
@@ -238,13 +272,13 @@
   }
 }
 
-static void iadst4(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
+static void iadst4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
-  int x0 = input[0];
-  int x1 = input[1];
-  int x2 = input[2];
-  int x3 = input[3];
+  tran_high_t x0 = input[0];
+  tran_high_t x1 = input[1];
+  tran_high_t x2 = input[2];
+  tran_high_t x3 = input[3];
 
   if (!(x0 | x1 | x2 | x3)) {
     output[0] = output[1] = output[2] = output[3] = 0;
@@ -280,7 +314,7 @@
   output[3] = dct_const_round_shift(s3);
 }
 
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   const transform_2d IHT_4[] = {
     { idct4, idct4  },  // DCT_DCT  = 0
@@ -290,9 +324,9 @@
   };
 
   int i, j;
-  int16_t out[4 * 4];
-  int16_t *outptr = out;
-  int16_t temp_in[4], temp_out[4];
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[4], temp_out[4];
 
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
@@ -311,17 +345,17 @@
                                   + dest[j * stride + i]);
   }
 }
-static void iadst8(const int16_t *input, int16_t *output) {
+static void iadst8(const tran_low_t *input, tran_low_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
-  int x0 = input[7];
-  int x1 = input[0];
-  int x2 = input[5];
-  int x3 = input[2];
-  int x4 = input[3];
-  int x5 = input[4];
-  int x6 = input[1];
-  int x7 = input[6];
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
 
   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
     output[0] = output[1] = output[2] = output[3] = output[4]
@@ -395,12 +429,12 @@
   { iadst8, iadst8 }   // ADST_ADST = 3
 };
 
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   int i, j;
-  int16_t out[8 * 8];
-  int16_t *outptr = out;
-  int16_t temp_in[8], temp_out[8];
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
   const transform_2d ht = IHT_8[tx_type];
 
   // inverse transform row vectors
@@ -421,11 +455,11 @@
   }
 }
 
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[8 * 8] = { 0 };
-  int16_t *outptr = out;
+void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
   int i, j;
-  int16_t temp_in[8], temp_out[8];
+  tran_low_t temp_in[8], temp_out[8];
 
   // First transform rows
   // only first 4 row has non-zero coefs
@@ -446,9 +480,9 @@
   }
 }
 
-static void idct16(const int16_t *input, int16_t *output) {
-  int16_t step1[16], step2[16];
-  int temp1, temp2;
+static void idct16(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
 
   // stage 1
   step1[0] = input[0/2];
@@ -611,11 +645,12 @@
   output[15] = step2[0] - step2[15];
 }
 
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[16 * 16];
-  int16_t *outptr = out;
+void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
   int i, j;
-  int16_t temp_in[16], temp_out[16];
+  tran_low_t temp_in[16], temp_out[16];
 
   // First transform rows
   for (i = 0; i < 16; ++i) {
@@ -635,25 +670,26 @@
   }
 }
 
-static void iadst16(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+static void iadst16(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
 
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
 
   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
            | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
@@ -813,12 +849,12 @@
   { iadst16, iadst16 }   // ADST_ADST = 3
 };
 
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   int i, j;
-  int16_t out[16 * 16];
-  int16_t *outptr = out;
-  int16_t temp_in[16], temp_out[16];
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
   const transform_2d ht = IHT_16[tx_type];
 
   // Rows
@@ -839,11 +875,12 @@
   }
 }
 
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[16 * 16] = { 0 };
-  int16_t *outptr = out;
+void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
   int i, j;
-  int16_t temp_in[16], temp_out[16];
+  tran_low_t temp_in[16], temp_out[16];
 
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 4x4 area, we only need to calculate first 4 rows here.
@@ -864,10 +901,10 @@
   }
 }
 
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
-  int a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  tran_high_t a1;
+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
@@ -877,9 +914,9 @@
   }
 }
 
-static void idct32(const int16_t *input, int16_t *output) {
-  int16_t step1[32], step2[32];
-  int temp1, temp2;
+static void idct32(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[32], step2[32];
+  tran_high_t temp1, temp2;
 
   // stage 1
   step1[0] = input[0];
@@ -1244,11 +1281,12 @@
   output[31] = step1[0] - step1[31];
 }
 
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[32 * 32];
-  int16_t *outptr = out;
+void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
+  tran_low_t out[32 * 32];
+  tran_low_t *outptr = out;
   int i, j;
-  int16_t temp_in[32], temp_out[32];
+  tran_low_t temp_in[32], temp_out[32];
 
   // Rows
   for (i = 0; i < 32; ++i) {
@@ -1265,7 +1303,7 @@
     if (zero_coeff[0] | zero_coeff[1])
       idct32(input, outptr);
     else
-      vpx_memset(outptr, 0, sizeof(int16_t) * 32);
+      vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
     input += 32;
     outptr += 32;
   }
@@ -1281,11 +1319,12 @@
   }
 }
 
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[32 * 32] = {0};
-  int16_t *outptr = out;
+void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  tran_low_t out[32 * 32] = {0};
+  tran_low_t *outptr = out;
   int i, j;
-  int16_t temp_in[32], temp_out[32];
+  tran_low_t temp_in[32], temp_out[32];
 
   // Rows
   // only upper-left 8x8 has non-zero coeff
@@ -1306,11 +1345,11 @@
   }
 }
 
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
-  int a1;
+  tran_high_t a1;
 
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
@@ -1322,7 +1361,8 @@
 }
 
 // idct
-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
   if (eob > 1)
     vp9_idct4x4_16_add(input, dest, stride);
   else
@@ -1330,14 +1370,16 @@
 }
 
 
-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
   if (eob > 1)
     vp9_iwht4x4_16_add(input, dest, stride);
   else
     vp9_iwht4x4_1_add(input, dest, stride);
 }
 
-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
   // If dc is 1, then input[0] is the reconstructed value, do not need
   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
 
@@ -1354,7 +1396,7 @@
     vp9_idct8x8_64_add(input, dest, stride);
 }
 
-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
@@ -1367,7 +1409,7 @@
     vp9_idct16x16_256_add(input, dest, stride);
 }
 
-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
   if (eob == 1)
     vp9_idct32x32_1_add(input, dest, stride);
@@ -1379,7 +1421,7 @@
 }
 
 // iht
-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                     int stride, int eob) {
   if (tx_type == DCT_DCT)
     vp9_idct4x4_add(input, dest, stride, eob);
@@ -1387,7 +1429,7 @@
     vp9_iht4x4_16_add(input, dest, stride, tx_type);
 }
 
-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                     int stride, int eob) {
   if (tx_type == DCT_DCT) {
     vp9_idct8x8_add(input, dest, stride, eob);
@@ -1396,7 +1438,7 @@
   }
 }
 
-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                       int stride, int eob) {
   if (tx_type == DCT_DCT) {
     vp9_idct16x16_add(input, dest, stride, eob);
@@ -1404,3 +1446,1433 @@
     vp9_iht16x16_256_add(input, dest, stride, tx_type);
   }
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                               int stride, int bd) {
+  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+     0.5 shifts per pixel. */
+  int i;
+  tran_low_t output[16];
+  tran_high_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
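+  // dest8 carries a shifted uint16_t buffer address; recover the pointer.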
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] >> UNIT_QUANT_SHIFT;
+    c1 = ip[1] >> UNIT_QUANT_SHIFT;
+    d1 = ip[2] >> UNIT_QUANT_SHIFT;
+    b1 = ip[3] >> UNIT_QUANT_SHIFT;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = WRAPLOW(a1);
+    op[1] = WRAPLOW(b1);
+    op[2] = WRAPLOW(c1);
+    op[3] = WRAPLOW(d1);
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd);
+    dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd);
+    dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd);
+    dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd);
+
+    ip++;
+    dest++;
+  }
+}
+
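+// 4-point IDCT. Every intermediate result is passed through WRAPLOW so that,
+// when hardware emulation is enabled, values wrap exactly as an (8 + bd)-bit
+// register would.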
+static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  (void) bd;
+  // stage 1
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step[1] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step[3] = WRAPLOW(dct_const_round_shift(temp2));
+
+  // stage 2
+  output[0] = WRAPLOW(step[0] + step[3]);
+  output[1] = WRAPLOW(step[1] + step[2]);
+  output[2] = WRAPLOW(step[1] - step[2]);
+  output[3] = WRAPLOW(step[0] - step[3]);
+}
+
+void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+                              int dest_stride, int bd) {
+  int i;
+  tran_high_t a1, e1;
+  tran_low_t tmp[4];
+  const tran_low_t *ip = in;
+  tran_low_t *op = tmp;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  (void) bd;
+
+  a1 = ip[0] >> UNIT_QUANT_SHIFT;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  op[0] = WRAPLOW(a1);
+  op[1] = op[2] = op[3] = WRAPLOW(e1);
+
+  ip = tmp;
+  for (i = 0; i < 4; i++) {
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd);
+    dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd);
+    dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd);
+    dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd);
+    ip++;
+    dest++;
+  }
+}
+
+void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                               int stride, int bd) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[4], temp_out[4];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    high_idct4(input, outptr, bd);
+    input += 4;
+    outptr += 4;
+  }
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    high_idct4(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+  }
+}
+
+void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                              int dest_stride, int bd) {
+  int i;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  for (i = 0; i < 4; i++) {
+    dest[0] = clip_pixel_bd_high(dest[0], a1, bd);
+    dest[1] = clip_pixel_bd_high(dest[1], a1, bd);
+    dest[2] = clip_pixel_bd_high(dest[2], a1, bd);
+    dest[3] = clip_pixel_bd_high(dest[3], a1, bd);
+    dest += dest_stride;
+  }
+}
+
+static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+  // stage 1
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  // stage 2 & stage 3 - even half
+  high_idct4(step1, step1, bd);
+
+  // stage 2 - odd half
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+  // stage 3 - odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = WRAPLOW(step1[0] + step1[7]);
+  output[1] = WRAPLOW(step1[1] + step1[6]);
+  output[2] = WRAPLOW(step1[2] + step1[5]);
+  output[3] = WRAPLOW(step1[3] + step1[4]);
+  output[4] = WRAPLOW(step1[3] - step1[4]);
+  output[5] = WRAPLOW(step1[2] - step1[5]);
+  output[6] = WRAPLOW(step1[1] - step1[6]);
+  output[7] = WRAPLOW(step1[0] - step1[7]);
+}
+
+void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                               int stride, int bd) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  for (i = 0; i < 8; ++i) {
+    high_idct8(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    high_idct8(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+  }
+}
+
+void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                              int stride, int bd) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i)
+      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_high_t x0 = input[0];
+  tran_high_t x1 = input[1];
+  tran_high_t x2 = input[2];
+  tran_high_t x3 = input[3];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3)) {
+    vpx_memset(output, 0, 4 * sizeof(*output));
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+
+  x0 = s0 + s3 + s5;
+  x1 = s1 - s4 - s6;
+  x2 = sinpi_3_9 * s7;
+  x3 = s2;
+
+  s0 = x0 + x3;
+  s1 = x1 + x3;
+  s2 = x2;
+  s3 = x0 + x1 - x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = WRAPLOW(dct_const_round_shift(s0));
+  output[1] = WRAPLOW(dct_const_round_shift(s1));
+  output[2] = WRAPLOW(dct_const_round_shift(s2));
+  output[3] = WRAPLOW(dct_const_round_shift(s3));
+}
+
+void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                              int stride, int tx_type, int bd) {
+  const high_transform_2d IHT_4[] = {
+    { high_idct4, high_idct4  },    // DCT_DCT  = 0
+    { high_iadst4, high_idct4 },    // ADST_DCT = 1
+    { high_idct4, high_iadst4 },    // DCT_ADST = 2
+    { high_iadst4, high_iadst4 }    // ADST_ADST = 3
+  };
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[4], temp_out[4];
+
+  // Inverse transform row vectors.
+  for (i = 0; i < 4; ++i) {
+    IHT_4[tx_type].rows(input, outptr, bd);
+    input  += 4;
+    outptr += 4;
+  }
+
+  // Inverse transform column vectors.
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    IHT_4[tx_type].cols(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+  }
+}
+
+static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    vpx_memset(output, 0, 8 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x4);
+  output[2] = WRAPLOW(x6);
+  output[3] = WRAPLOW(-x2);
+  output[4] = WRAPLOW(x3);
+  output[5] = WRAPLOW(-x7);
+  output[6] = WRAPLOW(x5);
+  output[7] = WRAPLOW(-x1);
+}
+
+static const high_transform_2d HIGH_IHT_8[] = {
+  { high_idct8,  high_idct8  },  // DCT_DCT  = 0
+  { high_iadst8, high_idct8  },  // ADST_DCT = 1
+  { high_idct8,  high_iadst8 },  // DCT_ADST = 2
+  { high_iadst8, high_iadst8 }   // ADST_ADST = 3
+};
+
+void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                              int stride, int tx_type, int bd) {
+  int i, j;
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
+  const high_transform_2d ht = HIGH_IHT_8[tx_type];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Inverse transform row vectors.
+  for (i = 0; i < 8; ++i) {
+    ht.rows(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Inverse transform column vectors.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    ht.cols(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+  }
+}
+
+void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                               int stride, int bd) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  // Only the first 4 rows have non-zero coefs.
+  for (i = 0; i < 4; ++i) {
+    high_idct8(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+  // Then transform columns.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    high_idct8(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+  }
+}
+
+static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+  (void) bd;
+
+  // stage 1
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = WRAPLOW(step2[0] + step2[15]);
+  output[1] = WRAPLOW(step2[1] + step2[14]);
+  output[2] = WRAPLOW(step2[2] + step2[13]);
+  output[3] = WRAPLOW(step2[3] + step2[12]);
+  output[4] = WRAPLOW(step2[4] + step2[11]);
+  output[5] = WRAPLOW(step2[5] + step2[10]);
+  output[6] = WRAPLOW(step2[6] + step2[9]);
+  output[7] = WRAPLOW(step2[7] + step2[8]);
+  output[8] = WRAPLOW(step2[7] - step2[8]);
+  output[9] = WRAPLOW(step2[6] - step2[9]);
+  output[10] = WRAPLOW(step2[5] - step2[10]);
+  output[11] = WRAPLOW(step2[4] - step2[11]);
+  output[12] = WRAPLOW(step2[3] - step2[12]);
+  output[13] = WRAPLOW(step2[2] - step2[13]);
+  output[14] = WRAPLOW(step2[1] - step2[14]);
+  output[15] = WRAPLOW(step2[0] - step2[15]);
+}
+
+void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  for (i = 0; i < 16; ++i) {
+    high_idct16(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    high_idct16(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    vpx_memset(output, 0, 16 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+  x8  = WRAPLOW(dct_const_round_shift(s0 - s8));
+  x9  = WRAPLOW(dct_const_round_shift(s1 - s9));
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = WRAPLOW(s0 + s4);
+  x1 = WRAPLOW(s1 + s5);
+  x2 = WRAPLOW(s2 + s6);
+  x3 = WRAPLOW(s3 + s7);
+  x4 = WRAPLOW(s0 - s4);
+  x5 = WRAPLOW(s1 - s5);
+  x6 = WRAPLOW(s2 - s6);
+  x7 = WRAPLOW(s3 - s7);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+  x8 = WRAPLOW(s8 + s10);
+  x9 = WRAPLOW(s9 + s11);
+  x10 = WRAPLOW(s8 - s10);
+  x11 = WRAPLOW(s9 - s11);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (-x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (-x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+  x10 = WRAPLOW(dct_const_round_shift(s10));
+  x11 = WRAPLOW(dct_const_round_shift(s11));
+  x14 = WRAPLOW(dct_const_round_shift(s14));
+  x15 = WRAPLOW(dct_const_round_shift(s15));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x8);
+  output[2] = WRAPLOW(x12);
+  output[3] = WRAPLOW(-x4);
+  output[4] = WRAPLOW(x6);
+  output[5] = WRAPLOW(x14);
+  output[6] = WRAPLOW(x10);
+  output[7] = WRAPLOW(x2);
+  output[8] = WRAPLOW(x3);
+  output[9] = WRAPLOW(x11);
+  output[10] = WRAPLOW(x15);
+  output[11] = WRAPLOW(x7);
+  output[12] = WRAPLOW(x5);
+  output[13] = WRAPLOW(-x13);
+  output[14] = WRAPLOW(x9);
+  output[15] = WRAPLOW(-x1);
+}
+
+static const high_transform_2d HIGH_IHT_16[] = {
+  { high_idct16,  high_idct16  },  // DCT_DCT  = 0
+  { high_iadst16, high_idct16  },  // ADST_DCT = 1
+  { high_idct16,  high_iadst16 },  // DCT_ADST = 2
+  { high_iadst16, high_iadst16 }   // ADST_ADST = 3
+};
+
+void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int tx_type, int bd) {
+  int i, j;
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
+  const high_transform_2d ht = HIGH_IHT_16[tx_type];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 16; ++i) {
+    ht.rows(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    ht.cols(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
+  for (i = 0; i < 4; ++i) {
+    high_idct16(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    high_idct16(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int bd) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i)
+      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[32], step2[32];
+  tran_high_t temp1, temp2;
+  (void) bd;
+
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step2[16] = WRAPLOW(step1[16] + step1[17]);
+  step2[17] = WRAPLOW(step1[16] - step1[17]);
+  step2[18] = WRAPLOW(-step1[18] + step1[19]);
+  step2[19] = WRAPLOW(step1[18] + step1[19]);
+  step2[20] = WRAPLOW(step1[20] + step1[21]);
+  step2[21] = WRAPLOW(step1[20] - step1[21]);
+  step2[22] = WRAPLOW(-step1[22] + step1[23]);
+  step2[23] = WRAPLOW(step1[22] + step1[23]);
+  step2[24] = WRAPLOW(step1[24] + step1[25]);
+  step2[25] = WRAPLOW(step1[24] - step1[25]);
+  step2[26] = WRAPLOW(-step1[26] + step1[27]);
+  step2[27] = WRAPLOW(step1[26] + step1[27]);
+  step2[28] = WRAPLOW(step1[28] + step1[29]);
+  step2[29] = WRAPLOW(step1[28] - step1[29]);
+  step2[30] = WRAPLOW(-step1[30] + step1[31]);
+  step2[31] = WRAPLOW(step1[30] + step1[31]);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = WRAPLOW(step1[16] + step1[19]);
+  step2[17] = WRAPLOW(step1[17] + step1[18]);
+  step2[18] = WRAPLOW(step1[17] - step1[18]);
+  step2[19] = WRAPLOW(step1[16] - step1[19]);
+  step2[20] = WRAPLOW(-step1[20] + step1[23]);
+  step2[21] = WRAPLOW(-step1[21] + step1[22]);
+  step2[22] = WRAPLOW(step1[21] + step1[22]);
+  step2[23] = WRAPLOW(step1[20] + step1[23]);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27]);
+  step2[25] = WRAPLOW(step1[25] + step1[26]);
+  step2[26] = WRAPLOW(step1[25] - step1[26]);
+  step2[27] = WRAPLOW(step1[24] - step1[27]);
+  step2[28] = WRAPLOW(-step1[28] + step1[31]);
+  step2[29] = WRAPLOW(-step1[29] + step1[30]);
+  step2[30] = WRAPLOW(step1[29] + step1[30]);
+  step2[31] = WRAPLOW(step1[28] + step1[31]);
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[14] = WRAPLOW(step1[14]);
+  step2[15] = WRAPLOW(step1[15]);
+
+  step2[16] = WRAPLOW(step1[16] + step1[23]);
+  step2[17] = WRAPLOW(step1[17] + step1[22]);
+  step2[18] = WRAPLOW(step1[18] + step1[21]);
+  step2[19] = WRAPLOW(step1[19] + step1[20]);
+  step2[20] = WRAPLOW(step1[19] - step1[20]);
+  step2[21] = WRAPLOW(step1[18] - step1[21]);
+  step2[22] = WRAPLOW(step1[17] - step1[22]);
+  step2[23] = WRAPLOW(step1[16] - step1[23]);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31]);
+  step2[25] = WRAPLOW(-step1[25] + step1[30]);
+  step2[26] = WRAPLOW(-step1[26] + step1[29]);
+  step2[27] = WRAPLOW(-step1[27] + step1[28]);
+  step2[28] = WRAPLOW(step1[27] + step1[28]);
+  step2[29] = WRAPLOW(step1[26] + step1[29]);
+  step2[30] = WRAPLOW(step1[25] + step1[30]);
+  step2[31] = WRAPLOW(step1[24] + step1[31]);
+
+  // stage 7
+  step1[0] = WRAPLOW(step2[0] + step2[15]);
+  step1[1] = WRAPLOW(step2[1] + step2[14]);
+  step1[2] = WRAPLOW(step2[2] + step2[13]);
+  step1[3] = WRAPLOW(step2[3] + step2[12]);
+  step1[4] = WRAPLOW(step2[4] + step2[11]);
+  step1[5] = WRAPLOW(step2[5] + step2[10]);
+  step1[6] = WRAPLOW(step2[6] + step2[9]);
+  step1[7] = WRAPLOW(step2[7] + step2[8]);
+  step1[8] = WRAPLOW(step2[7] - step2[8]);
+  step1[9] = WRAPLOW(step2[6] - step2[9]);
+  step1[10] = WRAPLOW(step2[5] - step2[10]);
+  step1[11] = WRAPLOW(step2[4] - step2[11]);
+  step1[12] = WRAPLOW(step2[3] - step2[12]);
+  step1[13] = WRAPLOW(step2[2] - step2[13]);
+  step1[14] = WRAPLOW(step2[1] - step2[14]);
+  step1[15] = WRAPLOW(step2[0] - step2[15]);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
+  output[0] = WRAPLOW(step1[0] + step1[31]);
+  output[1] = WRAPLOW(step1[1] + step1[30]);
+  output[2] = WRAPLOW(step1[2] + step1[29]);
+  output[3] = WRAPLOW(step1[3] + step1[28]);
+  output[4] = WRAPLOW(step1[4] + step1[27]);
+  output[5] = WRAPLOW(step1[5] + step1[26]);
+  output[6] = WRAPLOW(step1[6] + step1[25]);
+  output[7] = WRAPLOW(step1[7] + step1[24]);
+  output[8] = WRAPLOW(step1[8] + step1[23]);
+  output[9] = WRAPLOW(step1[9] + step1[22]);
+  output[10] = WRAPLOW(step1[10] + step1[21]);
+  output[11] = WRAPLOW(step1[11] + step1[20]);
+  output[12] = WRAPLOW(step1[12] + step1[19]);
+  output[13] = WRAPLOW(step1[13] + step1[18]);
+  output[14] = WRAPLOW(step1[14] + step1[17]);
+  output[15] = WRAPLOW(step1[15] + step1[16]);
+  output[16] = WRAPLOW(step1[15] - step1[16]);
+  output[17] = WRAPLOW(step1[14] - step1[17]);
+  output[18] = WRAPLOW(step1[13] - step1[18]);
+  output[19] = WRAPLOW(step1[12] - step1[19]);
+  output[20] = WRAPLOW(step1[11] - step1[20]);
+  output[21] = WRAPLOW(step1[10] - step1[21]);
+  output[22] = WRAPLOW(step1[9] - step1[22]);
+  output[23] = WRAPLOW(step1[8] - step1[23]);
+  output[24] = WRAPLOW(step1[7] - step1[24]);
+  output[25] = WRAPLOW(step1[6] - step1[25]);
+  output[26] = WRAPLOW(step1[5] - step1[26]);
+  output[27] = WRAPLOW(step1[4] - step1[27]);
+  output[28] = WRAPLOW(step1[3] - step1[28]);
+  output[29] = WRAPLOW(step1[2] - step1[29]);
+  output[30] = WRAPLOW(step1[1] - step1[30]);
+  output[31] = WRAPLOW(step1[0] - step1[31]);
+}
+
+void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
+  tran_low_t out[32 * 32];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
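+  // In high-bitdepth builds the destination is really a uint16_t pixel
+  // buffer passed through a uint8_t pointer; CONVERT_TO_SHORTPTR recovers
+  // the usable uint16_t pointer.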
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    tran_low_t zero_coeff[16];
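+    // Pairwise OR-reduce the row's 32 coefficients (32 -> 16 -> 8 -> 4 -> 2)
+    // so a single test below tells whether the whole row is zero.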
+    for (j = 0; j < 16; ++j)
+      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+    for (j = 0; j < 8; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 4; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 2; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+    if (zero_coeff[0] | zero_coeff[1])
+      high_idct32(input, outptr, bd);
+    else
+      vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    high_idct32(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t out[32 * 32] = {0};
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  // Only upper-left 8x8 has non-zero coeff.
+  for (i = 0; i < 8; ++i) {
+    high_idct32(input, outptr, bd);
+    input += 32;
+    outptr += 32;
+  }
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    high_idct32(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int bd) {
+  int i, j;
+  int a1;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
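+  // DC-only shortcut: input[0] is scaled by cospi_16_64 twice (once per 1-D
+  // pass) and rounded by 1/64, matching the full transform's final stage.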
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  for (j = 0; j < 32; ++j) {
+    for (i = 0; i < 32; ++i)
+      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+// idct
+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd) {
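+  // eob counts coefficients up to and including the last non-zero one, so
+  // eob == 1 means a DC-only block.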
+  if (eob > 1)
+    vp9_high_idct4x4_16_add(input, dest, stride, bd);
+  else
+    vp9_high_idct4x4_1_add(input, dest, stride, bd);
+}
+
+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd) {
+  if (eob > 1)
+    vp9_high_iwht4x4_16_add(input, dest, stride, bd);
+  else
+    vp9_high_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd) {
+  // If dc is 1, then input[0] is the reconstructed value and does not need
+  // dequantization. Also, when dc is 1, dc is counted in eobs, namely
+  // eobs >= 1.
+
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+  // Combine that with code here.
+  // DC only DCT coefficient
+  if (eob == 1) {
+    vp9_high_idct8x8_1_add(input, dest, stride, bd);
+  } else if (eob <= 10) {
+    vp9_high_idct8x8_10_add(input, dest, stride, bd);
+  } else {
+    vp9_high_idct8x8_64_add(input, dest, stride, bd);
+  }
+}
+
+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to separate different cases.
+  // DC only DCT coefficient.
+  if (eob == 1) {
+    vp9_high_idct16x16_1_add(input, dest, stride, bd);
+  } else if (eob <= 10) {
+    vp9_high_idct16x16_10_add(input, dest, stride, bd);
+  } else {
+    vp9_high_idct16x16_256_add(input, dest, stride, bd);
+  }
+}
+
+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  // Non-zero coeff only in upper-left 8x8
+  if (eob == 1) {
+    vp9_high_idct32x32_1_add(input, dest, stride, bd);
+  } else if (eob <= 34) {
+    vp9_high_idct32x32_34_add(input, dest, stride, bd);
+  } else {
+    vp9_high_idct32x32_1024_add(input, dest, stride, bd);
+  }
+}
+
+// iht
+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+                         uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT)
+    vp9_high_idct4x4_add(input, dest, stride, eob, bd);
+  else
+    vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd);
+}
+
+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                         uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT) {
+    vp9_high_idct8x8_add(input, dest, stride, eob, bd);
+  } else {
+    vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd);
+  }
+}
+
+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT) {
+    vp9_high_idct16x16_add(input, dest, stride, eob, bd);
+  } else {
+    vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
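
The dispatch wrappers above use one pattern at every transform size: eob == 1
takes a DC-only path, a small eob takes a sparse path, and everything else
runs the full transform. A minimal standalone sketch of that pattern, using
the 32x32 thresholds from this patch and hypothetical stand-in handlers:

    #include <stdio.h>

    /* Hypothetical stand-ins for vp9_high_idct32x32_{1,34,1024}_add. */
    static void idct32_dc_only(void) { puts("eob == 1: DC-only path"); }
    static void idct32_sparse(void)  { puts("eob <= 34: upper-left 8x8 path"); }
    static void idct32_full(void)    { puts("full 1024-coefficient path"); }

    static void idct32_add(int eob) {
      if (eob == 1)
        idct32_dc_only();
      else if (eob <= 34)
        idct32_sparse();
      else
        idct32_full();
    }

    int main(void) {
      idct32_add(1);    /* only the DC coefficient is present */
      idct32_add(20);   /* sparse block */
      idct32_add(500);  /* dense block */
      return 0;
    }
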
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 7f595e1..694be3c 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -36,52 +36,69 @@
 #define dual_set_epi16(a, b) \
   _mm_set_epi16(b, b, b, b, a, a, a, a)
 
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif
+
 // Constants:
 //  for (int i = 1; i< 32; ++i)
 //    printf("static const int cospi_%d_64 = %.0f;\n", i,
 //           round(16384 * cos(i*M_PI/64)));
 // Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const int cospi_1_64  = 16364;
-static const int cospi_2_64  = 16305;
-static const int cospi_3_64  = 16207;
-static const int cospi_4_64  = 16069;
-static const int cospi_5_64  = 15893;
-static const int cospi_6_64  = 15679;
-static const int cospi_7_64  = 15426;
-static const int cospi_8_64  = 15137;
-static const int cospi_9_64  = 14811;
-static const int cospi_10_64 = 14449;
-static const int cospi_11_64 = 14053;
-static const int cospi_12_64 = 13623;
-static const int cospi_13_64 = 13160;
-static const int cospi_14_64 = 12665;
-static const int cospi_15_64 = 12140;
-static const int cospi_16_64 = 11585;
-static const int cospi_17_64 = 11003;
-static const int cospi_18_64 = 10394;
-static const int cospi_19_64 = 9760;
-static const int cospi_20_64 = 9102;
-static const int cospi_21_64 = 8423;
-static const int cospi_22_64 = 7723;
-static const int cospi_23_64 = 7005;
-static const int cospi_24_64 = 6270;
-static const int cospi_25_64 = 5520;
-static const int cospi_26_64 = 4756;
-static const int cospi_27_64 = 3981;
-static const int cospi_28_64 = 3196;
-static const int cospi_29_64 = 2404;
-static const int cospi_30_64 = 1606;
-static const int cospi_31_64 = 804;
+static const tran_high_t cospi_1_64  = 16364;
+static const tran_high_t cospi_2_64  = 16305;
+static const tran_high_t cospi_3_64  = 16207;
+static const tran_high_t cospi_4_64  = 16069;
+static const tran_high_t cospi_5_64  = 15893;
+static const tran_high_t cospi_6_64  = 15679;
+static const tran_high_t cospi_7_64  = 15426;
+static const tran_high_t cospi_8_64  = 15137;
+static const tran_high_t cospi_9_64  = 14811;
+static const tran_high_t cospi_10_64 = 14449;
+static const tran_high_t cospi_11_64 = 14053;
+static const tran_high_t cospi_12_64 = 13623;
+static const tran_high_t cospi_13_64 = 13160;
+static const tran_high_t cospi_14_64 = 12665;
+static const tran_high_t cospi_15_64 = 12140;
+static const tran_high_t cospi_16_64 = 11585;
+static const tran_high_t cospi_17_64 = 11003;
+static const tran_high_t cospi_18_64 = 10394;
+static const tran_high_t cospi_19_64 = 9760;
+static const tran_high_t cospi_20_64 = 9102;
+static const tran_high_t cospi_21_64 = 8423;
+static const tran_high_t cospi_22_64 = 7723;
+static const tran_high_t cospi_23_64 = 7005;
+static const tran_high_t cospi_24_64 = 6270;
+static const tran_high_t cospi_25_64 = 5520;
+static const tran_high_t cospi_26_64 = 4756;
+static const tran_high_t cospi_27_64 = 3981;
+static const tran_high_t cospi_28_64 = 3196;
+static const tran_high_t cospi_29_64 = 2404;
+static const tran_high_t cospi_30_64 = 1606;
+static const tran_high_t cospi_31_64 = 804;
 
 //  16384 * sqrt(2) * sin(kPi/9) * 2 / 3
-static const int sinpi_1_9 = 5283;
-static const int sinpi_2_9 = 9929;
-static const int sinpi_3_9 = 13377;
-static const int sinpi_4_9 = 15212;
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
 
-static INLINE int dct_const_round_shift(int input) {
-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+#if CONFIG_VP9_HIGHBITDEPTH
+  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+  // stay within the ranges:
+  // - 8 bit: signed 16 bit integer
+  // - 10 bit: signed 18 bit integer
+  // - 12 bit: signed 20 bit integer
+#elif CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid VP9 input streams, intermediate stage coefficients should always
   // stay within the range of a signed 16 bit integer. Coefficients can go out
   // of this range for invalid/corrupt VP9 streams. However, strictly checking
@@ -91,32 +108,59 @@
   assert(INT16_MIN <= rv);
   assert(rv <= INT16_MAX);
 #endif
-  return (int16_t)rv;
+  return (tran_low_t)rv;
 }
 
-typedef void (*transform_1d)(const int16_t*, int16_t*);
+typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
 
 typedef struct {
   transform_1d cols, rows;  // vertical and horizontal
 } transform_2d;
 
-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*high_transform_1d)(const tran_low_t*, tran_low_t*, int bd);
 
-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int
+typedef struct {
+  high_transform_1d cols, rows;  // vertical and horizontal
+} high_transform_2d;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int
                        eob);
-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob);
 
-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                     int stride, int eob);
-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                     int stride, int eob);
-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                       int stride, int eob);
 
-
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd);
+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd);
+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd);
+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+                         uint8_t *dest, int stride, int eob, int bd);
+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                         uint8_t *dest, int stride, int eob, int bd);
+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #ifdef __cplusplus
 }  // extern "C"
 #endif
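
A self-contained sketch of the widened rounding helper above, under the
high-bitdepth configuration (DCT_CONST_BITS is 14 in libvpx, matching the
16384-scaled cospi constants; the sample values are illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    /* High-bitdepth typedefs from the header above. */
    typedef int64_t tran_high_t;
    typedef int32_t tran_low_t;

    #define DCT_CONST_BITS 14
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    static tran_low_t dct_const_round_shift(tran_high_t input) {
      tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
      return (tran_low_t)rv;
    }

    int main(void) {
      const tran_high_t cospi_16_64 = 11585;
      /* A butterfly of two maximal 12-bit samples stays well inside the
       * 64-bit intermediate, then rounds back down to tran_low_t. */
      tran_high_t temp = (tran_high_t)(2047 + 2047) * cospi_16_64;
      printf("%d\n", (int)dct_const_round_shift(temp));  /* prints 2895 */
      return 0;
    }
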
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 3b39d42..4d03c4d 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1193,7 +1193,7 @@
   }
 }
 
-void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
                           VP9_COMMON *cm,
                           struct macroblockd_plane planes[MAX_MB_PLANE],
                           int start, int stop, int y_only) {
@@ -1247,9 +1247,8 @@
                        y_only);
 }
 
-int vp9_loop_filter_worker(void *arg1, void *arg2) {
-  LFWorkerData *const lf_data = (LFWorkerData*)arg1;
-  (void)arg2;
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+  (void)unused;
   vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                        lf_data->start, lf_data->stop, lf_data->y_only);
   return 1;
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 6fa2773..69e7dd0 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -111,13 +111,13 @@
                            int y_only, int partial_frame);
 
 // Apply the loop filter to [start, stop) macro block rows in frame_buffer.
-void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
                           struct VP9Common *cm,
                           struct macroblockd_plane planes[MAX_MB_PLANE],
                           int start, int stop, int y_only);
 
 typedef struct LoopFilterWorkerData {
-  const YV12_BUFFER_CONFIG *frame_buffer;
+  YV12_BUFFER_CONFIG *frame_buffer;
   struct VP9Common *cm;
   struct macroblockd_plane planes[MAX_MB_PLANE];
 
@@ -129,8 +129,8 @@
   int num_lf_workers;
 } LFWorkerData;
 
-// Operates on the rows described by LFWorkerData passed as 'arg1'.
-int vp9_loop_filter_worker(void *arg1, void *arg2);
+// Operates on the rows described by 'lf_data'.
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h
index 3eb7f9d..5d89da8 100644
--- a/vp9/common/vp9_mv.h
+++ b/vp9/common/vp9_mv.h
@@ -34,6 +34,14 @@
   int32_t col;
 } MV32;
 
+static INLINE int is_zero_mv(const MV *mv) {
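+  // MV packs two int16_t fields (row, col), so the whole struct can be
+  // tested as one 32-bit word.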
+  return *((const uint32_t *)mv) == 0;
+}
+
+static INLINE int is_equal_mv(const MV *a, const MV *b) {
+  return *((const uint32_t *)a) == *((const uint32_t *)b);
+}
+
 static INLINE void clamp_mv(MV *mv, int min_col, int max_col,
                             int min_row, int max_row) {
   mv->col = clamp(mv->col, min_col, max_col);
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index dff077c..637867a 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -84,6 +84,10 @@
   int subsampling_x;
   int subsampling_y;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  int use_highbitdepth;  // Marks if we need to use 16-bit frame buffers.
+#endif
+
   YV12_BUFFER_CONFIG *frame_to_show;
 
   RefCntBuffer frame_bufs[FRAME_BUFFERS];
@@ -137,6 +141,7 @@
 
   int mi_idx;
   int prev_mi_idx;
+  int mi_alloc_size;
   MODE_INFO *mip_array[2];
   MODE_INFO **mi_grid_base_array[2];
 
@@ -178,8 +183,8 @@
   unsigned int current_video_frame;
   BITSTREAM_PROFILE profile;
 
-  // BITS_8 in versions 0 and 1, BITS_10 or BITS_12 in version 2
-  BIT_DEPTH bit_depth;
+  // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
+  vpx_bit_depth_t bit_depth;
 
 #if CONFIG_VP9_POSTPROC
   struct postproc_state  postproc_state;
@@ -207,7 +212,7 @@
     return NULL;
   if (cm->ref_frame_map[index] < 0)
     return NULL;
-  assert(cm->ref_frame_map[index] < REF_FRAMES);
+  assert(cm->ref_frame_map[index] < FRAME_BUFFERS);
   return &cm->frame_bufs[cm->ref_frame_map[index]].buf;
 }
 
@@ -275,6 +280,11 @@
   }
 }
 
+static INLINE int calc_mi_size(int len) {
+  // len is in mi units.
+  return len + MI_BLOCK_SIZE;
+}
+
 static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
                                   int mi_row, int bh,
                                   int mi_col, int bw,
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index abda4e6..e4e6ce7 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -366,6 +366,9 @@
                            unsigned int width, unsigned int height, int pitch) {
   unsigned int i, j;
 
+  // TODO(jbb): why does the simd code use both but the c code doesn't?
+  // Normalize and fix.
+  (void) bothclamp;
   for (i = 0; i < height; i++) {
     uint8_t *pos = start + i * pitch;
     char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 403e105..471929a 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -9,11 +9,9 @@
  */
 
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_once.h"
-
-#include "./vp9_rtcd.h"
 
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_onyxc_int.h"
@@ -292,32 +290,32 @@
 typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
 
-static intra_pred_fn pred[INTRA_MODES][4];
-static intra_pred_fn dc_pred[2][2][4];
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES];
+static intra_pred_fn dc_pred[2][2][TX_SIZES];
 
-static void init_intra_pred_fn_ptrs(void) {
-#define intra_pred_allsizes(l, type) \
-  l[0] = vp9_##type##_predictor_4x4; \
-  l[1] = vp9_##type##_predictor_8x8; \
-  l[2] = vp9_##type##_predictor_16x16; \
-  l[3] = vp9_##type##_predictor_32x32
+void vp9_init_intra_predictors(void) {
+#define INIT_ALL_SIZES(p, type) \
+  p[TX_4X4] = vp9_##type##_predictor_4x4; \
+  p[TX_8X8] = vp9_##type##_predictor_8x8; \
+  p[TX_16X16] = vp9_##type##_predictor_16x16; \
+  p[TX_32X32] = vp9_##type##_predictor_32x32
 
-  intra_pred_allsizes(pred[V_PRED], v);
-  intra_pred_allsizes(pred[H_PRED], h);
-  intra_pred_allsizes(pred[D207_PRED], d207);
-  intra_pred_allsizes(pred[D45_PRED], d45);
-  intra_pred_allsizes(pred[D63_PRED], d63);
-  intra_pred_allsizes(pred[D117_PRED], d117);
-  intra_pred_allsizes(pred[D135_PRED], d135);
-  intra_pred_allsizes(pred[D153_PRED], d153);
-  intra_pred_allsizes(pred[TM_PRED], tm);
+  INIT_ALL_SIZES(pred[V_PRED], v);
+  INIT_ALL_SIZES(pred[H_PRED], h);
+  INIT_ALL_SIZES(pred[D207_PRED], d207);
+  INIT_ALL_SIZES(pred[D45_PRED], d45);
+  INIT_ALL_SIZES(pred[D63_PRED], d63);
+  INIT_ALL_SIZES(pred[D117_PRED], d117);
+  INIT_ALL_SIZES(pred[D135_PRED], d135);
+  INIT_ALL_SIZES(pred[D153_PRED], d153);
+  INIT_ALL_SIZES(pred[TM_PRED], tm);
 
-  intra_pred_allsizes(dc_pred[0][0], dc_128);
-  intra_pred_allsizes(dc_pred[0][1], dc_top);
-  intra_pred_allsizes(dc_pred[1][0], dc_left);
-  intra_pred_allsizes(dc_pred[1][1], dc);
+  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+  INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+  INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+  INIT_ALL_SIZES(dc_pred[1][1], dc);
 
-#undef intra_pred_allsizes
+#undef INIT_ALL_SIZES
 }
 
 static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
@@ -343,8 +341,6 @@
   // 129  G   H  ..  S   T   T   T   T   T
   // ..
 
-  once(init_intra_pred_fn_ptrs);
-
   // Get current frame pointer, width and height.
   if (plane == 0) {
     frame_width = xd->cur_buf->y_width;
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index d09d2a1..845f3bc 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -18,6 +18,8 @@
 extern "C" {
 #endif
 
+void vp9_init_intra_predictors(void);
+
 void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
                              TX_SIZE tx_size, PREDICTION_MODE mode,
                              const uint8_t *ref, int ref_stride,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 708f41b..32bcf9a 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -6,6 +6,7 @@
 
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_idct.h"
 
 struct macroblockd;
 
@@ -45,6 +46,13 @@
   $avx_x86_64 = $avx2_x86_64 = '';
 }
 
+# optimizations which depend on multiple features
+if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
+  $avx2_ssse3 = 'avx2';
+} else {
+  $avx2_ssse3 = '';
+}
+
 #
 # RECON
 #
@@ -268,7 +276,7 @@
 #
 if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
 add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
-specialize qw/vp9_mbpost_proc_down mmx sse2/;
+specialize qw/vp9_mbpost_proc_down sse2/;
 $vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm;
 
 add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
@@ -276,23 +284,14 @@
 $vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm;
 
 add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
-specialize qw/vp9_post_proc_down_and_across mmx sse2/;
+specialize qw/vp9_post_proc_down_and_across sse2/;
 $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
 
 add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp9_plane_add_noise mmx sse2/;
+specialize qw/vp9_plane_add_noise sse2/;
 $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
 }
 
-add_proto qw/void vp9_blend_mb_inner/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_mb_inner/;
-
-add_proto qw/void vp9_blend_mb_outer/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_mb_outer/;
-
-add_proto qw/void vp9_blend_b/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_b/;
-
 #
 # Sub Pixel Filters
 #
@@ -305,15 +304,15 @@
 $vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
 
 add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/;
+specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
 $vp9_convolve8_neon_asm=vp9_convolve8_neon;
 
 add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
 $vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
 
 add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
 $vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
 
 add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
@@ -331,68 +330,177 @@
 #
 # dct
 #
-add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
-$vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct4x4_1_add/;
 
-add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
-$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+  add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct4x4_16_add/;
 
-add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
-$vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+  add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_1_add/;
 
-add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-$vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+  add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_64_add/;
 
-add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-$vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+  add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_12_add/;
 
-add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
-$vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+  add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_1_add/;
 
-add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
-$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_256_add/;
 
-add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
-$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_10_add/;
 
-add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+  add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_1024_add/;
 
-add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
+  add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_34_add/;
 
-add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+  add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_1_add/;
 
-add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
-$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+  specialize qw/vp9_iht4x4_16_add/;
 
-add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
-$vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+  specialize qw/vp9_iht8x8_64_add/;
 
-add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type";
-specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+  specialize qw/vp9_iht16x16_256_add/;
+
+  # dct and add
+
+  add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_iwht4x4_1_add/;
+
+  add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_iwht4x4_16_add/;
+} else {
+  add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
+  $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+
+  add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
+  $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+
+  add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
+  $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+
+  add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+  $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+
+  add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+  $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+
+  add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
+  $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+
+  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
+  $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+
+  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
+  $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+
+  add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
+  $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+
+  add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
+  $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
+
+  add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
+  $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+
+  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+  specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
+  $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+
+  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+  specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
+  $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+
+  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+  specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+
+  # dct and add
+
+  add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_iwht4x4_1_add/;
+
+  add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_iwht4x4_16_add/;
+}
+
+
+# High bitdepth functions
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+#
+# dct
+#
+add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct4x4_1_add/;
+
+add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct4x4_16_add/;
+
+add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_1_add/;
+
+add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_64_add/;
+
+add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_10_add/;
+
+add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_1_add/;
+
+add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_256_add/;
+
+add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_10_add/;
+
+add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_1024_add/;
+
+add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_34_add/;
+
+add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_1_add/;
+
+add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+specialize qw/vp9_high_iht4x4_16_add/;
+
+add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+specialize qw/vp9_high_iht8x8_64_add/;
+
+add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+specialize qw/vp9_high_iht16x16_256_add/;
 
 # dct and add
 
-add_proto qw/void vp9_iwht4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_iwht4x4_1_add/;
+add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_iwht4x4_1_add/;
 
-add_proto qw/void vp9_iwht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_iwht4x4_16_add/;
+add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_iwht4x4_16_add/;
+}
 
 #
 # Encoder functions below this point.
@@ -420,19 +528,19 @@
 specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";
+specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
+specialize qw/vp9_variance16x8/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc";
+specialize qw/vp9_variance8x16/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
 specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
@@ -444,7 +552,7 @@
 specialize qw/vp9_variance4x8/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";
+specialize qw/vp9_variance4x4/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -545,16 +653,16 @@
 specialize qw/vp9_sad32x32 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad16x16 mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_sad16x16 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc";
+specialize qw/vp9_sad16x8/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc";
+specialize qw/vp9_sad8x16/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad8x8 mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_sad8x8 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vp9_sad8x4/, "$sse2_x86inc";
@@ -563,7 +671,7 @@
 specialize qw/vp9_sad4x8/, "$sse_x86inc";
 
 add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc";
+specialize qw/vp9_sad4x4/, "$sse_x86inc";
 
 add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc";
@@ -693,38 +801,57 @@
 specialize qw/vp9_sad4x4x4d sse/;
 
 add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc";
+specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x16/;
+specialize qw/vp9_mse8x16/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x8/;
+specialize qw/vp9_mse16x8/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x8/;
+specialize qw/vp9_mse8x8/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
-specialize qw/vp9_get_mb_ss mmx sse2/;
+specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
 # ENCODEMB INVOKE
 
-add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
-
 add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vp9_subtract_block neon/, "$sse2_x86inc";
 
-add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  # The transform coefficients are held in 32-bit values, so the assembler
+  # code for vp9_block_error can no longer be used.
+  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_block_error/;
 
-add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp/;
 
-add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp_32x32/;
 
-add_proto qw/void vp9_quantize_b_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b/;
+
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/;
+} else {
+  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_block_error avx2/;
+
+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
+
+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
+
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/;
+}
 
 #
 # Structured Similarity (SSIM)
@@ -738,44 +865,86 @@
 }
 
 # fdct functions
-add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht4x4 sse2/;
 
-add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht8x8 sse2/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht4x4/;
 
-add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht16x16 sse2/;
+  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht8x8/;
 
-add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht16x16/;
 
-add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4_1 sse2/;
+  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fwht4x4/;
 
-add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4 sse2/;
+  add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct4x4_1/;
 
-add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8_1 sse2 neon/;
+  add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct4x4/;
 
-add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
+  add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct8x8_1/;
 
-add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16_1 sse2/;
+  add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct8x8/;
 
-add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16 sse2/;
+  add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct16x16_1/;
 
-add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32_1 sse2/;
+  add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct16x16/;
 
-add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32 sse2 avx2/;
+  add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32_1/;
 
-add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+  add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32/;
+
+  add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32_rd/;
+} else {
+  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht4x4 sse2/;
+
+  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht8x8 sse2/;
+
+  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht16x16 sse2/;
+
+  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+
+  add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct4x4_1 sse2/;
+
+  add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct4x4 sse2/;
+
+  add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct8x8_1 sse2 neon/;
+
+  add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
+
+  add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct16x16_1 sse2/;
+
+  add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct16x16 sse2/;
+
+  add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32_1 sse2/;
+
+  add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32 sse2 avx2/;
+
+  add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+}
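+# Note: the highbitdepth branch above omits SIMD specializations because
+# tran_low_t widens to 32 bits under CONFIG_VP9_HIGHBITDEPTH, which the
+# existing 16-bit sse2/ssse3/avx2/neon transform kernels do not handle.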
 
 #
 # Motion search
@@ -797,6 +966,654 @@
 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
 specialize qw/vp9_temporal_filter_apply sse2/;
 
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+
+  # variance
+  add_proto qw/unsigned int vp9_high_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance4x4/;
+
+  add_proto qw/void vp9_high_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_get8x8var/;
+
+  add_proto qw/void vp9_high_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_get16x16var/;
+
+  add_proto qw/unsigned int vp9_high_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance4x4/;
+
+  add_proto qw/void vp9_high_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_10_get8x8var/;
+
+  add_proto qw/void vp9_high_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_10_get16x16var/;
+
+  add_proto qw/unsigned int vp9_high_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance4x4/;
+
+  add_proto qw/void vp9_high_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_12_get8x8var/;
+
+  add_proto qw/void vp9_high_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_12_get16x16var/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad64x64/;
+
+  add_proto qw/unsigned int vp9_high_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad32x64/;
+
+  add_proto qw/unsigned int vp9_high_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad64x32/;
+
+  add_proto qw/unsigned int vp9_high_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad32x16/;
+
+  add_proto qw/unsigned int vp9_high_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad16x32/;
+
+  add_proto qw/unsigned int vp9_high_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad32x32/;
+
+  add_proto qw/unsigned int vp9_high_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad16x16/;
+
+  add_proto qw/unsigned int vp9_high_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad16x8/;
+
+  add_proto qw/unsigned int vp9_high_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad8x16/;
+
+  add_proto qw/unsigned int vp9_high_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad8x8/;
+
+  add_proto qw/unsigned int vp9_high_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad8x4/;
+
+  add_proto qw/unsigned int vp9_high_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad4x8/;
+
+  add_proto qw/unsigned int vp9_high_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad4x4/;
+
+  add_proto qw/unsigned int vp9_high_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad64x64_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad32x64_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad64x32_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad32x16_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad16x32_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad32x32_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad16x16_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad16x8_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad8x16_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad8x8_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad8x4_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad4x8_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad4x4_avg/;
+
+  add_proto qw/void vp9_high_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad64x64x3/;
+
+  add_proto qw/void vp9_high_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad32x32x3/;
+
+  add_proto qw/void vp9_high_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad16x16x3/;
+
+  add_proto qw/void vp9_high_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad16x8x3/;
+
+  add_proto qw/void vp9_high_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad8x16x3/;
+
+  add_proto qw/void vp9_high_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad8x8x3/;
+
+  add_proto qw/void vp9_high_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad4x4x3/;
+
+  add_proto qw/void vp9_high_sad64x64x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad64x64x8/;
+
+  add_proto qw/void vp9_high_sad32x32x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad32x32x8/;
+
+  add_proto qw/void vp9_high_sad16x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad16x16x8/;
+
+  add_proto qw/void vp9_high_sad16x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad16x8x8/;
+
+  add_proto qw/void vp9_high_sad8x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad8x16x8/;
+
+  add_proto qw/void vp9_high_sad8x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad8x8x8/;
+
+  add_proto qw/void vp9_high_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad8x4x8/;
+
+  add_proto qw/void vp9_high_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad4x8x8/;
+
+  add_proto qw/void vp9_high_sad4x4x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad4x4x8/;
+
+  add_proto qw/void vp9_high_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad64x64x4d/;
+
+  add_proto qw/void vp9_high_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad32x64x4d/;
+
+  add_proto qw/void vp9_high_sad64x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad64x32x4d/;
+
+  add_proto qw/void vp9_high_sad32x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad32x16x4d/;
+
+  add_proto qw/void vp9_high_sad16x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad16x32x4d/;
+
+  add_proto qw/void vp9_high_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad32x32x4d/;
+
+  add_proto qw/void vp9_high_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad16x16x4d/;
+
+  add_proto qw/void vp9_high_sad16x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad16x8x4d/;
+
+  add_proto qw/void vp9_high_sad8x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad8x16x4d/;
+
+  add_proto qw/void vp9_high_sad8x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad8x8x4d/;
+
+  # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
+  add_proto qw/void vp9_high_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad8x4x4d/;
+
+  add_proto qw/void vp9_high_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad4x8x4d/;
+
+  add_proto qw/void vp9_high_sad4x4x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad4x4x4d/;
+
+  add_proto qw/unsigned int vp9_high_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_mse16x16/;
+
+  add_proto qw/unsigned int vp9_high_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_mse8x16/;
+
+  add_proto qw/unsigned int vp9_high_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_mse16x8/;
+
+  add_proto qw/unsigned int vp9_high_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_mse8x8/;
+
+  add_proto qw/unsigned int vp9_high_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_mse16x16/;
+
+  add_proto qw/unsigned int vp9_high_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_mse8x16/;
+
+  add_proto qw/unsigned int vp9_high_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_mse16x8/;
+
+  add_proto qw/unsigned int vp9_high_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_mse8x8/;
+
+  add_proto qw/unsigned int vp9_high_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_mse16x16/;
+
+  add_proto qw/unsigned int vp9_high_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_mse8x16/;
+
+  add_proto qw/unsigned int vp9_high_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_mse16x8/;
+
+  add_proto qw/unsigned int vp9_high_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_mse8x8/;
+
+  # ENCODEMB INVOKE
+
+  add_proto qw/int64_t vp9_high_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+  specialize qw/vp9_high_block_error/;
+
+  add_proto qw/void vp9_high_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vp9_high_subtract_block/;
+
+  add_proto qw/void vp9_high_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_high_quantize_fp/;
+
+  add_proto qw/void vp9_high_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_high_quantize_fp_32x32/;
+
+  add_proto qw/void vp9_high_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_high_quantize_b/;
+
+  add_proto qw/void vp9_high_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_high_quantize_b_32x32/;
+
+  #
+  # Structured Similarity (SSIM)
+  #
+  if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+    add_proto qw/void vp9_high_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    specialize qw/vp9_high_ssim_parms_8x8/;
+
+    add_proto qw/void vp9_high_ssim_parms_8x8_shift/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr, unsigned int bd, unsigned int shift";
+    specialize qw/vp9_high_ssim_parms_8x8_shift/;
+  }
+
+  # fdct functions
+  add_proto qw/void vp9_high_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_high_fht4x4/;
+
+  add_proto qw/void vp9_high_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_high_fht8x8/;
+
+  add_proto qw/void vp9_high_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_high_fht16x16/;
+
+  add_proto qw/void vp9_high_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fwht4x4/;
+
+  add_proto qw/void vp9_high_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct4x4/;
+
+  add_proto qw/void vp9_high_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct8x8_1/;
+
+  add_proto qw/void vp9_high_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct8x8/;
+
+  add_proto qw/void vp9_high_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct16x16_1/;
+
+  add_proto qw/void vp9_high_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct16x16/;
+
+  add_proto qw/void vp9_high_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct32x32_1/;
+
+  add_proto qw/void vp9_high_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct32x32/;
+
+  add_proto qw/void vp9_high_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct32x32_rd/;
+
+  add_proto qw/void vp9_high_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+  specialize qw/vp9_high_temporal_filter_apply/;
+
+}
+# End vp9_high encoder functions
+
 }
 # end encoder functions
 1;
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 1b4904c..b6847b9 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -139,25 +139,25 @@
                            filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
   } \
 }
-#if HAVE_AVX2
+#if HAVE_AVX2 && HAVE_SSSE3
 filter8_1dfunction vp9_filter_block1d16_v8_avx2;
 filter8_1dfunction vp9_filter_block1d16_h8_avx2;
 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-#if (ARCH_X86_64)
+#if ARCH_X86_64
 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
-#else
+#else  // ARCH_X86
 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
-#endif
+#endif  // ARCH_X86_64 / ARCH_X86
 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
@@ -190,9 +190,9 @@
 //                          const int16_t *filter_y, int y_step_q4,
 //                          int w, int h);
 FUN_CONV_2D(, avx2);
-#endif
+#endif  // HAVE_AVX2 && HAVE_SSSE3
 #if HAVE_SSSE3
-#if (ARCH_X86_64)
+#if ARCH_X86_64
 filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
@@ -204,14 +204,14 @@
 #define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
 #define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
 #define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
-#else
+#else  // ARCH_X86
 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#endif
+#endif  // ARCH_X86_64 / ARCH_X86
 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
@@ -270,7 +270,7 @@
 //                              int w, int h);
 FUN_CONV_2D(, ssse3);
 FUN_CONV_2D(avg_ , ssse3);
-#endif
+#endif  // HAVE_SSSE3
 
 #if HAVE_SSE2
 filter8_1dfunction vp9_filter_block1d16_v8_sse2;
@@ -336,4 +336,4 @@
 //                             int w, int h);
 FUN_CONV_2D(, sse2);
 FUN_CONV_2D(avg_ , sse2);
-#endif
+#endif  // HAVE_SSE2
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index b60f8a0..df60987 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -3573,6 +3573,7 @@
                                  int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
+  const __m128i zero = _mm_setzero_si128();
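+  // Shared by the all-zero coefficient test and the second 1-D pass below.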
 
   // idct constants for each stage
   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
@@ -3635,7 +3636,6 @@
           stp2_30, stp2_31;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i, j, i32;
-  int zero_flag[2];
 
   for (i = 0; i < 4; i++) {
     i32 = (i << 5);
@@ -3710,13 +3710,7 @@
       zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
       zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
 
-      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
-      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
-      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
-      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
-      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
-
-      if (!zero_flag[0] && !zero_flag[1]) {
+      if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
         col[i32 + 0] = _mm_setzero_si128();
         col[i32 + 1] = _mm_setzero_si128();
         col[i32 + 2] = _mm_setzero_si128();
@@ -3795,7 +3789,6 @@
       col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
     }
   for (i = 0; i < 4; i++) {
-      const __m128i zero = _mm_setzero_si128();
       // Second 1-D idct
       j = i << 3;
 
diff --git a/vp9/common/x86/vp9_postproc_mmx.asm b/vp9/common/x86/vp9_postproc_mmx.asm
deleted file mode 100644
index 5b8deef..0000000
--- a/vp9/common/x86/vp9_postproc_mmx.asm
+++ /dev/null
@@ -1,533 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-;void vp9_post_proc_down_and_across_mmx
-;(
-;    unsigned char *src_ptr,
-;    unsigned char *dst_ptr,
-;    int src_pixels_per_line,
-;    int dst_pixels_per_line,
-;    int rows,
-;    int cols,
-;    int flimit
-;)
-global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
-sym(vp9_post_proc_down_and_across_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    ; move the global rd onto the stack, since we don't have enough registers
-    ; to do PIC addressing
-    movq        mm0, [GLOBAL(rd)]
-    sub         rsp, 8
-    movq        [rsp], mm0
-%define RD [rsp]
-%else
-%define RD [GLOBAL(rd)]
-%endif
-
-        push        rbx
-        lea         rbx, [GLOBAL(Blur)]
-        movd        mm2, dword ptr arg(6) ;flimit
-        punpcklwd   mm2, mm2
-        punpckldq   mm2, mm2
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;dst_ptr
-
-        movsxd      rcx, DWORD PTR arg(4) ;rows
-        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-.nextrow:
-
-        xor         rdx,        rdx       ; clear out rdx for use as loop counter
-.nextcol:
-
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
-        movq        mm3, [rsi]            ; mm4 = r0 p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
-        movq        mm5, [rsi + rax]      ; mm4 = r1 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= p0..p3 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = r0 p0..p3
-        psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
-        movq        mm5, [rsi + 2*rax]    ; mm5 = r2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r0 p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        neg         rax
-        movq        mm6, [rbx ]           ; kernel 0 taps
-        movq        mm5, [rsi+2*rax]      ; mm5 = r-2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]       ; kernel 1 taps
-        movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
-        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
-        pmullw      mm6, mm4              ; mm6 *= kernel 1 modifiers.
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - r-1 p0..p3
-        psubusw     mm4, mm1              ; mm4 = r-1 p0..p3 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-
-        movd        [rdi], mm1            ;
-        neg         rax                   ; pitch is positive
-
-
-        add         rsi, 4
-        add         rdi, 4
-        add         rdx, 4
-
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .nextcol
-        ; done with all the cols, start the across filtering in place
-        sub         rsi, rdx
-        sub         rdi, rdx
-
-
-        push        rax
-        xor         rdx,    rdx
-        mov         rax,    [rdi-4];
-
-.acrossnextcol:
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ;
-        movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
-        movq        mm3, mm4              ; mm3 = p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]
-        psrlq       mm4, 8                ; mm4 = p1..p7
-        movq        mm5, mm4              ; mm5 = p1..p7
-        punpcklbw   mm5, mm0              ; mm5 = p1..p4
-        pmullw      mm6, mm5              ; mm6 *= p1..p4 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = p0..p3
-        psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]
-        psrlq       mm4, 8                ; mm4 = p2..p7
-        movq        mm5, mm4              ; mm5 = p2..p7
-        punpcklbw   mm5, mm0              ; mm5 = p2..p5
-        pmullw      mm6, mm5              ; mm6 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p2..p5
-        psubusw     mm5, mm1              ; mm5 = p2..p5 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p2..p5)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        movq        mm6, [rbx ]
-        movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
-        movq        mm5, mm4              ; mm5 = p-2..p5
-        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
-        pmullw      mm6, mm5              ; mm6 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p-2..p1
-        psubusw     mm5, mm1              ; mm5 = p-2..p1 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p-2..p1)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]
-        psrlq       mm4, 8                ; mm4 = p-1..p5
-        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
-        pmullw      mm6, mm4              ; mm6 *= kernel 1 modifiers.
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - p-1..p2
-        psubusw     mm4, mm1              ; mm4 = p-1..p2 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p-1..p2)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
-        movd        eax,    mm1
-
-        add         rdx, 4
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .acrossnextcol;
-
-        mov         DWORD PTR [rdi+rdx-4],  eax
-        pop         rax
-
-        ; done with this row
-        add         rsi,rax               ; next line
-        movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch
-        add         rdi,rax               ; next destination
-        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line ; source pitch
-
-        dec         rcx                   ; decrement count
-        jnz         .nextrow               ; next row
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef RD
-
-
-;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
-;                             int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_mmx) PRIVATE
-sym(vp9_mbpost_proc_down_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 136
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp+128], eax
-    mov         [rsp+128+4], eax
-%define flimit2 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea         r8,       [GLOBAL(sym(vp9_rv))]
-%endif
-
-    ;rows +=8;
-    add         dword ptr arg(2), 8
-
-    ;for(c=0; c<cols; c+=4)
-.loop_col:
-            mov         rsi,        arg(0)  ;s
-            pxor        mm0,        mm0     ;
-
-            movsxd      rax,        dword ptr arg(1) ;pitch       ;
-            neg         rax                                     ; rax = -pitch
-
-            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
-            neg         rax
-
-
-            pxor        mm5,        mm5
-            pxor        mm6,        mm6     ;
-
-            pxor        mm7,        mm7     ;
-            mov         rdi,        rsi
-
-            mov         rcx,        15          ;
-
-.loop_initvar:
-            movd        mm1,        DWORD PTR [rdi];
-            punpcklbw   mm1,        mm0     ;
-
-            paddw       mm5,        mm1     ;
-            pmullw      mm1,        mm1     ;
-
-            movq        mm2,        mm1     ;
-            punpcklwd   mm1,        mm0     ;
-
-            punpckhwd   mm2,        mm0     ;
-            paddd       mm6,        mm1     ;
-
-            paddd       mm7,        mm2     ;
-            lea         rdi,        [rdi+rax]   ;
-
-            dec         rcx
-            jne         .loop_initvar
-            ;save the var and sum
-            xor         rdx,        rdx
-.loop_row:
-            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
-            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
-
-            punpcklbw   mm1,        mm0
-            punpcklbw   mm2,        mm0
-
-            paddw       mm5,        mm2
-            psubw       mm5,        mm1
-
-            pmullw      mm2,        mm2
-            movq        mm4,        mm2
-
-            punpcklwd   mm2,        mm0
-            punpckhwd   mm4,        mm0
-
-            paddd       mm6,        mm2
-            paddd       mm7,        mm4
-
-            pmullw      mm1,        mm1
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm0
-            psubd       mm6,        mm1
-
-            punpckhwd   mm2,        mm0
-            psubd       mm7,        mm2
-
-
-            movq        mm3,        mm6
-            pslld       mm3,        4
-
-            psubd       mm3,        mm6
-            movq        mm1,        mm5
-
-            movq        mm4,        mm5
-            pmullw      mm1,        mm1
-
-            pmulhw      mm4,        mm4
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm4
-            punpckhwd   mm2,        mm4
-
-            movq        mm4,        mm7
-            pslld       mm4,        4
-
-            psubd       mm4,        mm7
-
-            psubd       mm3,        mm1
-            psubd       mm4,        mm2
-
-            psubd       mm3,        flimit2
-            psubd       mm4,        flimit2
-
-            psrad       mm3,        31
-            psrad       mm4,        31
-
-            packssdw    mm3,        mm4
-            packsswb    mm3,        mm0
-
-            movd        mm1,        DWORD PTR [rsi+rax*8]
-
-            movq        mm2,        mm1
-            punpcklbw   mm1,        mm0
-
-            paddw       mm1,        mm5
-            mov         rcx,        rdx
-
-            and         rcx,        127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-            push        rax
-            lea         rax,        [GLOBAL(sym(vp9_rv))]
-            movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
-            pop         rax
-%elif ABI_IS_32BIT=0
-            movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
-            movq        mm4,        [sym(vp9_rv) + rcx*2]
-%endif
-            paddw       mm1,        mm4
-            ;paddw     xmm1,       eight8s
-            psraw       mm1,        4
-
-            packuswb    mm1,        mm0
-            pand        mm1,        mm3
-
-            pandn       mm3,        mm2
-            por         mm1,        mm3
-
-            and         rcx,        15
-            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
-
-            mov         rcx,        rdx
-            sub         rcx,        8
-
-            and         rcx,        15
-            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
-
-            movd        [rsi],      mm1
-            lea         rsi,        [rsi+rax]
-
-            lea         rdi,        [rdi+rax]
-            add         rdx,        1
-
-            cmp         edx,        dword arg(2) ;rows
-            jl          .loop_row
-
-
-        add         dword arg(0), 4 ; s += 4
-        sub         dword arg(3), 4 ; cols -= 4
-        cmp         dword arg(3), 0
-        jg          .loop_col
-
-    add         rsp, 136
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit2
-
-
-;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp9_plane_add_noise_mmx) PRIVATE
-sym(vp9_plane_add_noise_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movq        mm1,[rsi+rax]         ; get the source
-
-            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     mm1, [rdx+32] ;bothclamp
-            psubusb     mm1, [rdx+16] ;whiteclamp
-
-            movq        mm2,[rdi+rax]         ; get the noise for this line
-            paddb       mm1,mm2              ; add it in
-            movq        [rsi+rax],mm1         ; store the result
-
-            add         rax,8                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-Blur:
-    times 16 dw 16
-    times  8 dw 64
-    times 16 dw 16
-    times  8 dw  0
-
-rd:
-    times 4 dw 0x40
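
For reference, the per-pixel rule the deleted down-and-across kernel implemented can be sketched in scalar C. This is an illustration, not a drop-in replacement; the 16/16/64/16/16 taps and the +64 rounding come from the Blur and rd tables above (VP9_FILTER_WEIGHT = 128, VP9_FILTER_SHIFT = 7):

    #include <stdlib.h>  /* abs() */

    /* p[0..4] are five adjacent pixels along the filter direction,
     * p[2] the one being filtered. Any neighbour differing from the
     * centre by more than flimit vetoes the blur for this pixel. */
    static unsigned char blur_or_keep(const unsigned char p[5], int flimit) {
      static const int taps[5] = { 16, 16, 64, 16, 16 };  /* sum = 128 */
      int i, sum = 0;
      for (i = 0; i < 5; ++i) {
        if (abs(p[i] - p[2]) > flimit) return p[2];  /* keep source */
        sum += taps[i] * p[i];
      }
      return (unsigned char)((sum + 64) >> 7);  /* round, then / 128 */
    }

The MMX code computed the blur and the threshold masks for four pixels at a time and blended with pand/pandn instead of branching.
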
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
index d109e13..3bc7d39 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
@@ -307,7 +307,7 @@
   __m256i addFilterReg64;
   __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
   __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
-  __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
+  __m256i srcReg32b11, srcReg32b12, filtersReg32;
   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
   unsigned int i;
   unsigned int src_stride, dst_stride;
@@ -409,35 +409,35 @@
      // multiply 2 adjacent elements with the filter and add the result
      srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
      srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
-     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
 
      // add and saturate the results together
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);
-
 
      // multiply 2 adjacent elements with the filter and add the result
      srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
-     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
-
-     // multiply 2 adjacent elements with the filter and add the result
      srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-     srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
 
      // add and saturate the results together
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
-     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                  _mm256_min_epi16(srcReg32b6, srcReg32b13));
-
-     // add and saturate the results together
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                    _mm256_max_epi16(srcReg32b8, srcReg32b12));
-     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                  _mm256_max_epi16(srcReg32b6, srcReg32b13));
 
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+     // add and saturate the results together
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_max_epi16(srcReg32b8, srcReg32b12));
 
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
      srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
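
The reordering above also keeps the saturation-aware accumulation pattern: the two middle partial sums are folded in as min() first, then max(). A sketch with hypothetical operands a..d (helper name is illustrative):

    #include <immintrin.h>  /* AVX2 */

    /* Sum four partial filter products with saturating 16-bit adds.
     * Adding min(b, c) before max(b, c) makes an intermediate overflow
     * less likely, so the saturated total tracks a wide (32-bit)
     * reference sum more closely. */
    static __m256i sum4_sat_epi16(__m256i a, __m256i b, __m256i c,
                                  __m256i d) {
      __m256i acc = _mm256_adds_epi16(a, d);
      acc = _mm256_adds_epi16(acc, _mm256_min_epi16(b, c));
      return _mm256_adds_epi16(acc, _mm256_max_epi16(b, c));
    }

Grouping all of srcReg32b1's work after srcReg32b10's also frees srcReg32b13 entirely, which is why its declaration is dropped above.
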
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 0797168..7615cdd 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -195,7 +195,7 @@
   struct macroblockd_plane *const pd = &xd->plane[plane];
   if (eob > 0) {
     TX_TYPE tx_type = DCT_DCT;
-    int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+    tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
     if (xd->lossless) {
       tx_type = DCT_DCT;
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
@@ -330,6 +330,9 @@
   if (!vp9_is_valid_scale(&ref_buffer->sf))
     vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                        "Invalid scale factors");
+  if (ref_buffer->buf->corrupted)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Block reference is corrupt");
   vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
                        &ref_buffer->sf);
   xd->corrupted |= ref_buffer->buf->corrupted;
@@ -627,16 +630,17 @@
                        "Width and height beyond allowed size.");
 #endif
   if (cm->width != width || cm->height != height) {
-    const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
-    const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+    const int new_mi_rows =
+        ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+    const int new_mi_cols =
+        ALIGN_POWER_OF_TWO(width,  MI_SIZE_LOG2) >> MI_SIZE_LOG2;
 
-    // Change in frame size (assumption: color format does not change).
-    if (cm->width == 0 || cm->height == 0 ||
-        aligned_width > cm->width ||
-        aligned_width * aligned_height > cm->width * cm->height) {
+    // Allocations in vp9_alloc_context_buffers() depend on individual
+    // dimensions as well as the overall size.
+    if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
       if (vp9_alloc_context_buffers(cm, width, height))
         vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                           "Failed to allocate frame buffers");
+                           "Failed to allocate context buffers");
     } else {
       vp9_set_mb_mi(cm, width, height);
     }
@@ -654,12 +658,25 @@
 
   if (vp9_realloc_frame_buffer(
           get_frame_new_buffer(cm), cm->width, cm->height,
-          cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
+          cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+          cm->use_highbitdepth,
+#endif
+          VP9_DEC_BORDER_IN_PIXELS,
           &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
           cm->cb_priv)) {
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
+  cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+}
+
+static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
+                                          int ref_xss, int ref_yss,
+                                          vpx_bit_depth_t this_bit_depth,
+                                          int this_xss, int this_yss) {
+  return ref_bit_depth == this_bit_depth && ref_xss == this_xss &&
+         ref_yss == this_yss;
 }
 
 static void setup_frame_size_with_refs(VP9_COMMON *cm,
@@ -672,6 +689,10 @@
       YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
       width = buf->y_crop_width;
       height = buf->y_crop_height;
+      if (buf->corrupted) {
+        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Frame reference is corrupt");
+      }
       found = 1;
       break;
     }
@@ -695,18 +716,35 @@
   if (!has_valid_ref_frame)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Referenced frame has invalid size");
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    RefBuffer *const ref_frame = &cm->frame_refs[i];
+    if (!valid_ref_frame_img_fmt(
+            ref_frame->buf->bit_depth,
+            ref_frame->buf->uv_crop_width < ref_frame->buf->y_crop_width,
+            ref_frame->buf->uv_crop_height < ref_frame->buf->y_crop_height,
+            cm->bit_depth,
+            cm->subsampling_x,
+            cm->subsampling_y))
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Referenced frame has incompatible color space");
+  }
 
   resize_context_buffers(cm, width, height);
   setup_display_size(cm, rb);
 
   if (vp9_realloc_frame_buffer(
           get_frame_new_buffer(cm), cm->width, cm->height,
-          cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
+          cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+          cm->use_highbitdepth,
+#endif
+          VP9_DEC_BORDER_IN_PIXELS,
           &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
           cm->cb_priv)) {
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
+  cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
 }
 
 static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -814,6 +852,8 @@
 
   if (cm->lf.filter_level) {
     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+    // Be sure to sync as we might be resuming after a failed frame decode.
+    winterface->sync(&pbi->lf_worker);
     lf_data->frame_buffer = get_frame_new_buffer(cm);
     lf_data->cm = cm;
     vp9_copy(lf_data->planes, pbi->mb.plane);
@@ -883,7 +923,7 @@
         pbi->mb.corrupted |= tile_data->xd.corrupted;
       }
       // Loopfilter one row.
-      if (cm->lf.filter_level) {
+      if (cm->lf.filter_level && !pbi->mb.corrupted) {
         const int lf_start = mi_row - MI_BLOCK_SIZE;
         LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
 
@@ -906,7 +946,7 @@
   }
 
   // Loopfilter remaining rows in the frame.
-  if (cm->lf.filter_level) {
+  if (cm->lf.filter_level && !pbi->mb.corrupted) {
     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
     winterface->sync(&pbi->lf_worker);
     lf_data->start = lf_data->stop;
@@ -920,9 +960,8 @@
   return vp9_reader_find_end(&tile_data->bit_reader);
 }
 
-static int tile_worker_hook(void *arg1, void *arg2) {
-  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
-  const TileInfo *const tile = (TileInfo*)arg2;
+static int tile_worker_hook(TileWorkerData *const tile_data,
+                            const TileInfo *const tile) {
   int mi_row, mi_col;
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
@@ -995,6 +1034,7 @@
 
   // Reset tile decoding hook
   for (n = 0; n < num_workers; ++n) {
+    winterface->sync(&pbi->tile_workers[n]);
     pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
   }
 
@@ -1098,7 +1138,7 @@
 static void read_bitdepth_colorspace_sampling(
     VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
   if (cm->profile >= PROFILE_2)
-    cm->bit_depth = vp9_rb_read_bit(rb) ? BITS_12 : BITS_10;
+    cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
   cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
   if (cm->color_space != SRGB) {
     vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
@@ -1142,6 +1182,7 @@
                          "Invalid frame marker");
 
   cm->profile = vp9_read_profile(rb);
+
   if (cm->profile >= MAX_PROFILES)
     vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                        "Unsupported bitstream profile");
@@ -1181,6 +1222,7 @@
     }
 
     setup_frame_size(cm, rb);
+    pbi->need_resync = 0;
   } else {
     cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
 
@@ -1204,6 +1246,7 @@
 
       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
       setup_frame_size(cm, rb);
+      pbi->need_resync = 0;
     } else {
       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
       for (i = 0; i < REFS_PER_FRAME; ++i) {
@@ -1232,6 +1275,12 @@
     }
   }
 
+  if (pbi->need_resync) {
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Keyframe / intra-only frame required to reset decoder"
+                       " state");
+  }
+
   if (!cm->error_resilient_mode) {
     cm->refresh_frame_context = vp9_rb_read_bit(rb);
     cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
@@ -1400,7 +1449,7 @@
 
   if (!first_partition_size) {
     // showing a frame directly
-    *p_data_end = data + 1;
+    *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
     return;
   }
 
@@ -1431,9 +1480,11 @@
   if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
       cm->frame_parallel_decoding_mode) {
     *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
-    // If multiple threads are used to decode tiles, then we use those threads
-    // to do parallel loopfiltering.
-    vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+    if (!xd->corrupted) {
+      // If multiple threads are used to decode tiles, then we use those threads
+      // to do parallel loopfiltering.
+      vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+    }
   } else {
     *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
   }
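
The reallocation test above now compares mi-unit (8-pixel) dimensions instead of aligned pixel products. A small sketch of the arithmetic, with the macro bodies reproduced for illustration (MI_SIZE_LOG2 is 3 in this tree):

    #define ALIGN_POWER_OF_TWO(value, n) \
      (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
    #define MI_SIZE_LOG2 3

    static int mi_units(int pixels) {  /* illustrative helper */
      return ALIGN_POWER_OF_TWO(pixels, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
    }
    /* mi_units(1920) == 240, mi_units(1080) == 135. Growing either
     * dimension past the allocated mi grid triggers
     * vp9_alloc_context_buffers(); shrinking only needs
     * vp9_set_mb_mi(). */
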
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 2a2f0f5..6ee3d70 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -25,6 +25,7 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/decoder/vp9_decodeframe.h"
@@ -36,7 +37,9 @@
   static int init_done = 0;
 
   if (!init_done) {
+    vp9_rtcd();
     vp9_init_neighbors();
+    vp9_init_intra_predictors();
     init_done = 1;
   }
 }
@@ -57,15 +60,15 @@
   }
 
   cm->error.setjmp = 1;
+  pbi->need_resync = 1;
   initialize_dec();
 
-  vp9_rtcd();
-
   // Initialize the references to not point to any frame buffers.
   vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
 
   cm->current_video_frame = 0;
   pbi->ready_for_new_data = 1;
+  cm->bit_depth = VPX_BITS_8;
 
   // vp9_init_dequantizer() is first called here. Add check in
   // frame_init_dequantizer() to avoid unnecessary calling of
@@ -96,10 +99,8 @@
   }
   vpx_free(pbi->tile_workers);
 
-  if (pbi->num_tile_workers) {
-    const int sb_rows =
-        mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
-    vp9_loop_filter_dealloc(&pbi->lf_row_sync, sb_rows);
+  if (pbi->num_tile_workers > 0) {
+    vp9_loop_filter_dealloc(&pbi->lf_row_sync);
   }
 
   vp9_remove_common(cm);
@@ -123,8 +124,12 @@
    * later commit that adds VP9-specific controls for this functionality.
    */
   if (ref_frame_flag == VP9_LAST_FLAG) {
-    const YV12_BUFFER_CONFIG *const cfg =
-        &cm->frame_bufs[cm->ref_frame_map[0]].buf;
+    const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, 0);
+    if (cfg == NULL) {
+      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                         "No 'last' reference frame");
+      return VPX_CODEC_ERROR;
+    }
     if (!equal_dimensions(cfg, sd))
       vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                          "Incorrect buffer dimensions");
@@ -234,6 +239,7 @@
   cm->new_fb_idx = get_free_fb(cm);
 
   if (setjmp(cm->error.jmp)) {
+    pbi->need_resync = 1;
     cm->error.setjmp = 0;
     vp9_clear_system_state();
 
@@ -310,3 +316,67 @@
   vp9_clear_system_state();
   return ret;
 }
+
+vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
+                                           size_t data_sz,
+                                           uint32_t sizes[8], int *count,
+                                           vpx_decrypt_cb decrypt_cb,
+                                           void *decrypt_state) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a superframe index. If the last byte of real video compression
+  // data is 0xc0, the encoder must add a 0 byte. If we have the marker but
+  // not the associated matching marker byte at the front of the index, we
+  // have an invalid bitstream and need to return an error.
+
+  uint8_t marker;
+
+  assert(data_sz);
+  marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
+  *count = 0;
+
+  if ((marker & 0xe0) == 0xc0) {
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    const size_t index_sz = 2 + mag * frames;
+
+    // This chunk is marked as having a superframe index but doesn't have
+    // enough data for it, thus it's an invalid superframe index.
+    if (data_sz < index_sz)
+      return VPX_CODEC_CORRUPT_FRAME;
+
+    {
+      const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
+                                          data + data_sz - index_sz);
+
+      // This chunk is marked as having a superframe index but doesn't have
+      // the matching marker byte at the front of the index therefore it's an
+      // invalid chunk.
+      if (marker != marker2)
+        return VPX_CODEC_CORRUPT_FRAME;
+    }
+
+    {
+      // Found a valid superframe index.
+      uint32_t i, j;
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+
+      // frames has a maximum of 8 and mag has a maximum of 4.
+      uint8_t clear_buffer[32];
+      assert(sizeof(clear_buffer) >= frames * mag);
+      if (decrypt_cb) {
+        decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
+        x = clear_buffer;
+      }
+
+      for (i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        for (j = 0; j < mag; ++j)
+          this_sz |= (*x++) << (j * 8);
+        sizes[i] = this_sz;
+      }
+      *count = frames;
+    }
+  }
+  return VPX_CODEC_OK;
+}
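
A worked example of the trailing index layout vp9_parse_superframe_index() expects, with hypothetical frame sizes: for frames = 2 and mag = 1 the marker byte is 0xc0 | ((mag - 1) << 3) | (frames - 1) = 0xc1, and the index occupies 2 + mag * frames = 4 bytes, bracketed by the marker at both ends:

    /* ...compressed frame bytes... | 0xc1 0x2a 0x13 0xc1 */
    static const uint8_t superframe_index[4] = { 0xc1, 0x2a, 0x13, 0xc1 };
    /* Parsing a chunk ending in these bytes yields *count = 2,
     * sizes[0] = 0x2a and sizes[1] = 0x13; each size is little-endian,
     * mag bytes wide. */
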
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 223b66f..4f52bb9 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -58,6 +58,7 @@
 
   int max_threads;
   int inv_tile_order;
+  int need_resync;  // wait for key/intra-only frame
 } VP9Decoder;
 
 int vp9_receive_compressed_data(struct VP9Decoder *pbi,
@@ -78,6 +79,25 @@
 
 void vp9_decoder_remove(struct VP9Decoder *pbi);
 
+static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
+                                  void *decrypt_state,
+                                  const uint8_t *data) {
+  if (decrypt_cb) {
+    uint8_t marker;
+    decrypt_cb(decrypt_state, data, &marker, 1);
+    return marker;
+  }
+  return *data;
+}
+
+// This function is exposed for use in tests, as is the inlined function
+// "read_marker" above.
+vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
+                                           size_t data_sz,
+                                           uint32_t sizes[8], int *count,
+                                           vpx_decrypt_cb decrypt_cb,
+                                           void *decrypt_state);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
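
read_marker() routes the byte through the application's decryptor when one is registered. A toy callback matching the vpx_decrypt_cb shape, assuming a one-byte XOR keystream purely for illustration:

    /* Decrypt `count` bytes from input to output using opaque state. */
    static void xor_decrypt(void *decrypt_state, const unsigned char *input,
                            unsigned char *output, int count) {
      const unsigned char key = *(const unsigned char *)decrypt_state;
      int i;
      for (i = 0; i < count; ++i)
        output[i] = input[i] ^ key;
    }

    /* read_marker(xor_decrypt, &key, data + data_sz - 1) then returns
     * the decrypted marker byte instead of the raw one. */
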
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 91cdf38..76ca1ae 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -51,7 +51,7 @@
   } while (0)
 
 static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
-                       int16_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
+                       tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
                        int ctx, const int16_t *scan, const int16_t *nb,
                        vp9_reader *r) {
   const int max_eob = 16 << (tx_size << 1);
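
The int16_t -> tran_low_t switch here matches the rest of this change: coefficient storage widens only when high bit depth is compiled in. For reference, the typedef pair this relies on (assumed from vp9/common/vp9_idct.h in this tree):

    #if CONFIG_VP9_HIGHBITDEPTH
    typedef int32_t tran_low_t;   /* coefficients as stored in memory */
    typedef int64_t tran_high_t;  /* intermediate transform arithmetic */
    #else
    typedef int16_t tran_low_t;   /* 8-bit builds keep the old layout */
    typedef int32_t tran_high_t;
    #endif
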
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index 5dda49a..6635880 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -121,10 +121,10 @@
 }
 
 // Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(void *arg1, void *arg2) {
-  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
+static int loop_filter_row_worker(TileWorkerData *const tile_data,
+                                  void *unused) {
   LFWorkerData *const lf_data = &tile_data->lfdata;
-  (void) arg2;
+  (void)unused;
   loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                       lf_data->start, lf_data->stop, lf_data->y_only,
                       lf_data->lf_sync, lf_data->num_lf_workers);
@@ -145,24 +145,13 @@
   const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
   int i;
 
-  // Allocate memory used in thread synchronization.
-  // This always needs to be done even if frame_filter_level is 0.
-  if (!cm->current_video_frame || cm->last_height != cm->height) {
-    if (cm->last_height != cm->height) {
-      const int aligned_last_height =
-          ALIGN_POWER_OF_TWO(cm->last_height, MI_SIZE_LOG2);
-      const int last_sb_rows =
-          mi_cols_aligned_to_sb(aligned_last_height >> MI_SIZE_LOG2) >>
-          MI_BLOCK_SIZE_LOG2;
-
-      vp9_loop_filter_dealloc(lf_sync, last_sb_rows);
-    }
-
-    vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width);
-  }
-
   if (!frame_filter_level) return;
 
+  if (!lf_sync->sync_range || cm->last_height != cm->height) {
+    vp9_loop_filter_dealloc(lf_sync);
+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
+  }
+
   vp9_loop_filter_frame_init(cm, frame_filter_level);
 
   // Initialize cur_sb_col to -1 for all SB rows.
@@ -225,21 +214,24 @@
 }
 
 // Allocate memory for lf row synchronization
-void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows,
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
                            int width) {
+  lf_sync->rows = rows;
 #if CONFIG_MULTITHREAD
-  int i;
+  {
+    int i;
 
-  CHECK_MEM_ERROR(cm, lf_sync->mutex_,
-                  vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
-  for (i = 0; i < rows; ++i) {
-    pthread_mutex_init(&lf_sync->mutex_[i], NULL);
-  }
+    CHECK_MEM_ERROR(cm, lf_sync->mutex_,
+                    vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
+    for (i = 0; i < rows; ++i) {
+      pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+    }
 
-  CHECK_MEM_ERROR(cm, lf_sync->cond_,
-                  vpx_malloc(sizeof(*lf_sync->cond_) * rows));
-  for (i = 0; i < rows; ++i) {
-    pthread_cond_init(&lf_sync->cond_[i], NULL);
+    CHECK_MEM_ERROR(cm, lf_sync->cond_,
+                    vpx_malloc(sizeof(*lf_sync->cond_) * rows));
+    for (i = 0; i < rows; ++i) {
+      pthread_cond_init(&lf_sync->cond_[i], NULL);
+    }
   }
 #endif  // CONFIG_MULTITHREAD
 
@@ -251,23 +243,19 @@
 }
 
 // Deallocate lf synchronization related mutex and data
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) {
-#if !CONFIG_MULTITHREAD
-  (void)rows;
-#endif  // !CONFIG_MULTITHREAD
-
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
   if (lf_sync != NULL) {
 #if CONFIG_MULTITHREAD
     int i;
 
     if (lf_sync->mutex_ != NULL) {
-      for (i = 0; i < rows; ++i) {
+      for (i = 0; i < lf_sync->rows; ++i) {
         pthread_mutex_destroy(&lf_sync->mutex_[i]);
       }
       vpx_free(lf_sync->mutex_);
     }
     if (lf_sync->cond_ != NULL) {
-      for (i = 0; i < rows; ++i) {
+      for (i = 0; i < lf_sync->rows; ++i) {
         pthread_cond_destroy(&lf_sync->cond_[i]);
       }
       vpx_free(lf_sync->cond_);
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index 423bd88..b1fbdeb 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -38,14 +38,15 @@
   // The optimal sync_range for different resolutions and platforms should be
   // determined by testing. Currently, it is chosen to be a power-of-2 number.
   int sync_range;
+  int rows;
 } VP9LfSync;
 
 // Allocate memory for loopfilter row synchronization.
-void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
-                           int rows, int width);
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
+                           int width);
 
 // Deallocate loopfilter synchronization related mutex and data.
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows);
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
 
 // Multi-threaded loopfilter that uses the tile threads.
 void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
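
With the new signatures, VP9LfSync owns its row count: vp9_loop_filter_alloc() records lf_sync->rows and vp9_loop_filter_dealloc() reads it back, so callers never recompute superblock rows at teardown. The intended call pattern (a sketch of the caller-side logic, assuming a zeroed struct on first use):

    /* sync_range == 0 marks a never-allocated struct, so this is safe
     * on the first frame as well as after a resolution change. */
    if (!lf_sync->sync_range || cm->last_height != cm->height) {
      vp9_loop_filter_dealloc(lf_sync);  /* frees lf_sync->rows entries */
      vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
    }
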
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 2d5ec79..8c13d0d 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -28,7 +28,6 @@
                           int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                           int zbin_oq_value, uint16_t *eob_ptr,
                           const int16_t *scan, const int16_t *iscan) {
-  int i;
   // TODO(jingning) Decide whether these arguments are needed after the
   // quantization process is completed.
   (void)zbin_ptr;
@@ -39,7 +38,7 @@
   if (!skip_block) {
     // Quantization pass: All coefficients with index >= zero_flag are
     // skippable. Note: zero_flag can be zero.
-
+    int i;
     const int16x8_t v_zero = vdupq_n_s16(0);
     const int16x8_t v_one = vdupq_n_s16(1);
     int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
@@ -50,13 +49,12 @@
     v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
     v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
     v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
-
-    for (i = 0; i < count; i += 8) {
-      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
-      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+    // process dc and the first seven ac coeffs
+    {
+      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
       const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-      const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
-      const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round);
+      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
       const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
                                            vget_low_s16(v_quant));
       const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
@@ -65,20 +63,39 @@
                                             vshrn_n_s32(v_tmp_hi, 16));
       const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
       const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-      const int16x8_t v_nz_iscan =
-          vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1);
+      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
       const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
       const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
       const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
-
       v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-
-      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
-      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+      vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
       v_round = vmovq_n_s16(round_ptr[1]);
       v_quant = vmovq_n_s16(quant_ptr[1]);
       v_dequant = vmovq_n_s16(dequant_ptr[1]);
     }
+    // now process the rest of the ac coeffs
+    for (i = 8; i < count; i += 8) {
+      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+      const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+                                           vget_low_s16(v_quant));
+      const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+                                           vget_high_s16(v_quant));
+      const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+                                            vshrn_n_s32(v_tmp_hi, 16));
+      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+    }
     {
       const int16x4_t v_eobmax_3210 =
           vmax_s16(vget_low_s16(v_eobmax_76543210),
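
A scalar sketch of what the peeled NEON path computes, and why index 0 is special: the DC coefficient uses round/quant/dequant[0] while every AC coefficient shares [1], so peeling the first block of eight lets the main loop run with a single splatted set of AC constants. Illustration only; it ignores the skip_block and eob bookkeeping of the real function:

    #include <stdlib.h>  /* abs() */

    static void quantize_fp_sketch(const int16_t *coeff, int count,
                                   const int16_t *round,
                                   const int16_t *quant,
                                   const int16_t *dequant,
                                   int16_t *qcoeff, int16_t *dqcoeff) {
      int i;
      for (i = 0; i < count; ++i) {
        const int k = (i == 0) ? 0 : 1;  /* DC vs AC constants */
        const int abs_q = ((abs(coeff[i]) + round[k]) * quant[k]) >> 16;
        qcoeff[i] = (int16_t)(coeff[i] < 0 ? -abs_q : abs_q);
        dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[k]);
      }
    }

The vabaq_s16(v_round, v_coeff, v_zero) step above fuses the abs() and the +round into a single instruction.
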
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index b0ff0fa..b605248 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -294,6 +294,7 @@
       vp9_write_token(w, vp9_switchable_interp_tree,
                       cm->fc.switchable_interp_prob[ctx],
                       &switchable_interp_encodings[mbmi->interp_filter]);
+      ++cpi->interp_filter_selected[0][mbmi->interp_filter];
     } else {
       assert(mbmi->interp_filter == cm->interp_filter);
     }
@@ -421,11 +422,13 @@
   const int bs = (1 << bsl) / 4;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-  const MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+  const MODE_INFO *m = NULL;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
+  m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+
   partition = partition_lookup[bsl][m->mbmi.sb_type];
   write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
   subsize = get_subsize(bsize, partition);
@@ -668,8 +671,6 @@
   vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
   vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
 
-  vp9_clear_system_state();
-
   for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
     build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
                             frame_coef_probs[tx_size]);
@@ -996,8 +997,10 @@
 
     // Set "found" to 0 for temporal svc in CBR mode and for spatial svc key frames
     if (cpi->use_svc &&
-        (cpi->svc.number_spatial_layers == 1 ||
-         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame)) {
+        ((cpi->svc.number_temporal_layers > 1 &&
+         cpi->oxcf.rc_mode == VPX_CBR) ||
+        (cpi->svc.number_spatial_layers > 1 &&
+         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) {
       found = 0;
     }
     vp9_wb_write_bit(wb, found);
@@ -1043,8 +1046,8 @@
 static void write_bitdepth_colorspace_sampling(
     VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) {
   if (cm->profile >= PROFILE_2) {
-    assert(cm->bit_depth > BITS_8);
-    vp9_wb_write_bit(wb, cm->bit_depth - BITS_10);
+    assert(cm->bit_depth > VPX_BITS_8);
+    vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
   }
   vp9_wb_write_literal(wb, cm->color_space, 3);
   if (cm->color_space != SRGB) {
@@ -1081,7 +1084,16 @@
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
   } else {
-    if (!cm->show_frame)
+    // In spatial svc, if it's not error_resilient_mode, we need to code all
+    // visible frames as invisible. But we need to keep the show_frame flag so
+    // that the publisher knows whether the frame is supposed to be visible.
+    // So we code the show_frame flag as it is, then code the intra_only bit
+    // here. This makes the bitstream incompatible. In the player we will
+    // change the show_frame flag to 0, then add a one-byte frame with the
+    // show_existing_frame flag set, which tells the decoder which frame we
+    // want to show.
+    if (!cm->show_frame ||
+        (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0))
       vp9_wb_write_bit(wb, cm->intra_only);
 
     if (!cm->error_resilient_mode)
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index 8e82d1c..b488261 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -26,7 +26,7 @@
   return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
          cpi->rc.is_src_frame_alt_ref &&
          (!cpi->use_svc ||      // Add spatial svc base layer case here
-          (is_spatial_svc(cpi) &&
+          (is_two_pass_svc(cpi) &&
            cpi->svc.spatial_layer_id == 0 &&
            cpi->svc.layer_context[0].gold_ref_idx >=0 &&
            cpi->oxcf.ss_play_alternate[0]));
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index bd3b0fd..767bd7f 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -28,8 +28,8 @@
 
 struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
-  int16_t *qcoeff;
-  int16_t *coeff;
+  tran_low_t *qcoeff;
+  tran_low_t *coeff;
   uint16_t *eobs;
   struct buf_2d src;
 
@@ -76,16 +76,12 @@
   int pred_mv_sad[MAX_REF_FRAMES];
 
   int nmvjointcost[MV_JOINTS];
-  int nmvcosts[2][MV_VALS];
   int *nmvcost[2];
-  int nmvcosts_hp[2][MV_VALS];
   int *nmvcost_hp[2];
   int **mvcost;
 
   int nmvjointsadcost[MV_JOINTS];
-  int nmvsadcosts[2][MV_VALS];
   int *nmvsadcost[2];
-  int nmvsadcosts_hp[2][MV_VALS];
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
@@ -116,15 +112,19 @@
   int quant_fp;
 
   // skip forward transform and quantization
-  int skip_txfm[MAX_MB_PLANE];
+  uint8_t skip_txfm[MAX_MB_PLANE << 2];
 
-  int64_t bsse[MAX_MB_PLANE];
+  int64_t bsse[MAX_MB_PLANE << 2];
 
   // Used to store sub partition's choices.
   MV pred_mv[MAX_REF_FRAMES];
 
-  void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
-  void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
+  void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
+  void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+  void (*high_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+                        int eob, int bd);
+#endif
 };
 
 #ifdef __cplusplus
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index 9b7a932..12acc51 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -30,13 +30,13 @@
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     for (k = 0; k < 3; ++k) {
       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
-                      vpx_memalign(16, num_pix * sizeof(uint16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->eobs[i][k])));
       ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];
       ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];
       ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
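
The sizeof(*ptr) spelling adopted here is what keeps these allocations correct through the int16_t -> tran_low_t migration: the size expression tracks the pointee type automatically. Minimal illustration with a hypothetical buffer:

    tran_low_t *buf = vpx_memalign(16, num_pix * sizeof(*buf));  /* tracks type */
    /* versus the old spelling, which silently under-allocates once
     * tran_low_t becomes 32-bit:
     *     vpx_memalign(16, num_pix * sizeof(int16_t));           */
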
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index d60e6c3..97f0741 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -19,21 +19,24 @@
 typedef struct {
   MODE_INFO mic;
   uint8_t *zcoeff_blk;
-  int16_t *coeff[MAX_MB_PLANE][3];
-  int16_t *qcoeff[MAX_MB_PLANE][3];
-  int16_t *dqcoeff[MAX_MB_PLANE][3];
+  tran_low_t *coeff[MAX_MB_PLANE][3];
+  tran_low_t *qcoeff[MAX_MB_PLANE][3];
+  tran_low_t *dqcoeff[MAX_MB_PLANE][3];
   uint16_t *eobs[MAX_MB_PLANE][3];
 
   // dual buffer pointers, 0: in use, 1: best in store
-  int16_t *coeff_pbuf[MAX_MB_PLANE][3];
-  int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
-  int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *coeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
   uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
 
   int is_coded;
   int num_4x4_blk;
   int skip;
-  int skip_txfm[MAX_MB_PLANE];
+  // For the current partition, skippable is set to 1 only if all Y, U, and
+  // V transform blocks' coefficients are quantized to 0.
+  int skippable;
+  uint8_t skip_txfm[MAX_MB_PLANE << 2];
   int best_mode_index;
   int hybrid_pred_diff;
   int comp_pred_diff;
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 59222f0..eff8996 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -18,15 +18,17 @@
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-static INLINE int fdct_round_shift(int input) {
-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  assert(INT16_MIN <= rv && rv <= INT16_MAX);
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  // TODO(debargha, peter.derivaz): Find new bounds for this assert
+  // and make the bounds consts.
+  // assert(INT16_MIN <= rv && rv <= INT16_MAX);
   return rv;
 }
 
-static void fdct4(const int16_t *input, int16_t *output) {
-  int16_t step[4];
-  int temp1, temp2;
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t step[4];
+  tran_high_t temp1, temp2;
 
   step[0] = input[0] + input[3];
   step[1] = input[1] + input[2];
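
fdct_round_shift() now rounds in tran_high_t precision. For reference, the ROUND_POWER_OF_TWO it wraps (assumed as defined in vp9_common.h) rounds to nearest before shifting:

    #define ROUND_POWER_OF_TWO(value, n) \
      (((value) + (1 << ((n) - 1))) >> (n))
    /* With DCT_CONST_BITS == 14: ROUND_POWER_OF_TWO(3 << 13, 14) == 2,
     * where a plain right shift would truncate to 1. */
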
@@ -43,9 +45,9 @@
   output[3] = fdct_round_shift(temp2);
 }
 
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
   int r, c;
-  int16_t sum = 0;
+  tran_low_t sum = 0;
   for (r = 0; r < 4; ++r)
     for (c = 0; c < 4; ++c)
       sum += input[r * stride + c];
@@ -54,7 +56,7 @@
   output[1] = 0;
 }
 
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
@@ -63,22 +65,23 @@
   // in normal/row positions).
   int pass;
   // We need an intermediate buffer between passes.
-  int16_t intermediate[4 * 4];
-  const int16_t *in = input;
-  int16_t *out = intermediate;
+  tran_low_t intermediate[4 * 4];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
   // Do the two transform/transpose passes
   for (pass = 0; pass < 2; ++pass) {
-    /*canbe16*/ int input[4];
-    /*canbe16*/ int step[4];
-    /*needs32*/ int temp1, temp2;
+    tran_high_t input[4];      // canbe16
+    tran_high_t step[4];       // canbe16
+    tran_high_t temp1, temp2;  // needs32
     int i;
     for (i = 0; i < 4; ++i) {
       // Load inputs.
       if (0 == pass) {
-        input[0] = in[0 * stride] * 16;
-        input[1] = in[1 * stride] * 16;
-        input[2] = in[2 * stride] * 16;
-        input[3] = in[3 * stride] * 16;
+        input[0] = in_pass0[0 * stride] * 16;
+        input[1] = in_pass0[1 * stride] * 16;
+        input[2] = in_pass0[2 * stride] * 16;
+        input[3] = in_pass0[3 * stride] * 16;
         if (i == 0 && input[0]) {
           input[0] += 1;
         }
@@ -102,6 +105,7 @@
       out[1] = fdct_round_shift(temp1);
       out[3] = fdct_round_shift(temp2);
       // Do next column (which is a transposed row in second/horizontal pass)
+      in_pass0++;
       in++;
       out += 4;
     }
@@ -119,9 +123,9 @@
   }
 }
 
-static void fadst4(const int16_t *input, int16_t *output) {
-  int x0, x1, x2, x3;
-  int s0, s1, s2, s3, s4, s5, s6, s7;
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t x0, x1, x2, x3;
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
   x0 = input[0];
   x1 = input[1];
@@ -166,15 +170,15 @@
   { fadst4, fadst4 }   // ADST_ADST = 3
 };
 
-void vp9_fht4x4_c(const int16_t *input, int16_t *output,
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
                   int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
     vp9_fdct4x4_c(input, output, stride);
   } else {
-    int16_t out[4 * 4];
-    int16_t *outptr = &out[0];
+    tran_low_t out[4 * 4];
+    tran_low_t *outptr = &out[0];
     int i, j;
-    int16_t temp_in[4], temp_out[4];
+    tran_low_t temp_in[4], temp_out[4];
     const transform_2d ht = FHT_4[tx_type];
 
     // Columns
@@ -199,10 +203,10 @@
   }
 }
 
-static void fdct8(const int16_t *input, int16_t *output) {
-  /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-  /*needs32*/ int t0, t1, t2, t3;
-  /*canbe16*/ int x0, x1, x2, x3;
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+  tran_high_t t0, t1, t2, t3;                  // needs32
+  tran_high_t x0, x1, x2, x3;                  // canbe16
 
   // stage 1
   s0 = input[0] + input[7];
@@ -251,9 +255,9 @@
   output[7] = fdct_round_shift(t3);
 }
 
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
   int r, c;
-  int16_t sum = 0;
+  tran_low_t sum = 0;
   for (r = 0; r < 8; ++r)
     for (c = 0; c < 8; ++c)
       sum += input[r * stride + c];
@@ -262,16 +266,16 @@
   output[1] = 0;
 }
 
-void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
+void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
   int i, j;
-  int16_t intermediate[64];
+  tran_low_t intermediate[64];
 
   // Transform columns
   {
-    int16_t *output = intermediate;
-    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-    /*needs32*/ int t0, t1, t2, t3;
-    /*canbe16*/ int x0, x1, x2, x3;
+    tran_low_t *output = intermediate;
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
 
     int i;
     for (i = 0; i < 8; i++) {
@@ -333,9 +337,9 @@
   }
 }
 
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
   int r, c;
-  int16_t sum = 0;
+  tran_low_t sum = 0;
   for (r = 0; r < 16; ++r)
     for (c = 0; c < 16; ++c)
       sum += input[r * stride + c];
@@ -344,7 +348,7 @@
   output[1] = 0;
 }
 
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
@@ -353,37 +357,38 @@
   // in normal/row positions).
   int pass;
   // We need an intermediate buffer between passes.
-  int16_t intermediate[256];
-  const int16_t *in = input;
-  int16_t *out = intermediate;
+  tran_low_t intermediate[256];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
   // Do the two transform/transpose passes
   for (pass = 0; pass < 2; ++pass) {
-    /*canbe16*/ int step1[8];
-    /*canbe16*/ int step2[8];
-    /*canbe16*/ int step3[8];
-    /*canbe16*/ int input[8];
-    /*needs32*/ int temp1, temp2;
+    tran_high_t step1[8];      // canbe16
+    tran_high_t step2[8];      // canbe16
+    tran_high_t step3[8];      // canbe16
+    tran_high_t input[8];      // canbe16
+    tran_high_t temp1, temp2;  // needs32
     int i;
     for (i = 0; i < 16; i++) {
       if (0 == pass) {
         // Calculate input for the first 8 results.
-        input[0] = (in[0 * stride] + in[15 * stride]) * 4;
-        input[1] = (in[1 * stride] + in[14 * stride]) * 4;
-        input[2] = (in[2 * stride] + in[13 * stride]) * 4;
-        input[3] = (in[3 * stride] + in[12 * stride]) * 4;
-        input[4] = (in[4 * stride] + in[11 * stride]) * 4;
-        input[5] = (in[5 * stride] + in[10 * stride]) * 4;
-        input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
-        input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
+        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
+        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
+        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
+        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
+        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
+        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
+        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
+        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
         // Calculate input for the next 8 results.
-        step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
-        step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
-        step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
-        step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
-        step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
-        step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
-        step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
-        step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
+        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
+        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
+        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
+        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
+        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
+        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
+        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
+        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
       } else {
         // Calculate input for the first 8 results.
         input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
@@ -406,9 +411,9 @@
       }
       // Work on the first eight values; fdct8(input, even_results);
       {
-        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-        /*needs32*/ int t0, t1, t2, t3;
-        /*canbe16*/ int x0, x1, x2, x3;
+        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+        tran_high_t t0, t1, t2, t3;                  // needs32
+        tran_high_t x0, x1, x2, x3;                  // canbe16
 
         // stage 1
         s0 = input[0] + input[7];
@@ -514,6 +519,7 @@
       }
       // Do next column (which is a transposed row in second/horizontal pass)
       in++;
+      in_pass0++;
       out += 16;
     }
     // Setup in/out for next pass.
@@ -522,17 +528,17 @@
   }
 }
 
-static void fadst8(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
-  int x0 = input[7];
-  int x1 = input[0];
-  int x2 = input[5];
-  int x3 = input[2];
-  int x4 = input[3];
-  int x5 = input[4];
-  int x6 = input[1];
-  int x7 = input[6];
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
 
   // stage 1
   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
@@ -600,15 +606,15 @@
   { fadst8, fadst8 }   // ADST_ADST = 3
 };
 
-void vp9_fht8x8_c(const int16_t *input, int16_t *output,
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
                   int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
     vp9_fdct8x8_c(input, output, stride);
   } else {
-    int16_t out[64];
-    int16_t *outptr = &out[0];
+    tran_low_t out[64];
+    tran_low_t *outptr = &out[0];
     int i, j;
-    int16_t temp_in[8], temp_out[8];
+    tran_low_t temp_in[8], temp_out[8];
     const transform_2d ht = FHT_8[tx_type];
 
     // Columns
@@ -633,17 +639,18 @@
 
 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
    pixel. */
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
   int i;
-  int a1, b1, c1, d1, e1;
-  const int16_t *ip = input;
-  int16_t *op = output;
+  tran_high_t a1, b1, c1, d1, e1;
+  const int16_t *ip_pass0 = input;
+  const tran_low_t *ip = NULL;
+  tran_low_t *op = output;
 
   for (i = 0; i < 4; i++) {
-    a1 = ip[0 * stride];
-    b1 = ip[1 * stride];
-    c1 = ip[2 * stride];
-    d1 = ip[3 * stride];
+    a1 = ip_pass0[0 * stride];
+    b1 = ip_pass0[1 * stride];
+    c1 = ip_pass0[2 * stride];
+    d1 = ip_pass0[3 * stride];
 
     a1 += b1;
     d1 = d1 - c1;
@@ -657,7 +664,7 @@
     op[8] = d1;
     op[12] = b1;
 
-    ip++;
+    ip_pass0++;
     op++;
   }
   ip = output;
@@ -687,12 +694,12 @@
 }
 
 // Rewritten to use the same algorithm as the others.
-static void fdct16(const int16_t in[16], int16_t out[16]) {
-  /*canbe16*/ int step1[8];
-  /*canbe16*/ int step2[8];
-  /*canbe16*/ int step3[8];
-  /*canbe16*/ int input[8];
-  /*needs32*/ int temp1, temp2;
+static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+  tran_high_t step1[8];      // canbe16
+  tran_high_t step2[8];      // canbe16
+  tran_high_t step3[8];      // canbe16
+  tran_high_t input[8];      // canbe16
+  tran_high_t temp1, temp2;  // needs32
 
   // step 1
   input[0] = in[0] + in[15];
@@ -715,9 +722,9 @@
 
   // fdct8(step, step);
   {
-    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-    /*needs32*/ int t0, t1, t2, t3;
-    /*canbe16*/ int x0, x1, x2, x3;
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
 
     // stage 1
     s0 = input[0] + input[7];
@@ -828,25 +835,26 @@
   out[15] = fdct_round_shift(temp2);
 }
 
-static void fadst16(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
 
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
 
   // stage 1
   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
@@ -997,15 +1005,15 @@
   { fadst16, fadst16 }   // ADST_ADST = 3
 };
 
-void vp9_fht16x16_c(const int16_t *input, int16_t *output,
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
                     int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
     vp9_fdct16x16_c(input, output, stride);
   } else {
-    int16_t out[256];
-    int16_t *outptr = &out[0];
+    tran_low_t out[256];
+    tran_low_t *outptr = &out[0];
     int i, j;
-    int16_t temp_in[16], temp_out[16];
+    tran_low_t temp_in[16], temp_out[16];
     const transform_2d ht = FHT_16[tx_type];
 
     // Columns
@@ -1028,19 +1036,21 @@
   }
 }
 
-static INLINE int dct_32_round(int input) {
-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  assert(-131072 <= rv && rv <= 131071);
+static INLINE tran_high_t dct_32_round(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  // TODO(debargha, peter.derivaz): Find new bounds for this assert,
+  // and make the bounds consts.
+  // assert(-131072 <= rv && rv <= 131071);
   return rv;
 }
 
-static INLINE int half_round_shift(int input) {
-  int rv = (input + 1 + (input < 0)) >> 2;
+static INLINE tran_high_t half_round_shift(tran_high_t input) {
+  tran_high_t rv = (input + 1 + (input < 0)) >> 2;
   return rv;
 }
 
-static void fdct32(const int *input, int *output, int round) {
-  int step[32];
+static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+  tran_high_t step[32];
   // Stage 1
   step[0] = input[0] + input[(32 - 1)];
   step[1] = input[1] + input[(32 - 2)];
@@ -1362,9 +1372,9 @@
   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
 }
 
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
   int r, c;
-  int16_t sum = 0;
+  tran_low_t sum = 0;
   for (r = 0; r < 32; ++r)
     for (c = 0; c < 32; ++c)
       sum += input[r * stride + c];
@@ -1373,13 +1383,13 @@
   output[1] = 0;
 }
 
-void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
   int i, j;
-  int output[32 * 32];
+  tran_high_t output[32 * 32];
 
   // Columns
   for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
+    tran_high_t temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
     fdct32(temp_in, temp_out, 0);
@@ -1389,7 +1399,7 @@
 
   // Rows
   for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
+    tran_high_t temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
     fdct32(temp_in, temp_out, 0);
@@ -1401,13 +1411,13 @@
 // Note that although we use dct_32_round in the dct32 computation flow,
 // this 2D fdct32x32 for the rate-distortion optimization loop operates
 // within 16-bit precision.
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
   int i, j;
-  int output[32 * 32];
+  tran_high_t output[32 * 32];
 
   // Columns
   for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
+    tran_high_t temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
     fdct32(temp_in, temp_out, 0);
@@ -1420,7 +1430,7 @@
 
   // Rows
   for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
+    tran_high_t temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
     fdct32(temp_in, temp_out, 1);
@@ -1428,3 +1438,61 @@
       out[j + i * 32] = temp_out[j];
   }
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  vp9_fdct4x4_c(input, output, stride);
+}
+
+void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
+                       int stride, int tx_type) {
+  vp9_fht4x4_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+                          int stride) {
+  vp9_fdct8x8_1_c(input, final_output, stride);
+}
+
+void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+                        int stride) {
+  vp9_fdct8x8_c(input, final_output, stride);
+}
+
+void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+                            int stride) {
+  vp9_fdct16x16_1_c(input, output, stride);
+}
+
+void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  vp9_fdct16x16_c(input, output, stride);
+}
+
+void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
+                       int stride, int tx_type) {
+  vp9_fht8x8_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  vp9_fwht4x4_c(input, output, stride);
+}
+
+void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
+                         int stride, int tx_type) {
+  vp9_fht16x16_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
+  vp9_fdct32x32_1_c(input, out, stride);
+}
+
+void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+  vp9_fdct32x32_c(input, out, stride);
+}
+
+void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+                             int stride) {
+  vp9_fdct32x32_rd_c(input, out, stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
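
The vp9_dct.c change above follows one pattern throughout: coefficient storage moves from int16_t to tran_low_t, stage intermediates move from int to tran_high_t, and pass 0 keeps a separate int16_t pointer (in_pass0/ip_pass0) because the residual input stays 16-bit. A minimal sketch of what the code relies on, assuming typedefs along these lines (the real definitions live in the common headers, outside this patch, so the exact widths here are illustrative, not authoritative):

#include <stdint.h>

#if CONFIG_VP9_HIGHBITDEPTH
typedef int64_t tran_high_t;  /* headroom for cospi-constant products */
typedef int32_t tran_low_t;   /* coefficient storage between passes */
#else
typedef int32_t tran_high_t;
typedef int16_t tran_low_t;
#endif

/* Rounding applied after each butterfly stage: (x + 2^(n-1)) >> n,
 * assuming DCT_CONST_BITS is 14 as elsewhere in libvpx. */
#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static tran_low_t fdct_round_shift(tran_high_t input) {
  return (tran_low_t)ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

The vp9_high_* wrappers at the end of the file simply forward to the 8-bit C paths for now; the widened types are what make that forwarding safe.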
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 90ea9cc..75b9449 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -78,7 +78,8 @@
                                              int mc_avg_stride,
                                              uint8_t *avg, int avg_stride,
                                              int increase_denoising,
-                                             BLOCK_SIZE bs) {
+                                             BLOCK_SIZE bs,
+                                             int motion_magnitude) {
   int r, c;
   const uint8_t *sig_start = sig;
   const uint8_t *mc_avg_start = mc_avg;
@@ -86,6 +87,19 @@
   int diff, adj, absdiff, delta;
   int adj_val[] = {3, 4, 6};
   int total_adj = 0;
+  int shift_inc = 1;
+
+  // If motion_magnitude is small, make the denoiser more aggressive by
+  // increasing the adjustment for each level. Add another increment for
+  // blocks that are labeled for increased denoising.
+  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+    if (increase_denoising) {
+      shift_inc = 2;
+    }
+    adj_val[0] += shift_inc;
+    adj_val[1] += shift_inc;
+    adj_val[2] += shift_inc;
+  }
 
   // First attempt to apply a strong temporal denoising filter.
   for (r = 0; r < heights[bs]; ++r) {
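
In isolation, the gate added above amounts to the following (select_adjustments is a hypothetical helper name; the real code inlines this at the top of denoiser_filter, and MOTION_MAGNITUDE_THRESHOLD is the (8 * 3) constant defined in vp9_denoiser.h further down):

/* Base filter strengths per |diff| band are {3, 4, 6}; low-motion blocks
 * get a uniform boost, one notch larger when increased denoising is on. */
static void select_adjustments(int motion_magnitude, int increase_denoising,
                               int adj_val[3]) {
  adj_val[0] = 3;
  adj_val[1] = 4;
  adj_val[2] = 6;
  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
    const int shift_inc = increase_denoising ? 2 : 1;
    adj_val[0] += shift_inc;
    adj_val[1] += shift_inc;
    adj_val[2] += shift_inc;
  }
}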
@@ -130,7 +144,8 @@
   // Otherwise, we try to dampen the filter if the delta is not too high.
   delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
            >> 8) + 1;
-  if (delta > delta_thresh(bs, increase_denoising)) {
+
+  if (delta >= delta_thresh(bs, increase_denoising)) {
     return COPY_BLOCK;
   }
 
@@ -145,11 +160,17 @@
         adj = delta;
       }
       if (diff > 0) {
+        // A positive diff means we made a positive adjustment in the
+        // first attempt, so now make a negative adjustment to bring the
+        // denoised signal down.
         avg[c] = MAX(0, avg[c] - adj);
-        total_adj += adj;
-      } else {
-        avg[c] = MIN(UINT8_MAX, avg[c] + adj);
         total_adj -= adj;
+      } else {
+        // A negative diff means we made a negative adjustment in the
+        // first attempt, so now make a positive adjustment to bring the
+        // denoised signal up.
+        avg[c] = MIN(UINT8_MAX, avg[c] + adj);
+        total_adj += adj;
       }
     }
     sig += sig_stride;
@@ -185,7 +206,8 @@
                                                          int increase_denoising,
                                                          int mi_row,
                                                          int mi_col,
-                                                         PICK_MODE_CONTEXT *ctx
+                                                         PICK_MODE_CONTEXT *ctx,
+                                                         int *motion_magnitude
                                                          ) {
   int mv_col, mv_row;
   int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
@@ -210,6 +232,8 @@
   mv_col = ctx->best_sse_mv.as_mv.col;
   mv_row = ctx->best_sse_mv.as_mv.row;
 
+  *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+
   frame = ctx->best_reference_frame;
 
   // If the best reference frame uses inter-prediction and there is enough of a
@@ -297,6 +321,7 @@
 void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
                           PICK_MODE_CONTEXT *ctx) {
+  int motion_magnitude = 0;
   VP9_DENOISER_DECISION decision = FILTER_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
   YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -307,13 +332,14 @@
 
   decision = perform_motion_compensation(denoiser, mb, bs,
                                          denoiser->increase_denoising,
-                                         mi_row, mi_col, ctx);
+                                         mi_row, mi_col, ctx,
+                                         &motion_magnitude);
 
   if (decision == FILTER_BLOCK) {
     decision = denoiser_filter(src.buf, src.stride,
                                mc_avg_start, mc_avg.y_stride,
                                avg_start, avg.y_stride,
-                               0, bs);
+                               0, bs, motion_magnitude);
   }
 
   if (decision == FILTER_BLOCK) {
@@ -370,8 +396,8 @@
   ctx->newmv_sse = UINT_MAX;
 }
 
-void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
-                                     unsigned int sse, PREDICTION_MODE mode,
+void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
+                                     PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx) {
   // TODO(tkopp): Use both MVs if possible
   if (mbmi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
@@ -388,13 +414,21 @@
 }
 
 int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
-                       int ssx, int ssy, int border) {
+                       int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       int border) {
   int i, fail;
   assert(denoiser != NULL);
 
   for (i = 0; i < MAX_REF_FRAMES; ++i) {
     fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
-                                  ssx, ssy, border);
+                                  ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                  use_highbitdepth,
+#endif
+                                  border);
     if (fail) {
       vp9_denoiser_free(denoiser);
       return 1;
@@ -405,7 +439,11 @@
   }
 
   fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
-                                ssx, ssy, border);
+                                ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                use_highbitdepth,
+#endif
+                                border);
   if (fail) {
     vp9_denoiser_free(denoiser);
     return 1;
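
For reference, the magnitude that perform_motion_compensation now reports is the squared length of best_sse_mv, in whatever units as_mv stores (no normalization is applied). A sketch with a worked value:

/* (row, col) = (3, 4) gives 25, just above MOTION_MAGNITUDE_THRESHOLD
 * (8 * 3 = 24), so such a block does not get the aggressive adjustment. */
static int motion_magnitude_sq(int mv_row, int mv_col) {
  return mv_row * mv_row + mv_col * mv_col;
}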
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index d93846f..fa714b1 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -18,6 +18,8 @@
 extern "C" {
 #endif
 
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+
 typedef enum vp9_denoiser_decision {
   COPY_BLOCK,
   FILTER_BLOCK
@@ -42,12 +44,16 @@
 
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
 
-void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
+void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi,
                                      unsigned int sse, PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx);
 
 int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
-                       int ssx, int ssy, int border);
+                       int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       int border);
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser);
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index b20b662..c62b52f 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -539,10 +539,9 @@
     }
     // Else for cyclic refresh mode update the segment map, set the segment id
     // and then update the quantizer.
-    else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
       vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi,
                                         mi_row, mi_col, bsize, 1);
-      vp9_init_plane_quantizers(cpi, x);
     }
   }
 
@@ -728,6 +727,7 @@
     p[i].eobs = ctx->eobs_pbuf[i][0];
   }
   ctx->is_coded = 0;
+  ctx->skippable = 0;
   x->skip_recode = 0;
 
   // Set to zero to make sure we do not use the previous encoded frame stats
@@ -1233,30 +1233,23 @@
   }
 }
 
-static int is_background(VP9_COMP *cpi, const TileInfo *const tile,
+static int is_background(const VP9_COMP *cpi, const TileInfo *const tile,
                          int mi_row, int mi_col) {
-  MACROBLOCK *x = &cpi->mb;
-  uint8_t *src, *pre;
-  int src_stride, pre_stride;
-
+  // This assumes the input source frames are of the same dimensions.
   const int row8x8_remaining = tile->mi_row_end - mi_row;
   const int col8x8_remaining = tile->mi_col_end - mi_col;
-
+  const int x = mi_col * MI_SIZE;
+  const int y = mi_row * MI_SIZE;
+  const int src_stride = cpi->Source->y_stride;
+  const uint8_t *const src = &cpi->Source->y_buffer[y * src_stride + x];
+  const int pre_stride = cpi->Last_Source->y_stride;
+  const uint8_t *const pre = &cpi->Last_Source->y_buffer[y * pre_stride + x];
   int this_sad = 0;
   int threshold = 0;
 
-  // This assumes the input source frames are of the same dimension.
-  src_stride = cpi->Source->y_stride;
-  src = cpi->Source->y_buffer + (mi_row * MI_SIZE) * src_stride +
-            (mi_col * MI_SIZE);
-  pre_stride = cpi->Last_Source->y_stride;
-  pre = cpi->Last_Source->y_buffer + (mi_row * MI_SIZE) * pre_stride +
-          (mi_col * MI_SIZE);
-
   if (row8x8_remaining >= MI_BLOCK_SIZE &&
       col8x8_remaining >= MI_BLOCK_SIZE) {
-    this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride,
-                                            pre, pre_stride);
+    this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, pre, pre_stride);
     threshold = (1 << 12);
   } else {
     int r, c;
@@ -1267,8 +1260,7 @@
     threshold = (row8x8_remaining * col8x8_remaining) << 6;
   }
 
-  x->in_static_area = (this_sad < 2 * threshold);
-  return x->in_static_area;
+  return this_sad < 2 * threshold;
 }
 
 static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8,
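
The rewritten is_background() is now a pure predicate; the caller stores the result into x->in_static_area itself (see the REFERENCE_PARTITION hunk later in this file). A hedged sketch of just the thresholding, with the MI_BLOCK_SIZE completeness test shown above passed in as a flag:

/* A complete 64x64 superblock uses a fixed threshold of 1 << 12 = 4096, so
 * it counts as background when the SAD against the last source is below
 * 8192; partial superblocks scale the threshold with the 8x8 block count. */
static int is_background_sketch(int this_sad, int row8x8_remaining,
                                int col8x8_remaining, int complete_sb64) {
  const int threshold = complete_sb64
                            ? (1 << 12)
                            : (row8x8_remaining * col8x8_remaining) << 6;
  return this_sad < 2 * threshold;
}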
@@ -1724,7 +1716,8 @@
 // function so repeat calls can accumulate a min and max of more than one sb64.
 static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
                                         BLOCK_SIZE *min_block_size,
-                                        BLOCK_SIZE *max_block_size ) {
+                                        BLOCK_SIZE *max_block_size,
+                                        int bs_hist[BLOCK_SIZES]) {
   int sb_width_in_blocks = MI_BLOCK_SIZE;
   int sb_height_in_blocks  = MI_BLOCK_SIZE;
   int i, j;
@@ -1735,6 +1728,7 @@
     for (j = 0; j < sb_width_in_blocks; ++j) {
       MODE_INFO * mi = mi_8x8[index+j];
       BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
+      bs_hist[sb_type]++;
       *min_block_size = MIN(*min_block_size, sb_type);
       *max_block_size = MAX(*max_block_size, sb_type);
     }
@@ -1767,6 +1761,9 @@
   int bh, bw;
   BLOCK_SIZE min_size = BLOCK_4X4;
   BLOCK_SIZE max_size = BLOCK_64X64;
+  int i = 0;
+  int bs_hist[BLOCK_SIZES] = {0};
+
   // Trap case where we do not have a prediction.
   if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
     // Default "min to max" and "max to min"
@@ -1779,22 +1776,51 @@
     if (cm->frame_type != KEY_FRAME) {
       MODE_INFO **const prev_mi =
           &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
-      get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size);
+      get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist);
     }
     // Find the min and max partition sizes used in the left SB64
     if (left_in_image) {
       MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
-      get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size);
+      get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size,
+                                  bs_hist);
     }
     // Find the min and max partition sizes used in the above SB64.
     if (above_in_image) {
       MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
-      get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size);
+      get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size,
+                                  bs_hist);
     }
+
     // adjust observed min and max
     if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
       min_size = min_partition_size[min_size];
       max_size = max_partition_size[max_size];
+    } else if (cpi->sf.auto_min_max_partition_size ==
+               CONSTRAIN_NEIGHBORING_MIN_MAX) {
+      // Adjust the search range based on the histogram of partition sizes
+      // observed in the left, above, and previous co-located blocks.
+      int sum = 0;
+      int first_moment = 0;
+      int second_moment = 0;
+      int var_unnormalized = 0;
+
+      for (i = 0; i < BLOCK_SIZES; i++) {
+        sum += bs_hist[i];
+        first_moment += bs_hist[i] * i;
+        second_moment += bs_hist[i] * i * i;
+      }
+
+      // If the variance is small enough, center the range on the mean
+      // size, which gives a tighter search range.
+      var_unnormalized = second_moment - first_moment * first_moment / sum;
+      if (var_unnormalized <= 4 * sum) {
+        int mean = first_moment / sum;
+        min_size = min_partition_size[mean];
+        max_size = max_partition_size[mean];
+      } else {
+        min_size = min_partition_size[min_size];
+        max_size = max_partition_size[max_size];
+      }
     }
   }
 
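The CONSTRAIN_NEIGHBORING_MIN_MAX branch above is a one-pass mean/variance test over the block-size histogram. Restated as a standalone sketch (constrain_range is a hypothetical name; it leans on the min_partition_size/max_partition_size tables already used in this function, and sum is nonzero whenever a histogram was gathered):

static void constrain_range(const int bs_hist[BLOCK_SIZES],
                            BLOCK_SIZE *min_size, BLOCK_SIZE *max_size) {
  int i, sum = 0, first_moment = 0, second_moment = 0;
  for (i = 0; i < BLOCK_SIZES; i++) {
    sum += bs_hist[i];
    first_moment += bs_hist[i] * i;
    second_moment += bs_hist[i] * i * i;
  }
  /* Unnormalized variance: sum(i^2 h) - (sum(i h))^2 / sum(h). If the
   * observed sizes cluster tightly, center the range on the mean size. */
  if (second_moment - first_moment * first_moment / sum <= 4 * sum) {
    const int mean = first_moment / sum;
    *min_size = min_partition_size[mean];
    *max_size = max_partition_size[mean];
  } else {
    *min_size = min_partition_size[*min_size];
    *max_size = max_partition_size[*max_size];
  }
}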
@@ -1811,6 +1837,7 @@
       next_square_size[max_size] < min_size) {
      min_size = next_square_size[max_size];
   }
+
   *min_block_size = min_size;
   *max_block_size = max_size;
 }
@@ -2061,17 +2088,6 @@
 
   save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
-  if (cpi->sf.disable_split_var_thresh && partition_none_allowed) {
-    unsigned int source_variancey;
-    vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
-    source_variancey = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-    if (source_variancey < cpi->sf.disable_split_var_thresh) {
-      do_split = 0;
-      if (source_variancey < cpi->sf.disable_split_var_thresh / 2)
-        do_rect = 0;
-    }
-  }
-
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
     set_offsets(cpi, tile, mi_row, mi_col, bsize);
@@ -2143,8 +2159,8 @@
       sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
 
       if (sum_rd < best_rd) {
-        int64_t stop_thresh = 4096;
-        int64_t stop_thresh_rd;
+        int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr;
+        int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
 
         best_rate = this_rate;
         best_dist = this_dist;
@@ -2152,14 +2168,18 @@
         if (bsize >= BLOCK_8X8)
           pc_tree->partitioning = PARTITION_NONE;
 
-        // Adjust threshold according to partition size.
-        stop_thresh >>= 8 - (b_width_log2(bsize) +
+        // Adjust dist breakout threshold according to the partition size.
+        dist_breakout_thr >>= 8 - (b_width_log2(bsize) +
             b_height_log2(bsize));
 
-        stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
-        // If obtained distortion is very small, choose current partition
-        // and stop splitting.
-        if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
+        // If all y, u, v transform blocks in this partition are skippable,
+        // and the dist & rate are within the thresholds, the partition
+        // search is terminated for the current branch of the partition
+        // search tree. The dist & rate thresholds are set to 0 at speed 0
+        // to disable the early termination at that speed.
+        if (!x->e_mbd.lossless &&
+            (ctx->skippable && best_dist < dist_breakout_thr &&
+            best_rate < rate_breakout_thr)) {
           do_split = 0;
           do_rect = 0;
         }
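
The shift above scales the distortion breakout threshold with partition area: b_width_log2()/b_height_log2() count 4x4 units, so BLOCK_64X64 gives 8 - (4 + 4) = 0 (the full threshold) while BLOCK_8X8 gives 8 - (1 + 1) = 6, i.e. 1/64 of it. A minimal sketch of the combined check under those assumptions; with the speed-0 thresholds of 0, the strict '<' comparisons can never hold, which is what disables early termination there:

#include <stdint.h>

static int partition_search_breakout(int64_t dist_thr, int rate_thr,
                                     int b_wlog2, int b_hlog2, int skippable,
                                     int64_t best_dist, int best_rate,
                                     int lossless) {
  /* Scale the distortion threshold down for smaller partitions. */
  dist_thr >>= 8 - (b_wlog2 + b_hlog2);
  return !lossless && skippable &&
         best_dist < dist_thr && best_rate < rate_thr;
}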
@@ -2446,6 +2466,9 @@
     vp9_zero(cpi->mb.pred_mv);
     cpi->pc_root->index = 0;
 
+    // TODO(yunqingwang): use_lastframe_partitioning is no longer used in
+    // good-quality encoding. It still needs to be evaluated in real-time
+    // encoding to decide whether it can be removed there too, after which
+    // the code can be cleaned up.
     if ((sf->partition_search_type == SEARCH_PARTITION &&
          sf->use_lastframe_partitioning) ||
          sf->partition_search_type == FIXED_PARTITION ||
@@ -2480,7 +2503,7 @@
           if (gf_grp->update_type[gf_grp->index - 1] == OVERLAY_UPDATE)
             last_was_mid_sequence_overlay = 1;
         }
-        if ((cm->current_video_frame
+        if ((cpi->rc.frames_since_key
             % sf->last_partitioning_redo_frequency) == 0
             || last_was_mid_sequence_overlay
             || cm->prev_mi == 0
@@ -2581,35 +2604,15 @@
 }
 
 static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
-  if (cpi->mb.e_mbd.lossless) {
+  if (cpi->mb.e_mbd.lossless)
     return ONLY_4X4;
-  } else if (cpi->common.current_video_frame == 0) {
+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
+    return ALLOW_32X32;
+  else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
+           cpi->sf.tx_size_search_method == USE_TX_8X8)
     return TX_MODE_SELECT;
-  } else {
-    if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
-      return ALLOW_32X32;
-    } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
-      const RD_OPT *const rd_opt = &cpi->rd;
-      const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
-      return rd_opt->tx_select_threshes[frame_type][ALLOW_32X32] >
-                 rd_opt->tx_select_threshes[frame_type][TX_MODE_SELECT] ?
-                     ALLOW_32X32 : TX_MODE_SELECT;
-    } else if (cpi->sf.tx_size_search_method == USE_TX_8X8) {
-      return TX_MODE_SELECT;
-    } else {
-      unsigned int total = 0;
-      int i;
-      for (i = 0; i < TX_SIZES; ++i)
-        total += cpi->tx_stepdown_count[i];
-
-      if (total) {
-        const double fraction = (double)cpi->tx_stepdown_count[0] / total;
-        return fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT;
-      } else {
-        return cpi->common.tx_mode;
-      }
-    }
-  }
+  else
+    return cpi->common.tx_mode;
 }
 
 static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
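
After the cleanup, select_tx_mode() depends only on lossless mode and the transform-size search method; the tx_stepdown_count statistics it used to consult are gone (their vp9_zero() reset is deleted at the end of this file's diff). Restated as a sketch, assuming the usual TX_SIZE_SEARCH_METHOD enum from the speed features:

static TX_MODE select_tx_mode_sketch(int lossless,
                                     TX_SIZE_SEARCH_METHOD method,
                                     TX_MODE current_tx_mode) {
  if (lossless) return ONLY_4X4;
  if (method == USE_LARGESTALL) return ALLOW_32X32;
  if (method == USE_FULL_RD || method == USE_TX_8X8) return TX_MODE_SELECT;
  return current_tx_mode;  /* any other method keeps the current mode */
}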
@@ -3114,7 +3117,7 @@
         break;
       case REFERENCE_PARTITION:
         if (sf->partition_check ||
-            !is_background(cpi, tile, mi_row, mi_col)) {
+            !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) {
           set_modeinfo_offsets(cm, xd, mi_row, mi_col);
           auto_partition_range(cpi, tile, mi_row, mi_col,
                                &sf->min_partition_size,
@@ -3292,7 +3295,6 @@
 
   vp9_zero(cm->counts);
   vp9_zero(cpi->coef_counts);
-  vp9_zero(cpi->tx_stepdown_count);
   vp9_zero(rd_opt->comp_pred_diff);
   vp9_zero(rd_opt->filter_diff);
   vp9_zero(rd_opt->tx_select_diff);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 8a737e1..794e6d0 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -107,9 +107,9 @@
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
   uint8_t token_cache[1024];
-  const int16_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
   const int default_eob = 16 << (tx_size << 1);
@@ -294,22 +294,33 @@
 }
 
 static INLINE void fdct32x32(int rd_transform,
-                             const int16_t *src, int16_t *dst, int src_stride) {
+                             const int16_t *src, tran_low_t *dst,
+                             int src_stride) {
   if (rd_transform)
     vp9_fdct32x32_rd(src, dst, src_stride);
   else
     vp9_fdct32x32(src, dst, src_stride);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void high_fdct32x32(int rd_transform, const int16_t *src,
+                                  tran_low_t *dst, int src_stride) {
+  if (rd_transform)
+    vp9_high_fdct32x32_rd(src, dst, src_stride);
+  else
+    vp9_high_fdct32x32(src, dst, src_stride);
+}
+#endif
+
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   int i, j;
@@ -357,9 +368,9 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   int i, j;
@@ -405,9 +416,9 @@
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   int i, j;
@@ -458,7 +469,7 @@
   struct optimize_ctx *const ctx = args->ctx;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   int i, j;
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
@@ -476,20 +487,24 @@
   }
 
   if (!x->skip_recode) {
-    if (x->skip_txfm[plane] == 0) {
-      // full forward transform and quantization
-      if (x->quant_fp)
-        vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
-      else
-        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
-    } else if (x->skip_txfm[plane] == 2) {
-      // fast path forward transform and quantization
-      vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+    if (max_txsize_lookup[plane_bsize] == tx_size) {
+      if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
+        // full forward transform and quantization
+        if (x->quant_fp)
+          vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+        else
+          vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+      } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
+        // fast path forward transform and quantization
+        vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+      } else {
+        // skip forward transform
+        p->eobs[block] = 0;
+        *a = *l = 0;
+        return;
+      }
     } else {
-      // skip forward transform
-      p->eobs[block] = 0;
-      *a = *l = 0;
-      return;
+      vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
     }
   }
 
@@ -534,7 +549,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   int i, j;
   uint8_t *dst;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
@@ -583,9 +598,9 @@
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const scan_order *scan_order;
   TX_TYPE tx_type;
   PREDICTION_MODE mode;
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 9ad6db0..9d42a12 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -216,7 +216,7 @@
 
   // If auto_mv_step_size is enabled then keep track of the largest
   // motion vector component used.
-  if (!cpi->dummy_packing && cpi->sf.mv.auto_mv_step_size) {
+  if (cpi->sf.mv.auto_mv_step_size) {
     unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3;
     cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude);
   }
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 524744c..b3884d0 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -24,6 +24,7 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 
@@ -128,11 +129,13 @@
   }
 
   if (cm->frame_type == KEY_FRAME) {
-    if (!is_spatial_svc(cpi))
+    if (!is_two_pass_svc(cpi))
       cpi->refresh_golden_frame = 1;
     cpi->refresh_alt_ref_frame = 1;
+    vp9_zero(cpi->interp_filter_selected);
   } else {
     cm->fc = cm->frame_contexts[cm->frame_context_idx];
+    vp9_zero(cpi->interp_filter_selected[0]);
   }
 }
 
@@ -140,7 +143,9 @@
   static int init_done = 0;
 
   if (!init_done) {
+    vp9_rtcd();
     vp9_init_neighbors();
+    vp9_init_intra_predictors();
     vp9_coef_tree_initialize();
     vp9_tokenize_initialize();
     vp9_init_me_luts();
@@ -167,6 +172,26 @@
   vpx_free(cpi->complexity_map);
   cpi->complexity_map = NULL;
 
+  vpx_free(cpi->nmvcosts[0]);
+  vpx_free(cpi->nmvcosts[1]);
+  cpi->nmvcosts[0] = NULL;
+  cpi->nmvcosts[1] = NULL;
+
+  vpx_free(cpi->nmvcosts_hp[0]);
+  vpx_free(cpi->nmvcosts_hp[1]);
+  cpi->nmvcosts_hp[0] = NULL;
+  cpi->nmvcosts_hp[1] = NULL;
+
+  vpx_free(cpi->nmvsadcosts[0]);
+  vpx_free(cpi->nmvsadcosts[1]);
+  cpi->nmvsadcosts[0] = NULL;
+  cpi->nmvsadcosts[1] = NULL;
+
+  vpx_free(cpi->nmvsadcosts_hp[0]);
+  vpx_free(cpi->nmvsadcosts_hp[1]);
+  cpi->nmvsadcosts_hp[0] = NULL;
+  cpi->nmvsadcosts_hp[1] = NULL;
+
   vp9_cyclic_refresh_free(cpi->cyclic_refresh);
   cpi->cyclic_refresh = NULL;
 
@@ -212,8 +237,15 @@
   // intended for use in a re-code loop in vp9_compress_frame where the
   // quantizer value is adjusted between loop iterations.
   vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
-  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
-  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
+
+  vpx_memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
+             MV_VALS * sizeof(*cpi->nmvcosts[0]));
+  vpx_memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
+             MV_VALS * sizeof(*cpi->nmvcosts[1]));
+  vpx_memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
+             MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
+  vpx_memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
+             MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
 
   vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
 
@@ -233,8 +265,15 @@
   // Restore key state variables to the snapshot state stored in the
   // previous call to vp9_save_coding_context.
   vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
-  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
-  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
+
+  vpx_memcpy(cpi->nmvcosts[0], cc->nmvcosts[0],
+             MV_VALS * sizeof(*cc->nmvcosts[0]));
+  vpx_memcpy(cpi->nmvcosts[1], cc->nmvcosts[1],
+             MV_VALS * sizeof(*cc->nmvcosts[1]));
+  vpx_memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
+             MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
+  vpx_memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
+             MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
 
   vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
 
@@ -386,27 +425,15 @@
   }
 }
 
-
-static void set_speed_features(VP9_COMP *cpi) {
-#if CONFIG_INTERNAL_STATS
-  int i;
-  for (i = 0; i < MAX_MODES; ++i)
-    cpi->mode_chosen_counts[i] = 0;
-#endif
-
-  vp9_set_speed_features(cpi);
-
-  // Set rd thresholds based on mode and speed setting
-  vp9_set_rd_speed_thresholds(cpi);
-  vp9_set_rd_speed_thresholds_sub8x8(cpi);
-}
-
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
 
   cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
                                       cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                      cm->use_highbitdepth,
+#endif
                                       oxcf->lag_in_frames);
   if (!cpi->lookahead)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -415,6 +442,9 @@
   if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
                                oxcf->width, oxcf->height,
                                cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
                                VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
@@ -432,6 +462,9 @@
   if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
                                VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
@@ -439,6 +472,9 @@
   if (vp9_realloc_frame_buffer(&cpi->scaled_source,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
                                VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
@@ -446,6 +482,9 @@
   if (vp9_realloc_frame_buffer(&cpi->scaled_last_source,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
                                VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
@@ -474,10 +513,13 @@
   vp9_init_context_buffers(cm);
   init_macroblockd(cm, xd);
 
-  if (is_spatial_svc(cpi)) {
+  if (is_two_pass_svc(cpi)) {
     if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
                                  cm->width, cm->height,
                                  cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cm->use_highbitdepth,
+#endif
                                  VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
       vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                          "Failed to reallocate alt_ref_buffer");
@@ -485,18 +527,10 @@
 }
 
 void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
-  cpi->oxcf.framerate = framerate < 0.1 ? 30 : framerate;
+  cpi->framerate = framerate < 0.1 ? 30 : framerate;
   vp9_rc_update_framerate(cpi);
 }
 
-int64_t vp9_rescale(int64_t val, int64_t num, int denom) {
-  int64_t llnum = num;
-  int64_t llden = denom;
-  int64_t llval = val;
-
-  return (llval * llnum / llden);
-}
-
 static void set_tile_limits(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
 
@@ -518,9 +552,13 @@
   VP9_COMMON *const cm = &cpi->common;
 
   cpi->oxcf = *oxcf;
+  cpi->framerate = oxcf->init_framerate;
 
   cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cm->use_highbitdepth = oxcf->use_highbitdepth;
+#endif
   cm->color_space = UNKNOWN;
 
   cm->width = oxcf->width;
@@ -532,10 +570,10 @@
   // Temporal scalability.
   cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
 
-  if ((cpi->svc.number_temporal_layers > 1 &&
-      cpi->oxcf.rc_mode == VPX_CBR) ||
-      (cpi->svc.number_spatial_layers > 1 &&
-      cpi->oxcf.mode == TWO_PASS_SECOND_BEST)) {
+  if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+      ((cpi->svc.number_temporal_layers > 1 ||
+        cpi->svc.number_spatial_layers > 1) &&
+       cpi->oxcf.pass == 2)) {
     vp9_init_layer_context(cpi);
   }
 
@@ -550,6 +588,20 @@
   set_tile_limits(cpi);
 }
 
+static void set_rc_buffer_sizes(RATE_CONTROL *rc,
+                                const VP9EncoderConfig *oxcf) {
+  const int64_t bandwidth = oxcf->target_bandwidth;
+  const int64_t starting = oxcf->starting_buffer_level_ms;
+  const int64_t optimal = oxcf->optimal_buffer_level_ms;
+  const int64_t maximum = oxcf->maximum_buffer_size_ms;
+
+  rc->starting_buffer_level = starting * bandwidth / 1000;
+  rc->optimal_buffer_level = (optimal == 0) ? bandwidth / 8
+                                            : optimal * bandwidth / 1000;
+  rc->maximum_buffer_size = (maximum == 0) ? bandwidth / 8
+                                           : maximum * bandwidth / 1000;
+}
+
 void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
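
set_rc_buffer_sizes() above folds the removed vp9_rescale() calls into direct 64-bit arithmetic: the buffer levels are configured in milliseconds and converted to bits at the target bandwidth, with bandwidth / 8 (an eighth of a second of data) as the default when a level is left at 0. A worked check, assuming target_bandwidth is in bits per second:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int64_t bandwidth = 800000;  /* assumed units: bits per second */
  const int64_t starting_ms = 4000;
  const int64_t optimal_ms = 0;
  const int64_t starting = starting_ms * bandwidth / 1000;  /* 3200000 bits */
  const int64_t optimal = optimal_ms == 0
                              ? bandwidth / 8  /* 100000 bits */
                              : optimal_ms * bandwidth / 1000;
  printf("starting=%lld optimal=%lld\n", (long long)starting,
         (long long)optimal);
  return 0;
}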
@@ -559,11 +611,16 @@
   cm->bit_depth = oxcf->bit_depth;
 
   if (cm->profile <= PROFILE_1)
-    assert(cm->bit_depth == BITS_8);
+    assert(cm->bit_depth == VPX_BITS_8);
   else
-    assert(cm->bit_depth > BITS_8);
+    assert(cm->bit_depth > VPX_BITS_8);
 
   cpi->oxcf = *oxcf;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cpi->oxcf.use_highbitdepth) {
+    cpi->mb.e_mbd.bd = (int)cm->bit_depth;
+  }
+#endif
 
   rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
 
@@ -583,35 +640,15 @@
   }
   cpi->encode_breakout = cpi->oxcf.encode_breakout;
 
-  // local file playback mode == really big buffer
-  if (cpi->oxcf.rc_mode == VPX_VBR) {
-    cpi->oxcf.starting_buffer_level_ms = 60000;
-    cpi->oxcf.optimal_buffer_level_ms = 60000;
-    cpi->oxcf.maximum_buffer_size_ms = 240000;
-  }
+  set_rc_buffer_sizes(rc, &cpi->oxcf);
 
-  rc->starting_buffer_level = vp9_rescale(cpi->oxcf.starting_buffer_level_ms,
-                                          cpi->oxcf.target_bandwidth, 1000);
-
-  // Set or reset optimal and maximum buffer levels.
-  if (cpi->oxcf.optimal_buffer_level_ms == 0)
-    rc->optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
-  else
-    rc->optimal_buffer_level = vp9_rescale(cpi->oxcf.optimal_buffer_level_ms,
-                                           cpi->oxcf.target_bandwidth, 1000);
-
-  if (cpi->oxcf.maximum_buffer_size_ms == 0)
-    rc->maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
-  else
-    rc->maximum_buffer_size = vp9_rescale(cpi->oxcf.maximum_buffer_size_ms,
-                                          cpi->oxcf.target_bandwidth, 1000);
   // Under a configuration change, where maximum_buffer_size may change,
   // keep buffer level clipped to the maximum allowed buffer size.
   rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
   rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size);
 
   // Set up frame rate and related parameters rate control values.
-  vp9_new_framerate(cpi, cpi->oxcf.framerate);
+  vp9_new_framerate(cpi, cpi->framerate);
 
   // Set absolute upper and lower quality limits
   rc->worst_quality = cpi->oxcf.worst_allowed_q;
@@ -633,7 +670,9 @@
 
   if ((cpi->svc.number_temporal_layers > 1 &&
       cpi->oxcf.rc_mode == VPX_CBR) ||
-      (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) {
+      ((cpi->svc.number_temporal_layers > 1 ||
+        cpi->svc.number_spatial_layers > 1) &&
+       cpi->oxcf.pass == 2)) {
     vp9_update_layer_context_change_config(cpi,
                                            (int)cpi->oxcf.target_bandwidth);
   }
@@ -656,6 +695,9 @@
   if (cpi->oxcf.noise_sensitivity > 0) {
     vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
                        cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       cm->use_highbitdepth,
+#endif
                        VP9_ENC_BORDER_IN_PIXELS);
   }
 #endif
@@ -722,19 +764,12 @@
 
   cm->error.setjmp = 1;
 
-  vp9_rtcd();
-
   cpi->use_svc = 0;
 
   init_config(cpi, oxcf);
   vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
 
   cm->current_video_frame = 0;
-
-  cpi->gold_is_last = 0;
-  cpi->alt_is_last = 0;
-  cpi->gold_is_alt = 0;
-
   cpi->skippable_frame = 0;
 
   // Create the encoder segmentation map and set all entries to 0
@@ -754,6 +789,23 @@
   CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
                    sizeof(cpi->mbgraph_stats[0])); i++) {
     CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
@@ -834,16 +886,16 @@
   cpi->first_time_stamp_ever = INT64_MAX;
 
   cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
-  cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
-  cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
-  cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
-  cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
+  cpi->mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+  cpi->mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+  cpi->mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+  cpi->mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
   cal_nmvsadcosts(cpi->mb.nmvsadcost);
 
-  cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
-  cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
-  cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
-  cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
+  cpi->mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+  cpi->mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+  cpi->mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+  cpi->mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
   cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
@@ -860,8 +912,6 @@
   kf_list = fopen("kf_list.stt", "w");
 #endif
 
-  cpi->output_pkt_list = oxcf->output_pkt_list;
-
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
 
   if (oxcf->pass == 1) {
@@ -871,7 +921,7 @@
     const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
 
     if (cpi->svc.number_spatial_layers > 1
-        && cpi->svc.number_temporal_layers == 1) {
+        || cpi->svc.number_temporal_layers > 1) {
       FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
       FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = {0};
       int i;
@@ -929,7 +979,7 @@
     }
   }
 
-  set_speed_features(cpi);
+  vp9_set_speed_features(cpi);
 
   // Allocate memory to store variances for a frame.
   CHECK_MEM_ERROR(cm, cpi->source_diff_var,
@@ -1012,10 +1062,6 @@
       vp9_sub_pixel_avg_variance4x4,
       vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
-  cpi->full_search_sad = vp9_full_search_sad;
-  cpi->diamond_search_sad = vp9_diamond_search_sad;
-  cpi->refining_search_sad = vp9_refining_search_sad;
-
   /* vp9_init_quantizer() is first called here. Add check in
    * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
    * called later when needed. This will avoid unnecessary calls of
@@ -1234,7 +1280,10 @@
     pkt.data.psnr.psnr[i] = psnr.psnr[i];
   }
   pkt.kind = VPX_CODEC_PSNR_PKT;
-  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+  if (is_two_pass_svc(cpi))
+    cpi->svc.layer_context[cpi->svc.spatial_layer_id].psnr_pkt = pkt.data.psnr;
+  else
+    vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
 }
 
 int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) {
@@ -1418,40 +1467,6 @@
   vp9_extend_frame_borders(dst);
 }
 
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-  FILE *yframe;
-  int i;
-  char filename[255];
-
-  snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-}
-#endif
-
 // Function to test for conditions that indicate we should loop
 // back and recode a frame.
 static int recode_loop_test(const VP9_COMP *cpi,
@@ -1517,7 +1532,7 @@
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
 
-    if (is_spatial_svc(cpi)) {
+    if (is_two_pass_svc(cpi)) {
       cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx;
       cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx;
     }
@@ -1531,17 +1546,32 @@
 
       ref_cnt_fb(cm->frame_bufs,
                  &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+      vpx_memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+                 cpi->interp_filter_selected[0],
+                 sizeof(cpi->interp_filter_selected[0]));
     }
 
     if (cpi->refresh_golden_frame) {
       ref_cnt_fb(cm->frame_bufs,
                  &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+      if (!cpi->rc.is_src_frame_alt_ref)
+        vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+                   cpi->interp_filter_selected[0],
+                   sizeof(cpi->interp_filter_selected[0]));
+      else
+        vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+                   cpi->interp_filter_selected[ALTREF_FRAME],
+                   sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
     }
   }
 
   if (cpi->refresh_last_frame) {
     ref_cnt_fb(cm->frame_bufs,
                &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
+    if (!cpi->rc.is_src_frame_alt_ref)
+      vpx_memcpy(cpi->interp_filter_selected[LAST_FRAME],
+                 cpi->interp_filter_selected[0],
+                 sizeof(cpi->interp_filter_selected[0]));
   }
 #if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
@@ -1596,6 +1626,9 @@
       vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
                                VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
       scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
       cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
@@ -1770,7 +1803,6 @@
     // to recode.
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
       save_coding_context(cpi);
-      cpi->dummy_packing = 1;
       if (!cpi->sf.use_nonrd_pick_mode)
         vp9_pack_bitstream(cpi, dest, size);
 
@@ -1919,36 +1951,26 @@
   } while (loop);
 }
 
-static void get_ref_frame_flags(VP9_COMP *cpi) {
-  if (cpi->refresh_last_frame & cpi->refresh_golden_frame)
-    cpi->gold_is_last = 1;
-  else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame)
-    cpi->gold_is_last = 0;
+static int get_ref_frame_flags(const VP9_COMP *cpi) {
+  const int *const map = cpi->common.ref_frame_map;
+  const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+  const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
-  if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame)
-    cpi->alt_is_last = 1;
-  else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame)
-    cpi->alt_is_last = 0;
+  if (gold_is_last)
+    flags &= ~VP9_GOLD_FLAG;
 
-  if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame)
-    cpi->gold_is_alt = 1;
-  else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame)
-    cpi->gold_is_alt = 0;
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX && !is_two_pass_svc(cpi))
+    flags &= ~VP9_GOLD_FLAG;
 
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+  if (alt_is_last)
+    flags &= ~VP9_ALT_FLAG;
 
-  if (cpi->gold_is_last)
-    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+  if (gold_is_alt)
+    flags &= ~VP9_ALT_FLAG;
 
-  if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
-      !is_spatial_svc(cpi))
-    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
-
-  if (cpi->alt_is_last)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
-  if (cpi->gold_is_alt)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+  return flags;
 }
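
The rewrite above derives the three aliasing tests directly from ref_frame_map instead of the removed gold_is_last / alt_is_last / gold_is_alt bookkeeping: two reference slots alias when they index the same frame buffer, and the duplicate's search flag is masked out. A self-contained sketch of the same masking idea, with hypothetical buffer indices:

    #include <stdio.h>
    enum { LAST_FLAG = 1, GOLD_FLAG = 2, ALT_FLAG = 4 };
    int main(void) {
      // Hypothetical ref_frame_map: golden points at the same buffer as last.
      const int map[3] = { 3 /* last */, 3 /* golden */, 5 /* altref */ };
      int flags = LAST_FLAG | GOLD_FLAG | ALT_FLAG;
      if (map[1] == map[0]) flags &= ~GOLD_FLAG;  // gold aliases last
      if (map[2] == map[0]) flags &= ~ALT_FLAG;   // alt aliases last
      if (map[1] == map[2]) flags &= ~ALT_FLAG;   // gold aliases alt
      printf("flags = %d\n", flags);  // prints 5: golden search is skipped
      return 0;
    }
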
 
 static void set_ext_overrides(VP9_COMP *cpi) {
@@ -1980,18 +2002,16 @@
   }
 }
 
-static void configure_skippable_frame(VP9_COMP *cpi) {
+static int is_skippable_frame(const VP9_COMP *cpi) {
   // If no non-zero motion vector was detected in the first pass for the
   // current frame, nor for its previous and following frames, then the
   // partition search can be skipped for this frame and the partition size
   // assigned according to the variance.
+  const SVC *const svc = &cpi->svc;
+  const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
+      &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
 
-  SVC *const svc = &cpi->svc;
-  TWO_PASS *const twopass = is_spatial_svc(cpi) ?
-                            &svc->layer_context[svc->spatial_layer_id].twopass
-                            : &cpi->twopass;
-
-  cpi->skippable_frame = (!frame_is_intra_only(&cpi->common) &&
+  return (!frame_is_intra_only(&cpi->common) &&
     twopass->stats_in - 2 > twopass->stats_in_start &&
     twopass->stats_in < twopass->stats_in_end &&
     (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
@@ -2017,19 +2037,69 @@
   cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
 }
 
+static void set_mv_search_params(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const unsigned int max_mv_def = MIN(cm->width, cm->height);
+
+  // Default based on max resolution.
+  cpi->mv_step_param = vp9_init_search_range(max_mv_def);
+
+  if (cpi->sf.mv.auto_mv_step_size) {
+    if (frame_is_intra_only(cm)) {
+      // Initialize max_mv_magnitude for use in the first INTER frame
+      // after a key/intra-only frame.
+      cpi->max_mv_magnitude = max_mv_def;
+    } else {
+      if (cm->show_frame)
+        // Allow mv_steps to correspond to twice the max mv magnitude found
+        // in the previous frame, capped by the default max_mv_magnitude based
+        // on resolution.
+        cpi->mv_step_param =
+            vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+      cpi->max_mv_magnitude = 0;
+    }
+  }
+}
+
+static int setup_interp_filter_search_mask(VP9_COMP *cpi) {
+  INTERP_FILTER ifilter;
+  int ref_total[MAX_REF_FRAMES] = {0};
+  MV_REFERENCE_FRAME ref;
+  int mask = 0;
+  if (cpi->common.last_frame_type == KEY_FRAME ||
+      cpi->refresh_alt_ref_frame)
+    return mask;
+  for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
+    for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter)
+      ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
+
+  for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) {
+    if ((ref_total[LAST_FRAME] &&
+        cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
+        (ref_total[GOLDEN_FRAME] == 0 ||
+         cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50
+           < ref_total[GOLDEN_FRAME]) &&
+        (ref_total[ALTREF_FRAME] == 0 ||
+         cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50
+           < ref_total[ALTREF_FRAME]))
+      mask |= 1 << ifilter;
+  }
+  return mask;
+}
+
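
Note the thresholds in setup_interp_filter_search_mask(): a filter is masked only if LAST never chose it at all and it accounts for under 2% of selections (count * 50 < total) on each of the golden and altref frames. The intended consumer is presumably the RD interpolation-filter search; a hedged sketch of that consumption, with rd_cost_for_filter() as a hypothetical helper:

    // Assumed consumer shape: skip filters flagged in the search mask.
    static INTERP_FILTER pick_filter(const VP9_COMP *cpi) {
      INTERP_FILTER f, best = EIGHTTAP;
      int64_t best_rd = INT64_MAX;
      for (f = EIGHTTAP; f <= EIGHTTAP_SHARP; ++f) {
        int64_t rd;
        if (cpi->sf.interp_filter_search_mask & (1 << f))
          continue;  // historically unused for every reference; skip it
        rd = rd_cost_for_filter(cpi, f);  // hypothetical RD evaluation
        if (rd < best_rd) { best_rd = rd; best = f; }
      }
      return best;
    }
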
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
                                       size_t *size,
                                       uint8_t *dest,
                                       unsigned int *frame_flags) {
   VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  struct segmentation *const seg = &cm->seg;
   TX_SIZE t;
   int q;
   int top_index;
   int bottom_index;
 
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const unsigned int max_mv_def = MIN(cm->width, cm->height);
-  struct segmentation *const seg = &cm->seg;
   set_ext_overrides(cpi);
 
   cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
@@ -2055,24 +2125,13 @@
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
 
-  // Initialize cpi->mv_step_param to default based on max resolution.
-  cpi->mv_step_param = vp9_init_search_range(max_mv_def);
-  // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
-  if (sf->mv.auto_mv_step_size) {
-    if (frame_is_intra_only(cm)) {
-      // Initialize max_mv_magnitude for use in the first INTER frame
-      // after a key/intra-only frame.
-      cpi->max_mv_magnitude = max_mv_def;
-    } else {
-      if (cm->show_frame)
-        // Allow mv_steps to correspond to twice the max mv magnitude found
-        // in the previous frame, capped by the default max_mv_magnitude based
-        // on resolution.
-        cpi->mv_step_param = vp9_init_search_range(MIN(max_mv_def, 2 *
-                                 cpi->max_mv_magnitude));
-      cpi->max_mv_magnitude = 0;
-    }
-  }
+  set_mv_search_params(cpi);
+
+  if (cpi->oxcf.pass == 2 &&
+      cpi->sf.adaptive_interp_filter_search)
+    cpi->sf.interp_filter_search_mask =
+        setup_interp_filter_search_mask(cpi);
 
   // Set various flags etc to special state if it is a key frame.
   if (frame_is_intra_only(cm)) {
@@ -2088,9 +2147,7 @@
     // The alternate reference frame cannot be active for a key frame.
     cpi->rc.source_alt_ref_active = 0;
 
-    cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
-    cm->frame_parallel_decoding_mode =
-      (cpi->oxcf.frame_parallel_decoding_mode != 0);
+    cm->error_resilient_mode = oxcf->error_resilient_mode;
 
     // By default, encoder assumes decoder can use prev_mi.
     if (cm->error_resilient_mode) {
@@ -2098,29 +2155,59 @@
       cm->reset_frame_context = 0;
       cm->refresh_frame_context = 0;
     } else if (cm->intra_only) {
+      cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
       // Only reset the current context.
       cm->reset_frame_context = 2;
     }
   }
+  if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) {
+    cm->frame_context_idx =
+        cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
+        cpi->svc.temporal_layer_id;
+
+    // When frame_parallel_decoding_mode is 0, the probabilities are updated
+    // based on the frame type of the previous frame. That type may vary for
+    // the frame after a key frame in the base layer, since enhancement
+    // layers may be dropped, so set frame_parallel_decoding_mode to 1 here.
+    if (cpi->svc.number_temporal_layers == 1) {
+      if (cpi->svc.spatial_layer_id == 0 &&
+          cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
+        cm->frame_parallel_decoding_mode = 1;
+      else
+        cm->frame_parallel_decoding_mode = 0;
+    } else if (cpi->svc.spatial_layer_id == 0) {
+      // Find the 2nd frame in the temporal base layer and the 1st frame in
+      // each temporal enhancement layer, counting from the key frame.
+      int i;
+      for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
+        if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
+          cm->frame_parallel_decoding_mode = 1;
+          break;
+        }
+      }
+      if (i == cpi->svc.number_temporal_layers)
+        cm->frame_parallel_decoding_mode = 0;
+    }
+  }
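
Each (spatial, temporal) layer thus gets its own probability context. A quick worked example of the index computation, with hypothetical layer counts (the result must stay within the codec's fixed number of frame contexts):

    // 2 spatial x 2 temporal layers -> 4 distinct context slots:
    //   (s=0,t=0)->0  (s=0,t=1)->1  (s=1,t=0)->2  (s=1,t=1)->3
    static int frame_context_idx(int spatial_id, int temporal_id,
                                 int num_temporal_layers) {
      return spatial_id * num_temporal_layers + temporal_id;
    }
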
 
   // Configure experimental use of segmentation for enhanced coding of
   // static regions if indicated.
   // Only allowed in the second pass of two-pass encoding (as it requires
   // lagged coding) and if the relevant speed feature flag is set.
-  if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation)
+  if (oxcf->pass == 2 && cpi->sf.static_segmentation)
     configure_static_seg_features(cpi);
 
   // Check if the current frame is skippable for the partition search in the
   // second pass according to the first pass stats
-  if (cpi->oxcf.pass == 2 &&
-      (!cpi->use_svc || is_spatial_svc(cpi))) {
-    configure_skippable_frame(cpi);
+  if (oxcf->pass == 2 &&
+      (!cpi->use_svc || is_two_pass_svc(cpi))) {
+    cpi->skippable_frame = is_skippable_frame(cpi);
   }
 
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
-  if (cpi->oxcf.pass == 0 &&
-      cpi->oxcf.rc_mode == VPX_CBR &&
+  if (oxcf->pass == 0 &&
+      oxcf->rc_mode == VPX_CBR &&
       cm->frame_type != KEY_FRAME) {
     if (vp9_rc_drop_frame(cpi)) {
       vp9_rc_postencode_update_drop_frame(cpi);
@@ -2132,9 +2219,9 @@
   vp9_clear_system_state();
 
 #if CONFIG_VP9_POSTPROC
-  if (cpi->oxcf.noise_sensitivity > 0) {
+  if (oxcf->noise_sensitivity > 0) {
     int l = 0;
-    switch (cpi->oxcf.noise_sensitivity) {
+    switch (oxcf->noise_sensitivity) {
       case 1:
         l = 20;
         break;
@@ -2156,7 +2243,18 @@
   }
 #endif
 
-  set_speed_features(cpi);
+#if CONFIG_INTERNAL_STATS
+  {
+    int i;
+    for (i = 0; i < MAX_MODES; ++i)
+      cpi->mode_chosen_counts[i] = 0;
+  }
+#endif
+
+  vp9_set_speed_features(cpi);
+
+  vp9_set_rd_speed_thresholds(cpi);
+  vp9_set_rd_speed_thresholds_sub8x8(cpi);
 
   // Decide q and q bounds.
   q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
@@ -2175,7 +2273,7 @@
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
-  if (cpi->oxcf.noise_sensitivity > 0) {
+  if (oxcf->noise_sensitivity > 0) {
     vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
                             yuv_denoised_file);
   }
@@ -2196,29 +2294,10 @@
 
   cm->frame_to_show = get_frame_new_buffer(cm);
 
-#if WRITE_RECON_BUFFER
-  if (cm->show_frame)
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame);
-  else
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 1000);
-#endif
-
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
 
-#if WRITE_RECON_BUFFER
-  if (cm->show_frame)
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 2000);
-  else
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 3000);
-#endif
-
   // build the bitstream
-  cpi->dummy_packing = 0;
   vp9_pack_bitstream(cpi, dest, size);
 
   if (cm->seg.update_map)
@@ -2250,7 +2329,7 @@
   else
     cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
 
-  get_ref_frame_flags(cpi);
+  cpi->ref_frame_flags = get_ref_frame_flags(cpi);
 
   cm->last_frame_type = cm->frame_type;
   vp9_rc_postencode_update(cpi, *size);
@@ -2277,8 +2356,12 @@
   cm->last_height = cm->height;
 
   // reset to normal state now that we are done.
-  if (!cm->show_existing_frame)
-    cm->last_show_frame = cm->show_frame;
+  if (!cm->show_existing_frame) {
+    if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0)
+      cm->last_show_frame = 0;
+    else
+      cm->last_show_frame = cm->show_frame;
+  }
 
   if (cm->show_frame) {
     vp9_swap_mi_and_prev_mi(cm);
@@ -2287,8 +2370,12 @@
     // update not a real frame
     ++cm->current_video_frame;
     if (cpi->use_svc)
-      vp9_inc_frame_in_layer(&cpi->svc);
+      vp9_inc_frame_in_layer(cpi);
   }
+
+  if (is_two_pass_svc(cpi))
+    cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type =
+        cm->frame_type;
 }
 
 static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
@@ -2361,7 +2448,7 @@
   vpx_usec_timer_start(&timer);
 
 #if CONFIG_SPATIAL_SVC
-  if (is_spatial_svc(cpi))
+  if (is_two_pass_svc(cpi))
     res = vp9_svc_lookahead_push(cpi, cpi->lookahead, sd, time_stamp, end_time,
                                  frame_flags);
   else
@@ -2403,18 +2490,19 @@
          cm->seg.update_data;
 }
 
-void adjust_frame_rate(VP9_COMP *cpi) {
+void adjust_frame_rate(VP9_COMP *cpi,
+                       const struct lookahead_entry *source) {
   int64_t this_duration;
   int step = 0;
 
-  if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
-    this_duration = cpi->source->ts_end - cpi->source->ts_start;
+  if (source->ts_start == cpi->first_time_stamp_ever) {
+    this_duration = source->ts_end - source->ts_start;
     step = 1;
   } else {
     int64_t last_duration = cpi->last_end_time_stamp_seen
         - cpi->last_time_stamp_seen;
 
-    this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+    this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
 
     // do a step update if the duration changes by 10%
     if (last_duration)
@@ -2428,17 +2516,17 @@
       // Average this frame's rate into the last second's average
       // frame rate. If we haven't seen 1 second yet, then average
       // over the whole interval seen.
-      const double interval = MIN((double)(cpi->source->ts_end
+      const double interval = MIN((double)(source->ts_end
                                    - cpi->first_time_stamp_ever), 10000000.0);
-      double avg_duration = 10000000.0 / cpi->oxcf.framerate;
+      double avg_duration = 10000000.0 / cpi->framerate;
       avg_duration *= (interval - avg_duration + this_duration);
       avg_duration /= interval;
 
       vp9_new_framerate(cpi, 10000000.0 / avg_duration);
     }
   }
-  cpi->last_time_stamp_seen = cpi->source->ts_start;
-  cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+  cpi->last_time_stamp_seen = source->ts_start;
+  cpi->last_end_time_stamp_seen = source->ts_end;
 }
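
The averaging above folds each shown frame's duration into a window capped at 10,000,000 ticks (10 seconds): avg' = avg * (interval - avg + d) / interval, then framerate = 10^7 / avg'. A numeric sketch with hypothetical timestamps:

    #include <stdio.h>
    int main(void) {
      const double framerate = 30.0;             // current estimate
      double avg = 10000000.0 / framerate;       // ~333333 ticks per frame
      const double this_duration = 400000.0;     // one slower frame (25 fps)
      const double interval = 10000000.0;        // capped averaging window
      avg = avg * (interval - avg + this_duration) / interval;
      printf("new framerate ~ %.2f\n", 10000000.0 / avg);  // ~29.80
      return 0;
    }

A single slow frame nudges the estimate only slightly; the 10% step check above handles genuine rate changes instead.
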
 
 // Returns 0 if this is not an alt ref else the offset of the source frame
@@ -2459,7 +2547,8 @@
   return arf_src_index;
 }
 
-static void check_src_altref(VP9_COMP *cpi) {
+static void check_src_altref(VP9_COMP *cpi,
+                             const struct lookahead_entry *source) {
   RATE_CONTROL *const rc = &cpi->rc;
 
   if (cpi->oxcf.pass == 2) {
@@ -2468,7 +2557,7 @@
       (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
   } else {
     rc->is_src_frame_alt_ref = cpi->alt_ref_source &&
-                               (cpi->source == cpi->alt_ref_source);
+                               (source == cpi->alt_ref_source);
   }
 
   if (rc->is_src_frame_alt_ref) {
@@ -2484,18 +2573,18 @@
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   RATE_CONTROL *const rc = &cpi->rc;
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+  struct lookahead_entry *last_source = NULL;
+  struct lookahead_entry *source = NULL;
   MV_REFERENCE_FRAME ref_frame;
   int arf_src_index;
 
-  if (!cpi)
-    return -1;
-
-  if (is_spatial_svc(cpi) && cpi->oxcf.pass == 2) {
+  if (is_two_pass_svc(cpi) && oxcf->pass == 2) {
 #if CONFIG_SPATIAL_SVC
     vp9_svc_lookahead_peek(cpi, cpi->lookahead, 0, 1);
 #endif
@@ -2504,9 +2593,6 @@
 
   vpx_usec_timer_start(&cmptimer);
 
-  cpi->source = NULL;
-  cpi->last_source = NULL;
-
   vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
 
   // Normal defaults
@@ -2522,21 +2608,20 @@
     assert(arf_src_index <= rc->frames_to_key);
 
 #if CONFIG_SPATIAL_SVC
-    if (is_spatial_svc(cpi))
-      cpi->source = vp9_svc_lookahead_peek(cpi, cpi->lookahead,
-                                           arf_src_index, 0);
+    if (is_two_pass_svc(cpi))
+      source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, arf_src_index, 0);
     else
 #endif
-      cpi->source = vp9_lookahead_peek(cpi->lookahead, arf_src_index);
-    if (cpi->source != NULL) {
-      cpi->alt_ref_source = cpi->source;
+      source = vp9_lookahead_peek(cpi->lookahead, arf_src_index);
+    if (source != NULL) {
+      cpi->alt_ref_source = source;
 
 #if CONFIG_SPATIAL_SVC
-      if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
+      if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
         int i;
         // Reference a hidden frame from a lower layer
         for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) {
-          if (cpi->oxcf.ss_play_alternate[i]) {
+          if (oxcf->ss_play_alternate[i]) {
             cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx;
             break;
           }
@@ -2545,7 +2630,7 @@
       cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1;
 #endif
 
-      if (cpi->oxcf.arnr_max_frames > 0) {
+      if (oxcf->arnr_max_frames > 0) {
         // Produce the filtered ARF frame.
         vp9_temporal_filter(cpi, arf_src_index);
         vp9_extend_frame_borders(&cpi->alt_ref_buffer);
@@ -2563,62 +2648,57 @@
     }
   }
 
-  if (!cpi->source) {
+  if (!source) {
     // Get last frame source.
     if (cm->current_video_frame > 0) {
 #if CONFIG_SPATIAL_SVC
-      if (is_spatial_svc(cpi))
-        cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0);
+      if (is_two_pass_svc(cpi))
+        last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0);
       else
 #endif
-        cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1);
-      if (cpi->last_source == NULL)
+        last_source = vp9_lookahead_peek(cpi->lookahead, -1);
+      if (last_source == NULL)
         return -1;
     }
 
     // Read in the source frame.
 #if CONFIG_SPATIAL_SVC
-    if (is_spatial_svc(cpi))
-      cpi->source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
+    if (is_two_pass_svc(cpi))
+      source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
     else
 #endif
-      cpi->source = vp9_lookahead_pop(cpi->lookahead, flush);
-    if (cpi->source != NULL) {
+      source = vp9_lookahead_pop(cpi->lookahead, flush);
+    if (source != NULL) {
       cm->show_frame = 1;
       cm->intra_only = 0;
 
       // Check to see if the frame should be encoded as an arf overlay.
-      check_src_altref(cpi);
+      check_src_altref(cpi, source);
     }
   }
 
-  if (cpi->source) {
+  if (source) {
     cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
-                                                           : &cpi->source->img;
+                                                           : &source->img;
 
-    if (cpi->last_source != NULL) {
-      cpi->unscaled_last_source = &cpi->last_source->img;
-    } else {
-      cpi->unscaled_last_source = NULL;
-    }
+    cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
 
-    *time_stamp = cpi->source->ts_start;
-    *time_end = cpi->source->ts_end;
-    *frame_flags =
-        (cpi->source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+    *time_stamp = source->ts_start;
+    *time_end = source->ts_end;
+    *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
   } else {
     *size = 0;
-    if (flush && cpi->oxcf.pass == 1 && !cpi->twopass.first_pass_done) {
+    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
       vp9_end_first_pass(cpi);    /* get last stats packet */
       cpi->twopass.first_pass_done = 1;
     }
     return -1;
   }
 
-  if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
-    cpi->first_time_stamp_ever = cpi->source->ts_start;
-    cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+  if (source->ts_start < cpi->first_time_stamp_ever) {
+    cpi->first_time_stamp_ever = source->ts_start;
+    cpi->last_end_time_stamp_seen = source->ts_start;
   }
 
   // Clear down mmx registers
@@ -2626,11 +2706,11 @@
 
   // adjust frame rates based on timestamps given
   if (cm->show_frame) {
-    adjust_frame_rate(cpi);
+    adjust_frame_rate(cpi, source);
   }
 
   if (cpi->svc.number_temporal_layers > 1 &&
-      cpi->oxcf.rc_mode == VPX_CBR) {
+      oxcf->rc_mode == VPX_CBR) {
     vp9_update_temporal_layer_framerate(cpi);
     vp9_restore_layer_context(cpi);
   }
@@ -2647,7 +2727,7 @@
   if (!cpi->use_svc && cpi->multi_arf_allowed) {
     if (cm->frame_type == KEY_FRAME) {
       init_buffer_indices(cpi);
-    } else if (cpi->oxcf.pass == 2) {
+    } else if (oxcf->pass == 2) {
       const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
       cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
     }
@@ -2655,19 +2735,22 @@
 
   cpi->frame_flags = *frame_flags;
 
-  if (cpi->oxcf.pass == 2 &&
+  if (oxcf->pass == 2 &&
       cm->current_video_frame == 0 &&
-      cpi->oxcf.allow_spatial_resampling &&
-      cpi->oxcf.rc_mode == VPX_VBR) {
+      oxcf->allow_spatial_resampling &&
+      oxcf->rc_mode == VPX_VBR) {
     // Internal scaling is triggered on the first frame.
-    vp9_set_size_literal(cpi, cpi->oxcf.scaled_frame_width,
-                         cpi->oxcf.scaled_frame_height);
+    vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
+                         oxcf->scaled_frame_height);
   }
 
   // Reset the frame pointers to the current frame size
   vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
                            cm->width, cm->height,
                            cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           cm->use_highbitdepth,
+#endif
                            VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
 
   alloc_util_frame_buffers(cpi);
@@ -2689,18 +2772,27 @@
 
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
 
-  if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+  if (oxcf->aq_mode == VARIANCE_AQ) {
     vp9_vaq_init();
   }
 
-  if (cpi->oxcf.pass == 1 &&
-      (!cpi->use_svc || is_spatial_svc(cpi))) {
-    const int lossless = is_lossless_requested(&cpi->oxcf);
+  if (oxcf->pass == 1 &&
+      (!cpi->use_svc || is_two_pass_svc(cpi))) {
+    const int lossless = is_lossless_requested(oxcf);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->oxcf.use_highbitdepth)
+      cpi->mb.fwd_txm4x4 = lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4;
+    else
+      cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+    cpi->mb.high_itxm_add = lossless ? vp9_high_iwht4x4_add :
+                                       vp9_high_idct4x4_add;
+#else
     cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+#endif
     cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
-    vp9_first_pass(cpi);
-  } else if (cpi->oxcf.pass == 2 &&
-      (!cpi->use_svc || is_spatial_svc(cpi))) {
+    vp9_first_pass(cpi, source);
+  } else if (oxcf->pass == 2 &&
+      (!cpi->use_svc || is_two_pass_svc(cpi))) {
     Pass2Encode(cpi, size, dest, frame_flags);
   } else if (cpi->use_svc) {
     SvcEncode(cpi, size, dest, frame_flags);
@@ -2723,20 +2815,22 @@
 
   // Save layer specific state.
   if ((cpi->svc.number_temporal_layers > 1 &&
-      cpi->oxcf.rc_mode == VPX_CBR) ||
-      (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) {
+       oxcf->rc_mode == VPX_CBR) ||
+      ((cpi->svc.number_temporal_layers > 1 ||
+        cpi->svc.number_spatial_layers > 1) &&
+       oxcf->pass == 2)) {
     vp9_save_layer_context(cpi);
   }
 
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
 
-  if (cpi->b_calculate_psnr && cpi->oxcf.pass != 1 && cm->show_frame)
+  if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
     generate_psnr_packet(cpi);
 
 #if CONFIG_INTERNAL_STATS
 
-  if (cpi->oxcf.pass != 1) {
+  if (oxcf->pass != 1) {
     cpi->bytes += (int)(*size);
 
     if (cm->show_frame) {
@@ -2776,12 +2870,12 @@
           cpi->totalp_sq_error += psnr2.sse[0];
           cpi->totalp_samples += psnr2.samples[0];
 
-          frame_ssim2 = vp9_calc_ssim(orig, recon, 1, &weight);
+          frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
 
           cpi->summed_quality += frame_ssim2 * weight;
           cpi->summed_weights += weight;
 
-          frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, 1, &weight);
+          frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
 
           cpi->summedp_quality += frame_ssim2 * weight;
           cpi->summedp_weights += weight;
@@ -2797,6 +2891,7 @@
         }
       }
 
       if (cpi->b_calculate_ssimg) {
         double y, u, v, frame_all;
         frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index c841da2..80774de 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -82,36 +82,19 @@
 } VPX_SCALING;
 
 typedef enum {
-  // Good Quality Fast Encoding. The encoder balances quality with the
-  // amount of time it takes to encode the output. (speed setting
-  // controls how fast)
-  ONE_PASS_GOOD = 1,
+  // Good Quality Fast Encoding. The encoder balances quality with the amount of
+  // time it takes to encode the output. Speed setting controls how fast.
+  GOOD,
 
-  // One Pass - Best Quality. The encoder places priority on the
-  // quality of the output over encoding speed. The output is compressed
-  // at the highest possible quality. This option takes the longest
-  // amount of time to encode. (speed setting ignored)
-  ONE_PASS_BEST = 2,
+  // The encoder places priority on the quality of the output over encoding
+  // speed. The output is compressed at the highest possible quality. This
+  // option takes the longest amount of time to encode. Speed setting ignored.
+  BEST,
 
-  // Two Pass - First Pass. The encoder generates a file of statistics
-  // for use in the second encoding pass. (speed setting controls how fast)
-  TWO_PASS_FIRST = 3,
-
-  // Two Pass - Second Pass. The encoder uses the statistics that were
-  // generated in the first encoding pass to create the compressed
-  // output. (speed setting controls how fast)
-  TWO_PASS_SECOND_GOOD = 4,
-
-  // Two Pass - Second Pass Best.  The encoder uses the statistics that
-  // were generated in the first encoding pass to create the compressed
-  // output using the highest possible quality, and taking a
-  // longer amount of time to encode. (speed setting ignored)
-  TWO_PASS_SECOND_BEST = 5,
-
-  // Realtime/Live Encoding. This mode is optimized for realtime
-  // encoding (for example, capturing a television signal or feed from
-  // a live camera). (speed setting controls how fast)
-  REALTIME = 6,
+  // Realtime/Live Encoding. This mode is optimized for realtime encoding (for
+  // example, capturing a television signal or feed from a live camera). Speed
+  // setting controls how fast.
+  REALTIME
 } MODE;
 
 typedef enum {
@@ -131,10 +114,11 @@
 
 typedef struct VP9EncoderConfig {
   BITSTREAM_PROFILE profile;
-  BIT_DEPTH bit_depth;
+  vpx_bit_depth_t bit_depth;     // Codec bit-depth.
   int width;  // width of data passed to the compressor
   int height;  // height of data passed to the compressor
-  double framerate;  // set to passed in framerate
+  unsigned int input_bit_depth;  // Input bit depth.
+  double init_framerate;  // set to passed in framerate
   int64_t target_bandwidth;  // bandwidth to be used in kilobits per second
 
   int noise_sensitivity;  // pre processing blur: recommendation 0
@@ -220,39 +204,35 @@
 
   int arnr_max_frames;
   int arnr_strength;
-  int arnr_type;
 
   int tile_columns;
   int tile_rows;
 
-  struct vpx_fixed_buf         two_pass_stats_in;
-  struct vpx_codec_pkt_list  *output_pkt_list;
+  vpx_fixed_buf_t two_pass_stats_in;
+  struct vpx_codec_pkt_list *output_pkt_list;
 
 #if CONFIG_FP_MB_STATS
-  struct vpx_fixed_buf         firstpass_mb_stats_in;
+  vpx_fixed_buf_t firstpass_mb_stats_in;
 #endif
 
   vp8e_tuning tuning;
   vp9e_tune_content content;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int use_highbitdepth;
+#endif
 } VP9EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
   return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
 }
 
-static INLINE int is_best_mode(MODE mode) {
-  return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
-}
-
 typedef struct VP9_COMP {
   QUANTS quants;
   MACROBLOCK mb;
   VP9_COMMON common;
   VP9EncoderConfig oxcf;
   struct lookahead_ctx    *lookahead;
-  struct lookahead_entry  *source;
   struct lookahead_entry  *alt_ref_source;
-  struct lookahead_entry  *last_source;
 
   YV12_BUFFER_CONFIG *Source;
   YV12_BUFFER_CONFIG *Last_Source;  // NULL for first frame and alt_ref frames
@@ -261,10 +241,6 @@
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
-  int gold_is_last;  // gold same as last frame ( short circuit gold searches)
-  int alt_is_last;  // Alt same as last ( short circuit altref search)
-  int gold_is_alt;  // don't do both alt and gold search ( just do gold).
-
   int skippable_frame;
 
   int scaled_ref_idx[3];
@@ -296,18 +272,23 @@
 
   CODING_CONTEXT coding_context;
 
+  int *nmvcosts[2];
+  int *nmvcosts_hp[2];
+  int *nmvsadcosts[2];
+  int *nmvsadcosts_hp[2];
+
   int zbin_mode_boost;
   int zbin_mode_boost_enabled;
-  int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
-  int active_arnr_strength;         // <= cpi->oxcf.arnr_max_strength
 
   int64_t last_time_stamp_seen;
   int64_t last_end_time_stamp_seen;
   int64_t first_time_stamp_ever;
 
   RATE_CONTROL rc;
+  double framerate;
 
   vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+  int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE];
 
   struct vpx_codec_pkt_list  *output_pkt_list;
 
@@ -354,7 +335,7 @@
   TWO_PASS twopass;
 
   YV12_BUFFER_CONFIG alt_ref_buffer;
-  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
 
 #if CONFIG_INTERNAL_STATS
   unsigned int mode_chosen_counts[MAX_MODES];
@@ -393,10 +374,6 @@
 
   int droppable;
 
-  int dummy_packing;    /* flag to indicate if packing is dummy */
-
-  unsigned int tx_stepdown_count[TX_SIZES];
-
   int initial_width;
   int initial_height;
 
@@ -415,7 +392,7 @@
   search_site_config ss_cfg;
 
   int mbmode_cost[INTRA_MODES];
-  unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+  unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
   int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
@@ -496,14 +473,6 @@
       .buf;
 }
 
-// Intra only frames, golden frames (except alt ref overlays) and
-// alt ref frames tend to be coded at a higher than ambient quality
-static INLINE int frame_is_boosted(const VP9_COMP *cpi) {
-  return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
-         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) ||
-         vp9_is_upper_layer_key_frame(cpi);
-}
-
 static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
   // TODO(JBB): double check we can't exceed this token count if we have a
   // 32x32 transform crossing a boundary at a multiple of 16.
@@ -521,8 +490,6 @@
 
 void vp9_update_reference_frames(VP9_COMP *cpi);
 
-int64_t vp9_rescale(int64_t val, int64_t num, int denom);
-
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
@@ -531,16 +498,17 @@
 
 void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
 
-static INLINE int is_spatial_svc(const struct VP9_COMP *const cpi) {
+static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) {
   return cpi->use_svc &&
-         cpi->svc.number_temporal_layers == 1 &&
-         cpi->svc.number_spatial_layers > 1;
+         (cpi->svc.number_temporal_layers > 1 ||
+          cpi->svc.number_spatial_layers > 1) &&
+         (cpi->oxcf.pass == 1 || cpi->oxcf.pass == 2);
 }
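
The renamed predicate now also covers temporal-layer configurations, and additionally requires a two-pass mode. A standalone restatement with a few hypothetical configurations:

    // use_svc  spatial  temporal  pass  -> result
    //    1        2        1        2      1  (the old is_spatial_svc case)
    //    1        1        3        2      1  (temporal-only SVC, now included)
    //    1        2        1        0      0  (one-pass: excluded)
    //    0        1        1        2      0  (SVC disabled)
    static int two_pass_svc(int use_svc, int spatial, int temporal, int pass) {
      return use_svc && (temporal > 1 || spatial > 1) &&
             (pass == 1 || pass == 2);
    }
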
 
 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
   return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 &&
          (cpi->oxcf.play_alternate &&
-          (!is_spatial_svc(cpi) ||
+          (!is_two_pass_svc(cpi) ||
            cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]));
 }
 
@@ -557,6 +525,10 @@
   return frame_index & 0x1;
 }
 
+static INLINE int *cond_sad_list(const struct VP9_COMP *cpi, int *sad_list) {
+  return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL;
+}
+
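
cond_sad_list() gates the per-candidate SAD list on the sub-pel search method: SUBPEL_TREE does not consume it, so its callers pass NULL and the full-pel search skips recording the values. A hedged sketch of the assumed call-site pattern (full_pel_search() is illustrative, not the real signature):

    int sad_list[5];  // SADs at the best full-pel mv and its four neighbors
    // Non-SUBPEL_TREE methods get the filled list; SUBPEL_TREE gets NULL.
    full_pel_search(x, &mvp_full, cond_sad_list(cpi, sad_list), &best_mv);
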
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 295e437..54b57cf 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -76,16 +76,6 @@
   p->stats_in = position;
 }
 
-static int lookup_next_frame_stats(const TWO_PASS *p,
-                                   FIRSTPASS_STATS *next_frame) {
-  if (p->stats_in >= p->stats_in_end)
-    return EOF;
-
-  *next_frame = *p->stats_in;
-  return 1;
-}
-
-
 // Read frame stats at an offset from the current position.
 static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
   if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
@@ -256,7 +246,7 @@
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  if (is_spatial_svc(cpi)) {
+  if (is_two_pass_svc(cpi)) {
     int i;
     for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
       output_stats(&cpi->svc.layer_context[i].twopass.total_stats,
@@ -396,7 +386,7 @@
   cpi->rc.frames_to_key = INT_MAX;
 }
 
-void vp9_first_pass(VP9_COMP *cpi) {
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
@@ -428,10 +418,12 @@
   int neutral_count = 0;
   int new_mv_count = 0;
   int sum_in_vectors = 0;
-  uint32_t lastmv_as_int = 0;
+  MV lastmv = {0, 0};
   TWO_PASS *twopass = &cpi->twopass;
   const MV zero_mv = {0, 0};
   const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+  LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
+        &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
 
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
@@ -444,42 +436,52 @@
   set_first_pass_params(cpi);
   vp9_set_quantizer(cm, find_fp_qindex());
 
-  if (is_spatial_svc(cpi)) {
-    MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
-    const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL;
-    twopass = &cpi->svc.layer_context[cpi->svc.spatial_layer_id].twopass;
+  if (lc != NULL) {
+    twopass = &lc->twopass;
 
-    if (cpi->common.current_video_frame == 0) {
-      cpi->ref_frame_flags = 0;
+    cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
+    cpi->ref_frame_flags = VP9_LAST_FLAG;
+
+    if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
+        REF_FRAMES) {
+      cpi->gld_fb_idx =
+          cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
+      cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+      cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
     } else {
-      LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
-      if (lc->current_video_frame_in_layer == 0)
-        cpi->ref_frame_flags = VP9_GOLD_FLAG;
-      else
-        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      cpi->refresh_golden_frame = 0;
     }
 
+    if (lc->current_video_frame_in_layer == 0)
+      cpi->ref_frame_flags = 0;
+
     vp9_scale_references(cpi);
 
     // Use either last frame or alt frame for motion search.
     if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-      scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
-      ref_frame = LAST_FRAME;
-    } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-      scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
-      ref_frame = GOLDEN_FRAME;
+      first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
+      if (first_ref_buf == NULL)
+        first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
     }
 
-    if (scaled_ref_buf != NULL)
-      first_ref_buf = scaled_ref_buf;
+    if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+      const int ref_idx =
+          cm->ref_frame_map[get_ref_frame_idx(cpi, GOLDEN_FRAME)];
+      const int scaled_idx = cpi->scaled_ref_idx[GOLDEN_FRAME - 1];
+
+      gld_yv12 = (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf :
+                 get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+    } else {
+      gld_yv12 = NULL;
+    }
 
     recon_y_stride = new_yv12->y_stride;
     recon_uv_stride = new_yv12->uv_stride;
     uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
 
-    // Disable golden frame for svc first pass for now.
-    gld_yv12 = NULL;
-    set_ref_ptrs(cm, xd, ref_frame, NONE);
+    set_ref_ptrs(cm, xd,
+                 (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME: NONE,
+                 (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
 
     cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
                                         &cpi->scaled_source);
@@ -511,9 +513,7 @@
   vp9_tile_init(&tile, cm, 0, 0);
 
   for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
-    int_mv best_ref_mv;
-
-    best_ref_mv.as_int = 0;
+    MV best_ref_mv = {0, 0};
 
     // Reset above block coeffs.
     xd->up_available = (mb_row != 0);
@@ -591,16 +591,16 @@
       x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
 
       // Other than for the first frame do a motion search.
-      if (cm->current_video_frame > 0) {
+      if ((lc == NULL && cm->current_video_frame > 0) ||
+          (lc != NULL && lc->current_video_frame_in_layer > 0)) {
         int tmp_err, motion_error, raw_motion_error;
-        int_mv mv, tmp_mv;
+        // Assume 0,0 motion with no mv overhead.
+        MV mv = {0, 0}, tmp_mv = {0, 0};
         struct buf_2d unscaled_last_source_buf_2d;
 
         xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
         motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                             &xd->plane[0].pre[0]);
-        // Assume 0,0 motion with no mv overhead.
-        mv.as_int = tmp_mv.as_int = 0;
 
         // Compute the motion error of the 0,0 motion using the last source
         // frame as the reference. Skip the further motion search on
@@ -613,11 +613,10 @@
                                                 &unscaled_last_source_buf_2d);
 
         // TODO(pengchong): Replace the hard-coded threshold
-        if (raw_motion_error > 25 || is_spatial_svc(cpi)) {
+        if (raw_motion_error > 25 || lc != NULL) {
           // Test last reference frame using the previous best mv as the
           // starting point (best reference) for the search.
-          first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
-                                   &motion_error);
+          first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
             vp9_clear_system_state();
             motion_error = (int)(motion_error * error_weight);
@@ -625,9 +624,9 @@
 
           // If the current best reference mv is not centered on 0,0 then do a
           // 0,0 based search as well.
-          if (best_ref_mv.as_int) {
+          if (!is_zero_mv(&best_ref_mv)) {
             tmp_err = INT_MAX;
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err);
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
             if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
               vp9_clear_system_state();
               tmp_err = (int)(tmp_err * error_weight);
@@ -635,12 +634,14 @@
 
             if (tmp_err < motion_error) {
               motion_error = tmp_err;
-              mv.as_int = tmp_mv.as_int;
+              mv = tmp_mv;
             }
           }
 
           // Search in an older reference frame.
-          if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
+          if (((lc == NULL && cm->current_video_frame > 1) ||
+               (lc != NULL && lc->current_video_frame_in_layer > 1))
+              && gld_yv12 != NULL) {
             // Assume 0,0 motion with no mv overhead.
             int gf_motion_error;
 
@@ -648,7 +649,7 @@
             gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                                    &xd->plane[0].pre[0]);
 
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
                                      &gf_motion_error);
             if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
               vp9_clear_system_state();
@@ -679,7 +680,8 @@
         }
 
         // Start by assuming that intra mode is best.
-        best_ref_mv.as_int = 0;
+        best_ref_mv.row = 0;
+        best_ref_mv.col = 0;
 
 #if CONFIG_FP_MB_STATS
         if (cpi->use_fp_mb_stats) {
@@ -703,25 +705,25 @@
               this_error < 2 * intrapenalty)
             ++neutral_count;
 
-          mv.as_mv.row *= 8;
-          mv.as_mv.col *= 8;
+          mv.row *= 8;
+          mv.col *= 8;
           this_error = motion_error;
           xd->mi[0]->mbmi.mode = NEWMV;
-          xd->mi[0]->mbmi.mv[0] = mv;
+          xd->mi[0]->mbmi.mv[0].as_mv = mv;
           xd->mi[0]->mbmi.tx_size = TX_4X4;
           xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
           xd->mi[0]->mbmi.ref_frame[1] = NONE;
           vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
           vp9_encode_sby_pass1(x, bsize);
-          sum_mvr += mv.as_mv.row;
-          sum_mvr_abs += abs(mv.as_mv.row);
-          sum_mvc += mv.as_mv.col;
-          sum_mvc_abs += abs(mv.as_mv.col);
-          sum_mvrs += mv.as_mv.row * mv.as_mv.row;
-          sum_mvcs += mv.as_mv.col * mv.as_mv.col;
+          sum_mvr += mv.row;
+          sum_mvr_abs += abs(mv.row);
+          sum_mvc += mv.col;
+          sum_mvc_abs += abs(mv.col);
+          sum_mvrs += mv.row * mv.row;
+          sum_mvcs += mv.col * mv.col;
           ++intercount;
 
-          best_ref_mv.as_int = mv.as_int;
+          best_ref_mv = mv;
 
 #if CONFIG_FP_MB_STATS
           if (cpi->use_fp_mb_stats) {
@@ -739,7 +741,7 @@
           }
 #endif
 
-          if (mv.as_int) {
+          if (!is_zero_mv(&mv)) {
             ++mvcount;
 
 #if CONFIG_FP_MB_STATS
@@ -770,33 +772,33 @@
 #endif
 
             // Non-zero vector: was it different from the last non-zero vector?
-            if (mv.as_int != lastmv_as_int)
+            if (!is_equal_mv(&mv, &lastmv))
               ++new_mv_count;
-            lastmv_as_int = mv.as_int;
+            lastmv = mv;
 
             // Does the row vector point inwards or outwards?
             if (mb_row < cm->mb_rows / 2) {
-              if (mv.as_mv.row > 0)
+              if (mv.row > 0)
                 --sum_in_vectors;
-              else if (mv.as_mv.row < 0)
+              else if (mv.row < 0)
                 ++sum_in_vectors;
             } else if (mb_row > cm->mb_rows / 2) {
-              if (mv.as_mv.row > 0)
+              if (mv.row > 0)
                 ++sum_in_vectors;
-              else if (mv.as_mv.row < 0)
+              else if (mv.row < 0)
                 --sum_in_vectors;
             }
 
             // Does the col vector point inwards or outwards?
             if (mb_col < cm->mb_cols / 2) {
-              if (mv.as_mv.col > 0)
+              if (mv.col > 0)
                 --sum_in_vectors;
-              else if (mv.as_mv.col < 0)
+              else if (mv.col < 0)
                 ++sum_in_vectors;
             } else if (mb_col > cm->mb_cols / 2) {
-              if (mv.as_mv.col > 0)
+              if (mv.col > 0)
                 ++sum_in_vectors;
-              else if (mv.as_mv.col < 0)
+              else if (mv.col < 0)
                 --sum_in_vectors;
             }
           }
@@ -864,7 +866,7 @@
     // TODO(paulwilkins):  Handle the case when duration is set to 0, or
     // something less than the full time between subsequent values of
     // cpi->source_time_stamp.
-    fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
+    fps.duration = (double)(source->ts_end - source->ts_start);
 
     // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;
@@ -895,7 +897,7 @@
 
   vp9_extend_frame_borders(new_yv12);
 
-  if (is_spatial_svc(cpi)) {
+  if (lc != NULL) {
     vp9_update_reference_frames(cpi);
   } else {
     // Swap frame pointers so last frame refers to the frame we just compressed.
@@ -904,7 +906,7 @@
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
-  if (cm->current_video_frame == 0 && gld_yv12 != NULL) {
+  if (cm->current_video_frame == 0 && gld_yv12 != NULL && lc == NULL) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
   }
 
@@ -926,7 +928,7 @@
 
   ++cm->current_video_frame;
   if (cpi->use_svc)
-    vp9_inc_frame_in_layer(&cpi->svc);
+    vp9_inc_frame_in_layer(cpi);
 }
 
 static double calc_correction_factor(double err_per_mb,
@@ -964,7 +966,7 @@
                                          BPER_MB_NORMBITS) / num_mbs;
     int q;
     int is_svc_upper_layer = 0;
-    if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0)
+    if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
       is_svc_upper_layer = 1;
 
     // Try and pick a max Q that will be high enough to encode the
@@ -992,9 +994,9 @@
 void vp9_init_second_pass(VP9_COMP *cpi) {
   SVC *const svc = &cpi->svc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  const int is_spatial_svc = (svc->number_spatial_layers > 1) &&
-                             (svc->number_temporal_layers == 1);
-  TWO_PASS *const twopass = is_spatial_svc ?
+  const int is_two_pass_svc = (svc->number_spatial_layers > 1) ||
+                              (svc->number_temporal_layers > 1);
+  TWO_PASS *const twopass = is_two_pass_svc ?
       &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
   double frame_rate;
   FIRSTPASS_STATS *stats;
@@ -1017,7 +1019,7 @@
   // It is calculated based on the actual durations of all frames from the
   // first pass.
 
-  if (is_spatial_svc) {
+  if (is_two_pass_svc) {
     vp9_update_spatial_layer_framerate(cpi, frame_rate);
     twopass->bits_left = (int64_t)(stats->duration *
         svc->layer_context[svc->spatial_layer_id].target_bandwidth /
@@ -1032,7 +1034,7 @@
   // scores used in the second pass. We have this minimum to make sure
   // that clips that are static but "low complexity" in the intra domain
   // are still boosted appropriately for KF/GF/ARF.
-  if (!is_spatial_svc) {
+  if (!is_two_pass_svc) {
     // We don't know the number of MBs for each layer at this point.
     // So we will do it later.
     twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
@@ -1081,8 +1083,7 @@
 
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_zero_motion_factor(const VP9_COMMON *cm,
-                                     const FIRSTPASS_STATS *frame) {
+static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) {
   const double sr_ratio = frame->coded_error /
                           DOUBLE_DIVIDE_CHECK(frame->sr_coded_error);
   const double zero_motion_pct = frame->pcnt_inter -
@@ -1095,12 +1096,10 @@
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(TWO_PASS *twopass,
+static int detect_transition_to_still(const TWO_PASS *twopass,
                                       int frame_interval, int still_interval,
                                       double loop_decay_rate,
                                       double last_decay_rate) {
-  int trans_to_still = 0;
-
   // Break clause to detect very still sections after motion
   // For example a static image after a fade or other transition
   // instead of a clean scene cut.
@@ -1108,26 +1107,22 @@
       loop_decay_rate >= 0.999 &&
       last_decay_rate < 0.9) {
     int j;
-    const FIRSTPASS_STATS *position = twopass->stats_in;
-    FIRSTPASS_STATS tmp_next_frame;
 
     // Look ahead a few frames to see if static condition persists...
     for (j = 0; j < still_interval; ++j) {
-      if (EOF == input_stats(twopass, &tmp_next_frame))
+      const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
+      if (stats >= twopass->stats_in_end)
         break;
 
-      if (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion < 0.999)
+      if (stats->pcnt_inter - stats->pcnt_motion < 0.999)
         break;
     }
 
-    reset_fpf_position(twopass, position);
-
     // Only if it does do we signal a transition to still.
-    if (j == still_interval)
-      trans_to_still = 1;
+    return j == still_interval;
   }
 
-  return trans_to_still;
+  return 0;
 }
 
 // This function detects a flash through the high relative pcnt_second_ref
@@ -1373,7 +1368,8 @@
                                    double group_error, int gf_arf_bits) {
   RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
   FIRSTPASS_STATS frame_stats;
   int i;
   int frame_index = 1;
@@ -1386,6 +1382,13 @@
   int mid_boost_bits = 0;
   int mid_frame_idx;
   unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
+  int alt_frame_index = frame_index;
+  int has_temporal_layers = is_two_pass_svc(cpi) &&
+                            cpi->svc.number_temporal_layers > 1;
+
+  // Only encode the alt reference frame in the temporal base layer.
+  if (has_temporal_layers)
+    alt_frame_index = cpi->svc.number_temporal_layers;
 
   key_frame = cpi->common.frame_type == KEY_FRAME ||
               vp9_is_upper_layer_key_frame(cpi);
@@ -1396,17 +1399,17 @@
   // is also the golden frame.
   if (!key_frame) {
     if (rc->source_alt_ref_active) {
-      twopass->gf_group.update_type[0] = OVERLAY_UPDATE;
-      twopass->gf_group.rf_level[0] = INTER_NORMAL;
-      twopass->gf_group.bit_allocation[0] = 0;
-      twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
-      twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+      gf_group->update_type[0] = OVERLAY_UPDATE;
+      gf_group->rf_level[0] = INTER_NORMAL;
+      gf_group->bit_allocation[0] = 0;
+      gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+      gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
     } else {
-      twopass->gf_group.update_type[0] = GF_UPDATE;
-      twopass->gf_group.rf_level[0] = GF_ARF_STD;
-      twopass->gf_group.bit_allocation[0] = gf_arf_bits;
-      twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
-      twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+      gf_group->update_type[0] = GF_UPDATE;
+      gf_group->rf_level[0] = GF_ARF_STD;
+      gf_group->bit_allocation[0] = gf_arf_bits;
+      gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+      gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
     }
 
     // Step over the golden frame / overlay frame
@@ -1421,25 +1424,33 @@
 
   // Store the bits to spend on the ARF if there is one.
   if (rc->source_alt_ref_pending) {
-    twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
-    twopass->gf_group.bit_allocation[frame_index] = gf_arf_bits;
-    twopass->gf_group.arf_src_offset[frame_index] =
-      (unsigned char)(rc->baseline_gf_interval - 1);
-    twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
-    twopass->gf_group.arf_ref_idx[frame_index] =
+    gf_group->update_type[alt_frame_index] = ARF_UPDATE;
+    gf_group->rf_level[alt_frame_index] = GF_ARF_STD;
+    gf_group->bit_allocation[alt_frame_index] = gf_arf_bits;
+
+    if (has_temporal_layers)
+      gf_group->arf_src_offset[alt_frame_index] =
+          (unsigned char)(rc->baseline_gf_interval -
+                          cpi->svc.number_temporal_layers);
+    else
+      gf_group->arf_src_offset[alt_frame_index] =
+          (unsigned char)(rc->baseline_gf_interval - 1);
+
+    gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0];
+    gf_group->arf_ref_idx[alt_frame_index] =
       arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
                          rc->source_alt_ref_active];
-    ++frame_index;
+    if (!has_temporal_layers)
+      ++frame_index;
 
     if (cpi->multi_arf_enabled) {
       // Set aside a slot for a level 1 arf.
-      twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
-      twopass->gf_group.rf_level[frame_index] = GF_ARF_LOW;
-      twopass->gf_group.arf_src_offset[frame_index] =
+      gf_group->update_type[frame_index] = ARF_UPDATE;
+      gf_group->rf_level[frame_index] = GF_ARF_LOW;
+      gf_group->arf_src_offset[frame_index] =
         (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
-      twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[1];
-      twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+      gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1];
+      gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
       ++frame_index;
     }
   }
@@ -1453,6 +1464,10 @@
     if (EOF == input_stats(twopass, &frame_stats))
       break;
 
+    if (has_temporal_layers && frame_index == alt_frame_index) {
+      ++frame_index;
+    }
+
     modified_err = calculate_modified_err(twopass, oxcf, &frame_stats);
 
     if (group_error > 0)
@@ -1469,16 +1484,16 @@
       if (frame_index <= mid_frame_idx)
         arf_idx = 1;
     }
-    twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
-    twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
+    gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
 
     target_frame_size = clamp(target_frame_size, 0,
                               MIN(max_bits, (int)total_group_bits));
 
-    twopass->gf_group.update_type[frame_index] = LF_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+    gf_group->update_type[frame_index] = LF_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
 
-    twopass->gf_group.bit_allocation[frame_index] = target_frame_size;
+    gf_group->bit_allocation[frame_index] = target_frame_size;
     ++frame_index;
   }
 
@@ -1486,23 +1501,23 @@
   // We need to configure the frame at the end of the sequence + 1 that will be
   // the start frame for the next group. Otherwise prior to the call to
   // vp9_rc_get_second_pass_params() the data will be undefined.
-  twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
-  twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+  gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+  gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
 
   if (rc->source_alt_ref_pending) {
-    twopass->gf_group.update_type[frame_index] = OVERLAY_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
 
     // Final setup for second arf and its overlay.
     if (cpi->multi_arf_enabled) {
-      twopass->gf_group.bit_allocation[2] =
-        twopass->gf_group.bit_allocation[mid_frame_idx] + mid_boost_bits;
-      twopass->gf_group.update_type[mid_frame_idx] = OVERLAY_UPDATE;
-      twopass->gf_group.bit_allocation[mid_frame_idx] = 0;
+      gf_group->bit_allocation[2] =
+          gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
+      gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
+      gf_group->bit_allocation[mid_frame_idx] = 0;
     }
   } else {
-    twopass->gf_group.update_type[frame_index] = GF_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
   }
 
   // Note whether multi-arf was enabled this group for next time.
@@ -1554,8 +1569,6 @@
   vp9_clear_system_state();
   vp9_zero(next_frame);
 
-  gf_group_bits = 0;
-
   // Load stats for the current frame.
   mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
 
@@ -1615,9 +1628,8 @@
       decay_accumulator = decay_accumulator * loop_decay_rate;
 
       // Monitor for static sections.
-      zero_motion_accumulator =
-        MIN(zero_motion_accumulator,
-            get_zero_motion_factor(&cpi->common, &next_frame));
+      zero_motion_accumulator = MIN(zero_motion_accumulator,
+                                    get_zero_motion_factor(&next_frame));
 
       // Break clause to detect very still sections after motion. For example,
       // a static image after a fade or other transition.
@@ -1677,6 +1689,21 @@
   else
     rc->baseline_gf_interval = i;
 
+  // Only encode the alt reference frame in the temporal base layer, so
+  // baseline_gf_interval should be a multiple of the temporal layer group
+  // length (typically the frame distance between two base layer frames).
+  if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
+    int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
+    int new_gf_interval = (rc->baseline_gf_interval + count) & (~count);
+    int j;
+    for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
+      if (EOF == input_stats(twopass, this_frame))
+        break;
+      gf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+    }
+    rc->baseline_gf_interval = new_gf_interval;
+  }
+
   rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
   // Should we use the alternate reference frame.
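
// The temporal-layer block above (and the frames_to_key block further below)
// uses the standard power-of-two round-up: with count = (1 << (T - 1)) - 1
// for T temporal layers, (x + count) & ~count rounds x up to the next
// multiple of the layer-group length 2^(T - 1). A minimal sketch with a
// hypothetical name:
static int round_up_to_layer_group(int x, int temporal_layers) {
  const int count = (1 << (temporal_layers - 1)) - 1;
  return (x + count) & ~count;
}
// e.g. T = 3 gives a group length of 4, so round_up_to_layer_group(13, 3)
// returns 16, and a value already on a boundary is unchanged: (16, 3) -> 16.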
@@ -1831,6 +1858,7 @@
   int i, j;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const FIRSTPASS_STATS first_frame = *this_frame;
   const FIRSTPASS_STATS *const start_position = twopass->stats_in;
@@ -1849,7 +1877,7 @@
   cpi->common.frame_type = KEY_FRAME;
 
   // Reset the GF group data structures.
-  vp9_zero(twopass->gf_group);
+  vp9_zero(*gf_group);
 
   // Is this a forced key frame by interval.
   rc->this_key_frame_forced = rc->next_key_frame_forced;
@@ -1881,16 +1909,17 @@
     input_stats(twopass, this_frame);
 
     // Provided that we are not at the end of the file...
-    if (cpi->oxcf.auto_key &&
-        lookup_next_frame_stats(twopass, &next_frame) != EOF) {
+    if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
       double loop_decay_rate;
 
       // Check for a scene cut.
-      if (test_candidate_kf(twopass, &last_frame, this_frame, &next_frame))
+      if (test_candidate_kf(twopass, &last_frame, this_frame,
+                            twopass->stats_in))
         break;
 
       // How fast is the prediction quality decaying?
-      loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(&cpi->common,
+                                                  twopass->stats_in);
 
       // We want to know something about the recent past... rather than
       // as used elsewhere where we are concerned with decay in prediction
@@ -1947,6 +1976,18 @@
     rc->next_key_frame_forced = 0;
   }
 
+  if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
+    int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
+    int new_frame_to_key = (rc->frames_to_key + count) & (~count);
+    int j;
+    for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
+      if (EOF == input_stats(twopass, this_frame))
+        break;
+      kf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+    }
+    rc->frames_to_key = new_frame_to_key;
+  }
+
   // Special case for the last key frame of the file.
   if (twopass->stats_in >= twopass->stats_in_end) {
     // Accumulate kf group error.
@@ -1987,9 +2028,8 @@
       break;
 
     // Monitor for static sections.
-    zero_motion_accumulator =
-      MIN(zero_motion_accumulator,
-          get_zero_motion_factor(&cpi->common, &next_frame));
+    zero_motion_accumulator = MIN(zero_motion_accumulator,
+                                  get_zero_motion_factor(&next_frame));
 
     // For the first few frames collect data to decide kf boost.
     if (i <= (rc->max_gf_interval * 2)) {
@@ -2040,9 +2080,9 @@
   twopass->kf_group_bits -= kf_bits;
 
   // Save the bits to spend on the key frame.
-  twopass->gf_group.bit_allocation[0] = kf_bits;
-  twopass->gf_group.update_type[0] = KF_UPDATE;
-  twopass->gf_group.rf_level[0] = KF_STD;
+  gf_group->bit_allocation[0] = kf_bits;
+  gf_group->update_type[0] = KF_UPDATE;
+  gf_group->rf_level[0] = KF_STD;
 
   // Note the total error score of the kf group minus the key frame itself.
   twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
@@ -2106,7 +2146,7 @@
       assert(0);
       break;
   }
-  if (is_spatial_svc(cpi)) {
+  if (is_two_pass_svc(cpi)) {
     if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0)
       cpi->refresh_golden_frame = 0;
     if (cpi->alt_ref_source == NULL)
@@ -2119,15 +2159,16 @@
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
   int frames_left;
   FIRSTPASS_STATS this_frame;
   FIRSTPASS_STATS this_frame_copy;
 
   int target_rate;
-  LAYER_CONTEXT *lc = NULL;
+  LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
+        &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
 
-  if (is_spatial_svc(cpi)) {
-    lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+  if (lc != NULL) {
     frames_left = (int)(twopass->total_stats.count -
                   lc->current_video_frame_in_layer);
   } else {
@@ -2140,10 +2181,10 @@
 
   // If this is an arf frame then we dont want to read the stats file or
   // advance the input pointer as we already have what we need.
-  if (twopass->gf_group.update_type[twopass->gf_group.index] == ARF_UPDATE) {
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
     int target_rate;
     configure_buffer_updates(cpi);
-    target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
+    target_rate = gf_group->bit_allocation[gf_group->index];
     target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
     rc->base_frame_target = target_rate;
 
@@ -2154,7 +2195,7 @@
     vp9_rc_set_frame_target(cpi, target_rate);
     cm->frame_type = INTER_FRAME;
 
-    if (is_spatial_svc(cpi)) {
+    if (lc != NULL) {
       if (cpi->svc.spatial_layer_id == 0) {
         lc->is_key_frame = 0;
       } else {
@@ -2170,7 +2211,7 @@
 
   vp9_clear_system_state();
 
-  if (is_spatial_svc(cpi) && twopass->kf_intra_err_min == 0) {
+  if (lc != NULL && twopass->kf_intra_err_min == 0) {
     twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
     twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
   }
@@ -2178,8 +2219,7 @@
   if (cpi->oxcf.rc_mode == VPX_Q) {
     twopass->active_worst_quality = cpi->oxcf.cq_level;
   } else if (cm->current_video_frame == 0 ||
-             (is_spatial_svc(cpi) &&
-              lc->current_video_frame_in_layer == 0)) {
+             (lc != NULL && lc->current_video_frame_in_layer == 0)) {
     // Special case code for first frame.
     const int section_target_bandwidth = (int)(twopass->bits_left /
                                                frames_left);
@@ -2205,18 +2245,21 @@
     cm->frame_type = INTER_FRAME;
   }
 
-  if (is_spatial_svc(cpi)) {
+  if (lc != NULL) {
     if (cpi->svc.spatial_layer_id == 0) {
       lc->is_key_frame = (cm->frame_type == KEY_FRAME);
-      if (lc->is_key_frame)
+      if (lc->is_key_frame) {
         cpi->ref_frame_flags &=
             (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+        lc->frames_from_key_frame = 0;
+      }
     } else {
       cm->frame_type = INTER_FRAME;
       lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
 
       if (lc->is_key_frame) {
         cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
+        lc->frames_from_key_frame = 0;
       }
     }
   }
@@ -2236,13 +2279,13 @@
     }
 
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    if (!is_spatial_svc(cpi))
+    if (lc != NULL)
       cpi->refresh_golden_frame = 1;
   }
 
   configure_buffer_updates(cpi);
 
-  target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
+  target_rate = gf_group->bit_allocation[gf_group->index];
   if (cpi->common.frame_type == KEY_FRAME)
     target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
   else
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index bf8c9fd..aaa6b03 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -121,7 +121,7 @@
 
 void vp9_init_first_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
-void vp9_first_pass(struct VP9_COMP *cpi);
+void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
 void vp9_end_first_pass(struct VP9_COMP *cpi);
 
 void vp9_init_second_pass(struct VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index e743517..823e7a1 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -50,6 +50,9 @@
                                          unsigned int height,
                                          unsigned int subsampling_x,
                                          unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                         int use_highbitdepth,
+#endif
                                          unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
@@ -70,6 +73,9 @@
     for (i = 0; i < depth; i++)
       if (vp9_alloc_frame_buffer(&ctx->buf[i].img,
                                  width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 use_highbitdepth,
+#endif
                                  VP9_ENC_BORDER_IN_PIXELS))
         goto bail;
   }
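
// The signature change above follows the conditional-parameter idiom: the
// extra argument exists only in high-bit-depth builds, so the declaration,
// the definition, and every call site carry the same #if guard. A reduced,
// self-contained sketch (illustrative macro and names):
#define EXAMPLE_HIGHBITDEPTH 1

static int example_alloc(unsigned int width, unsigned int height,
#if EXAMPLE_HIGHBITDEPTH
                         int use_highbitdepth,
#endif
                         unsigned int depth) {
#if EXAMPLE_HIGHBITDEPTH
  (void)use_highbitdepth;  // would select 8- vs 16-bit sample storage here
#endif
  return (width && height && depth) ? 0 : -1;  // 0 on success, as a stand-in
}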
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 678c51a..2786193 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -56,6 +56,9 @@
                                          unsigned int height,
                                          unsigned int subsampling_x,
                                          unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                         int use_highbitdepth,
+#endif
                                          unsigned int depth);
 
 
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 6e04e2a..b8e7164 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -34,6 +34,7 @@
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
   MV ref_full;
+  int sad_list[5];
 
   // Further step/diamond searches as necessary
   int step_param = mv_sf->reduce_first_step_size;
@@ -45,8 +46,9 @@
   ref_full.row = ref_mv->row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0,
-                 ref_mv, dst_mv);
+  vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
+                 cond_sad_list(cpi, sad_list),
+                 &v_fn_ptr, 0, ref_mv, dst_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -55,8 +57,10 @@
     unsigned int sse;
     cpi->find_fractional_mv_step(
         x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
-        &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, NULL, NULL, &distortion,
-        &sse, NULL, 0, 0);
+        &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
+        cond_sad_list(cpi, sad_list),
+        NULL, NULL,
+        &distortion, &sse, NULL, 0, 0);
   }
 
   xd->mi[0]->mbmi.mode = NEWMV;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index ae924d5..d6f6b25 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -256,6 +256,137 @@
     }                                                   \
   }
 
+#define SETUP_SUBPEL_SEARCH                                                \
+  const uint8_t *const z = x->plane[0].src.buf;                            \
+  const int src_stride = x->plane[0].src.stride;                           \
+  const MACROBLOCKD *xd = &x->e_mbd;                                       \
+  unsigned int besterr = INT_MAX;                                          \
+  unsigned int sse;                                                        \
+  unsigned int whichdir;                                                   \
+  int thismse;                                                             \
+  const unsigned int halfiters = iters_per_step;                           \
+  const unsigned int quarteriters = iters_per_step;                        \
+  const unsigned int eighthiters = iters_per_step;                         \
+  const int y_stride = xd->plane[0].pre[0].stride;                         \
+  const int offset = bestmv->row * y_stride + bestmv->col;                 \
+  const uint8_t *const y = xd->plane[0].pre[0].buf;                        \
+                                                                           \
+  int rr = ref_mv->row;                                                    \
+  int rc = ref_mv->col;                                                    \
+  int br = bestmv->row * 8;                                                \
+  int bc = bestmv->col * 8;                                                \
+  int hstep = 4;                                                           \
+  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);           \
+  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);           \
+  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);           \
+  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);           \
+  int tr = br;                                                             \
+  int tc = bc;                                                             \
+                                                                           \
+  bestmv->row *= 8;                                                        \
+  bestmv->col *= 8;                                                        \
+  if (second_pred != NULL) {                                               \
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                \
+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
+    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                  \
+  } else {                                                                 \
+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);          \
+  }                                                                        \
+  *distortion = besterr;                                                   \
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+                                        MV *bestmv, const MV *ref_mv,
+                                        int allow_hp,
+                                        int error_per_bit,
+                                        const vp9_variance_fn_ptr_t *vfp,
+                                        int forced_stop,
+                                        int iters_per_step,
+                                        int *sad_list,
+                                        int *mvjcost, int *mvcost[2],
+                                        int *distortion,
+                                        unsigned int *sse1,
+                                        const uint8_t *second_pred,
+                                        int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+
+  if (sad_list &&
+      sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
+      sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
+      sad_list[4] != INT_MAX) {
+    unsigned int left, right, up, down, diag;
+    whichdir = (sad_list[1] < sad_list[3] ? 0 : 1) +
+               (sad_list[2] < sad_list[4] ? 0 : 2);
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(left, tr, tc - hstep);
+        CHECK_BETTER(up, tr - hstep, tc);
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(right, tr, tc + hstep);
+        CHECK_BETTER(up, tr - hstep, tc);
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(left, tr, tc - hstep);
+        CHECK_BETTER(down, tr + hstep, tc);
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(right, tr, tc + hstep);
+        CHECK_BETTER(down, tr + hstep, tc);
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  tr = br;
+  tc = bc;
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration, and may check two (if diag was selected). Next: 1/4 pel.
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+  // These lines ensure that static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
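// A sketch of the quadrant pruning implemented above (illustrative name):
// the five integer-pel SADs arrive as {center, left, top, right, bottom},
// and a 2-bit index picks the cheaper horizontal and the cheaper vertical
// neighbor, so only one diagonal quadrant is probed instead of all four.
static int pick_quadrant(const int sad[5]) {
  // bit 0: 1 if right beats left; bit 1: 1 if bottom beats top.
  return (sad[1] < sad[3] ? 0 : 1) + (sad[2] < sad[4] ? 0 : 2);
}
// 0 = up-left, 1 = up-right, 2 = down-left, 3 = down-right, matching the
// switch (whichdir) cases in vp9_find_best_sub_pixel_tree_pruned().
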
 int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  MV *bestmv, const MV *ref_mv,
                                  int allow_hp,
@@ -263,55 +394,14 @@
                                  const vp9_variance_fn_ptr_t *vfp,
                                  int forced_stop,
                                  int iters_per_step,
+                                 int *sad_list,
                                  int *mvjcost, int *mvcost[2],
                                  int *distortion,
                                  unsigned int *sse1,
                                  const uint8_t *second_pred,
                                  int w, int h) {
-  const uint8_t *const z = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int whichdir;
-  int thismse;
-  const unsigned int halfiters = iters_per_step;
-  const unsigned int quarteriters = iters_per_step;
-  const unsigned int eighthiters = iters_per_step;
-
-  const int y_stride = xd->plane[0].pre[0].stride;
-  const int offset = bestmv->row * y_stride + bestmv->col;
-  const uint8_t *const y = xd->plane[0].pre[0].buf;
-
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
-  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
-  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
-  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
-
-  int tr = br;
-  int tc = bc;
-
-  // central mv
-  bestmv->row *= 8;
-  bestmv->col *= 8;
-
-  // calculate central point error
-  // TODO(yunqingwang): central pointer error was already calculated in full-
-  // pixel search, and can be passed in this function.
-  if (second_pred != NULL) {
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
-  } else {
-    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
-  }
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  SETUP_SUBPEL_SEARCH;
+  (void) sad_list;  // to silence compiler warning
 
   // Each subsequent iteration checks at least one point in
   // common with the last iteration could be 2 ( if diag selected)
@@ -398,14 +488,17 @@
 // Each scale can have a different number of candidates and shape of
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
+//
 static int vp9_pattern_search(const MACROBLOCK *x,
                               MV *ref_mv,
                               int search_param,
                               int sad_per_bit,
-                              int do_init_search, int do_refine,
+                              int do_init_search,
+                              int *sad_list,
                               const vp9_variance_fn_ptr_t *vfp,
                               int use_mvcost,
-                              const MV *center_mv, MV *best_mv,
+                              const MV *center_mv,
+                              MV *best_mv,
                               const int num_candidates[MAX_PATTERN_SCALES],
                               const MV candidates[MAX_PATTERN_SCALES]
                                                  [MAX_PATTERN_CANDIDATES]) {
@@ -413,7 +506,7 @@
   static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   };
-  int i, j, s, t;
+  int i, s, t;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   int br, bc;
@@ -552,47 +645,38 @@
     } while (s--);
   }
 
-  // Check 4 1-away neighbors if do_refine is true.
-  // For most well-designed schemes do_refine will not be necessary.
-  if (do_refine) {
-    static const MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
-
-    for (j = 0; j < 16; j++) {
-      int best_site = -1;
-      if (check_bounds(x, br, bc, 1)) {
-        for (i = 0; i < 4; i++) {
-          const MV this_mv = {br + neighbors[i].row,
-                              bc + neighbors[i].col};
-          thissad = vfp->sdf(what->buf, what->stride,
-                             get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride);
-          CHECK_BETTER
-        }
-      } else {
-        for (i = 0; i < 4; i++) {
-          const MV this_mv = {br + neighbors[i].row,
-                              bc + neighbors[i].col};
-          if (!is_mv_in(x, &this_mv))
-            continue;
-          thissad = vfp->sdf(what->buf, what->stride,
-                             get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride);
-          CHECK_BETTER
-        }
+  // Returns the one-away integer pel sad values around the best as follows:
+  // sad_list[0]: sad at the best integer pel
+  // sad_list[1]: sad at delta {0, -1} (left)   from the best integer pel
+  // sad_list[2]: sad at delta {-1, 0} (top)    from the best integer pel
+  // sad_list[3]: sad at delta { 0, 1} (right)  from the best integer pel
+  // sad_list[4]: sad at delta { 1, 0} (bottom) from the best integer pel
+  if (sad_list) {
+    static const MV neighbors[4] = {{0, -1}, {-1, 0}, {0, 1}, {1, 0}};
+    sad_list[0] = bestsad;
+    if (check_bounds(x, br, bc, 1)) {
+      for (i = 0; i < 4; i++) {
+        const MV this_mv = {br + neighbors[i].row,
+                            bc + neighbors[i].col};
+        sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+                                   get_buf_from_mv(in_what, &this_mv),
+                                   in_what->stride);
       }
-
-      if (best_site == -1) {
-        break;
-      } else {
-        br += neighbors[best_site].row;
-        bc += neighbors[best_site].col;
+    } else {
+      for (i = 0; i < 4; i++) {
+        const MV this_mv = {br + neighbors[i].row,
+                            bc + neighbors[i].col};
+        if (!is_mv_in(x, &this_mv))
+          sad_list[i + 1] = INT_MAX;
+        else
+          sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+                                     get_buf_from_mv(in_what, &this_mv),
+                                     in_what->stride);
       }
     }
   }
-
   best_mv->row = br;
   best_mv->col = bc;
-
   return bestsad;
 }
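
// A consumer-side sketch of the sad_list contract established above
// (hypothetical helper): entries are {center, left, top, right, bottom}, and
// INT_MAX marks a neighbor that was out of bounds or never filled, which
// makes the pruned sub-pixel search fall back to its unpruned checks.
#include <limits.h>

static int sad_list_usable(const int sad_list[5]) {
  int i;
  for (i = 0; i < 5; ++i)
    if (sad_list[i] == INT_MAX)
      return 0;  // at least one entry invalid: do not prune
  return 1;
}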
 
@@ -634,6 +718,7 @@
                    int search_param,
                    int sad_per_bit,
                    int do_init_search,
+                   int *sad_list,
                    const vp9_variance_fn_ptr_t *vfp,
                    int use_mvcost,
                    const MV *center_mv, MV *best_mv) {
@@ -658,7 +743,7 @@
       { -1024, 0}},
   };
   return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, 0, vfp, use_mvcost,
+                            do_init_search, sad_list, vfp, use_mvcost,
                             center_mv, best_mv,
                             hex_num_candidates, hex_candidates);
 }
@@ -668,6 +753,7 @@
                       int search_param,
                       int sad_per_bit,
                       int do_init_search,
+                      int *sad_list,
                       const vp9_variance_fn_ptr_t *vfp,
                       int use_mvcost,
                       const MV *center_mv,
@@ -699,7 +785,7 @@
       {-512, 512}, {-1024, 0}},
   };
   return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, 0, vfp, use_mvcost,
+                            do_init_search, sad_list, vfp, use_mvcost,
                             center_mv, best_mv,
                             bigdia_num_candidates, bigdia_candidates);
 }
@@ -709,6 +795,7 @@
                       int search_param,
                       int sad_per_bit,
                       int do_init_search,
+                      int *sad_list,
                       const vp9_variance_fn_ptr_t *vfp,
                       int use_mvcost,
                       const MV *center_mv,
@@ -740,7 +827,7 @@
       {0, 1024}, {-1024, 1024}, {-1024, 0}},
   };
   return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, 0, vfp, use_mvcost,
+                            do_init_search, sad_list, vfp, use_mvcost,
                             center_mv, best_mv,
                             square_num_candidates, square_candidates);
 }
@@ -750,12 +837,13 @@
                         int search_param,
                         int sad_per_bit,
                         int do_init_search,  // must be zero for fast_hex
+                        int *sad_list,
                         const vp9_variance_fn_ptr_t *vfp,
                         int use_mvcost,
                         const MV *center_mv,
                         MV *best_mv) {
   return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                        sad_per_bit, do_init_search, vfp, use_mvcost,
+                        sad_per_bit, do_init_search, sad_list, vfp, use_mvcost,
                         center_mv, best_mv);
 }
 
@@ -764,13 +852,14 @@
                         int search_param,
                         int sad_per_bit,
                         int do_init_search,
+                        int *sad_list,
                         const vp9_variance_fn_ptr_t *vfp,
                         int use_mvcost,
                         const MV *center_mv,
                         MV *best_mv) {
   return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
-                           sad_per_bit, do_init_search, vfp, use_mvcost,
-                           center_mv, best_mv);
+                           sad_per_bit, do_init_search, sad_list, vfp,
+                           use_mvcost, center_mv, best_mv);
 }
 
 #undef CHECK_BETTER
@@ -1368,33 +1457,41 @@
 int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
+                          int *sad_list,
                           const MV *ref_mv, MV *tmp_mv,
                           int var_max, int rd) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const SEARCH_METHODS method = sf->mv.search_method;
   vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
+  if (sad_list) {
+    sad_list[0] = INT_MAX;
+    sad_list[1] = INT_MAX;
+    sad_list[2] = INT_MAX;
+    sad_list[3] = INT_MAX;
+    sad_list[4] = INT_MAX;
+  }
 
   switch (method) {
     case FAST_DIAMOND:
       var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
-                                fn_ptr, 1, ref_mv, tmp_mv);
+                                sad_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case FAST_HEX:
       var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
-                                fn_ptr, 1, ref_mv, tmp_mv);
+                                sad_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case HEX:
       var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
-                           fn_ptr, 1, ref_mv, tmp_mv);
+                           sad_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case SQUARE:
       var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
-                              fn_ptr, 1, ref_mv, tmp_mv);
+                              sad_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case BIGDIA:
       var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
-                              fn_ptr, 1, ref_mv, tmp_mv);
+                              sad_list, fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case NSTEP:
       var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 298fbb6..9b4734a 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -79,6 +79,7 @@
     int search_param,
     int error_per_bit,
     int do_init_search,
+    int *sad_list,
     const vp9_variance_fn_ptr_t *vf,
     int use_mvcost,
     const MV *center_mv,
@@ -98,12 +99,14 @@
     const vp9_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
     int iters_per_step,
+    int *sad_list,
     int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1,
     const uint8_t *second_pred,
     int w, int h);
 
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
 
 typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     const MV *ref_mv, int sad_per_bit,
@@ -136,8 +139,10 @@
 int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
+                          int *sad_list,
                           const MV *ref_mv, MV *tmp_mv,
                           int var_max, int rd);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index d365489..5557d7f 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -77,7 +77,6 @@
   while (filter_step > 0) {
     const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
     const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
-    int filt_err;
 
     // Bias against raising loop filter in favor of lowering it.
     int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
@@ -92,17 +91,14 @@
     if (filt_direction <= 0 && filt_low != filt_mid) {
       // Get Low filter error score
       if (ss_err[filt_low] < 0) {
-        filt_err = try_filter_frame(sd, cpi, filt_low, partial_frame);
-        ss_err[filt_low] = filt_err;
-      } else {
-        filt_err = ss_err[filt_low];
+        ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
       }
       // If value is close to the best so far then bias towards a lower loop
       // filter value.
-      if ((filt_err - bias) < best_err) {
+      if ((ss_err[filt_low] - bias) < best_err) {
         // Was it actually better than the previous best?
-        if (filt_err < best_err)
-          best_err = filt_err;
+        if (ss_err[filt_low] < best_err)
+          best_err = ss_err[filt_low];
 
         filt_best = filt_low;
       }
@@ -111,14 +107,11 @@
     // Now look at filt_high
     if (filt_direction >= 0 && filt_high != filt_mid) {
       if (ss_err[filt_high] < 0) {
-        filt_err = try_filter_frame(sd, cpi, filt_high, partial_frame);
-        ss_err[filt_high] = filt_err;
-      } else {
-        filt_err = ss_err[filt_high];
+        ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
       }
       // Was it better than the previous best?
-      if (filt_err < (best_err - bias)) {
-        best_err = filt_err;
+      if (ss_err[filt_high] < (best_err - bias)) {
+        best_err = ss_err[filt_high];
         filt_best = filt_high;
       }
     }
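
// The change above folds the filt_err temporary into the ss_err[] table so
// each filter level is evaluated at most once while the search bisects. The
// memoization shape, reduced to a sketch with a hypothetical eval callback:
static int cached_err(int level, int *cache /* entries initialized to -1 */,
                      int (*eval)(int)) {
  if (cache[level] < 0)
    cache[level] = eval(level);  // run the expensive trial only on a miss
  return cache[level];
}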
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 6115f5a..a97d778 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -17,6 +17,7 @@
 
 #include "vpx_mem/vpx_mem.h"
 
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_reconinter.h"
@@ -27,11 +28,17 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
 
+typedef struct {
+  uint8_t *data;
+  int stride;
+  int in_use;
+} PRED_BUFFER;
+
 static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                       const TileInfo *const tile,
-                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                       int_mv *mv_ref_list,
-                       int mi_row, int mi_col) {
+                      const TileInfo *const tile,
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list,
+                      int mi_row, int mi_col) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
 
@@ -125,6 +132,7 @@
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
   int rv = 0;
+  int sad_list[5];
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
   if (cpi->common.show_frame &&
@@ -151,8 +159,9 @@
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv,
-                        &tmp_mv->as_mv, INT_MAX, 0);
+  vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                        cond_sad_list(cpi, sad_list),
+                        &ref_mv, &tmp_mv->as_mv, INT_MAX, 0);
 
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
@@ -178,6 +187,7 @@
                                  &cpi->fn_ptr[bsize],
                                  cpi->sf.mv.subpel_force_stop,
                                  cpi->sf.mv.subpel_iters_per_step,
+                                 cond_sad_list(cpi, sad_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
     x->pred_mv[ref] = tmp_mv->as_mv;
@@ -220,13 +230,15 @@
 
   if (cpi->common.tx_mode == TX_MODE_SELECT) {
     if (sse > (var << 2))
-      xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
-                          tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+      xd->mi[0]->mbmi.tx_size =
+          MIN(max_txsize_lookup[bsize],
+              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
     else
       xd->mi[0]->mbmi.tx_size = TX_8X8;
   } else {
-    xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
-                         tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    xd->mi[0]->mbmi.tx_size =
+        MIN(max_txsize_lookup[bsize],
+            tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   }
 
   vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
@@ -253,7 +265,8 @@
 }
 
 static void free_pred_buffer(PRED_BUFFER *p) {
-  p->in_use = 0;
+  if (p != NULL)
+    p->in_use = 0;
 }
 
 static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
@@ -326,7 +339,7 @@
 
         // The cost of skip bit needs to be added.
         *rate = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                     [INTER_OFFSET(this_mode)];
+                                    [INTER_OFFSET(this_mode)];
 
         // More on this part of rate
         // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
@@ -343,7 +356,53 @@
   }
 }
 
-static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
+struct estimate_block_intra_args {
+  VP9_COMP *cpi;
+  MACROBLOCK *x;
+  PREDICTION_MODE mode;
+  int rate;
+  int64_t dist;
+};
+
+static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                                 TX_SIZE tx_size, void *arg) {
+  struct estimate_block_intra_args *const args = arg;
+  VP9_COMP *const cpi = args->cpi;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+  uint8_t *const src_buf_base = p->src.buf;
+  uint8_t *const dst_buf_base = pd->dst.buf;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  int i, j;
+  int rate;
+  int64_t dist;
+  unsigned int var_y, sse_y;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  assert(plane == 0);
+  (void) plane;
+
+  p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
+  pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
+  // Use source buffer as an approximation for the fully reconstructed buffer.
+  vp9_predict_intra_block(xd, block >> (2 * tx_size),
+                          b_width_log2(plane_bsize),
+                          tx_size, args->mode,
+                          p->src.buf, src_stride,
+                          pd->dst.buf, dst_stride,
+                          i, j, 0);
+  // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
+  model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+  args->rate += rate;
+  args->dist += dist;
+}
+
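// The rewrite above replaces an open-coded walk over 4x4 sub-blocks with the
// encoder's per-transform-block visitor, threading the rate/distortion
// accumulators through a small context struct. The callback shape, reduced
// to a sketch with illustrative names:
typedef struct {
  int rate;        // summed over every visited block
  long long dist;  // likewise for distortion
} intra_est_state;

typedef void (*block_visitor_fn)(int block, void *arg);

static void for_each_block(int num_blocks, block_visitor_fn visit, void *arg) {
  int b;
  for (b = 0; b < num_blocks; ++b)
    visit(b, arg);  // each call adds its block's cost into the shared state
}
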
+static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = {
   {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
   {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
   {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
@@ -351,21 +410,21 @@
 
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
-int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            const TileInfo *const tile,
-                            int mi_row, int mi_col,
-                            int *returnrate,
-                            int64_t *returndistortion,
-                            BLOCK_SIZE bsize,
-                            PICK_MODE_CONTEXT *ctx) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct macroblock_plane *const p = &x->plane[0];
+void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                         const TileInfo *const tile,
+                         int mi_row, int mi_col,
+                         int *returnrate,
+                         int64_t *returndistortion,
+                         BLOCK_SIZE bsize,
+                         PICK_MODE_CONTEXT *ctx) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   struct macroblockd_plane *const pd = &xd->plane[0];
-  PREDICTION_MODE this_mode, best_mode = ZEROMV;
+  PREDICTION_MODE best_mode = ZEROMV;
   MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
   TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize],
-                             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+                             tx_mode_to_biggest_tx_size[cm->tx_mode]);
   INTERP_FILTER best_pred_filter = EIGHTTAP;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
@@ -373,33 +432,30 @@
                                     VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
   int64_t this_rd = INT64_MAX;
-  int skip_txfm = 0;
+  uint8_t skip_txfm = 0;
   int rate = INT_MAX;
   int64_t dist = INT64_MAX;
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
   unsigned int sse_y = UINT_MAX;
 
-  VP9_COMMON *cm = &cpi->common;
-  int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
-
+  const int intra_cost_penalty =
+      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
   const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                            intra_cost_penalty, 0);
-  const int64_t intra_mode_cost = 50;
+  const int intra_mode_cost = 50;
 
-  unsigned char segment_id = mbmi->segment_id;
+  const int8_t segment_id = mbmi->segment_id;
   const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize];
-  // Mode index conversion form THR_MODES to PREDICTION_MODE for a ref frame.
   INTERP_FILTER filter_ref = cm->interp_filter;
-  int bsl = mi_width_log2(bsize);
+  const int bsl = mi_width_log2(bsize);
   const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
       (((mi_row + mi_col) >> bsl) +
        get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
   int const_motion[MAX_REF_FRAMES] = { 0 };
-  int bh = num_4x4_blocks_high_lookup[bsize] << 2;
-  int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
-  int pixels_in_block = bh * bw;
+  const int bh = num_4x4_blocks_high_lookup[bsize] << 2;
+  const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
   // For speed 6, the result of interp filter is reused later in actual encoding
   // process.
   // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
@@ -408,15 +464,11 @@
   struct buf_2d orig_dst = pd->dst;
   PRED_BUFFER *best_pred = NULL;
   PRED_BUFFER *this_mode_pred = NULL;
-  int i;
 
-  // CTX is used by the temporal denoiser which is currently being developed.
-  // TODO(jbb): when temporal denoiser is finished and in the default build
-  // remove the following line;
-  (void) ctx;
   if (cpi->sf.reuse_inter_pred_sby) {
+    int i;
     for (i = 0; i < 3; i++) {
-      tmp[i].data = &pred_buf[pixels_in_block * i];
+      tmp[i].data = &pred_buf[bw * bh * i];
       tmp[i].stride = bw;
       tmp[i].in_use = 0;
     }
@@ -442,6 +494,7 @@
   mbmi->segment_id = segment_id;
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    PREDICTION_MODE this_mode;
     x->pred_mv_sad[ref_frame] = INT_MAX;
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -496,8 +549,9 @@
       if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
         continue;
 
-      mode_rd_thresh =  rd_threshes[mode_idx[ref_frame - LAST_FRAME]
-                                            [this_mode - NEARESTMV]];
+      mode_rd_thresh =
+          rd_threshes[mode_idx[ref_frame -
+                               LAST_FRAME][INTER_OFFSET(this_mode)]];
       if (rd_less_than_thresh(best_rd, mode_rd_thresh,
                               rd_thresh_freq_fact[this_mode]))
         continue;
@@ -514,7 +568,7 @@
       if (this_mode != NEARESTMV &&
           frame_mv[this_mode][ref_frame].as_int ==
               frame_mv[NEARESTMV][ref_frame].as_int)
-          continue;
+        continue;
 
       mbmi->mode = this_mode;
       mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
@@ -593,7 +647,7 @@
 
       rate += rate_mv;
       rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                [INTER_OFFSET(this_mode)];
+                                  [INTER_OFFSET(this_mode)];
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
 
       // Skipping checking: test to see if this block can be reconstructed by
@@ -609,9 +663,10 @@
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
       if (cpi->oxcf.noise_sensitivity > 0) {
-        vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y,
-                                        this_mode, ctx);
+        vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
       }
+#else
+      (void)ctx;
 #endif
 
       if (this_rd < best_rd || x->skip) {
@@ -625,8 +680,7 @@
         skip_txfm = x->skip_txfm[0];
 
         if (cpi->sf.reuse_inter_pred_sby) {
-          if (best_pred != NULL)
-            free_pred_buffer(best_pred);
+          free_pred_buffer(best_pred);
 
           best_pred = this_mode_pred;
         }
@@ -646,16 +700,11 @@
 
   // If best prediction is not in dst buf, then copy the prediction block from
   // temp buf to dst buf.
-  if (cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) {
-    uint8_t *copy_from, *copy_to;
-
+  if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby &&
+      best_pred->data != orig_dst.buf) {
     pd->dst = orig_dst;
-    copy_to = pd->dst.buf;
-
-    copy_from = best_pred->data;
-
-    vp9_convolve_copy(copy_from, bw, copy_to, pd->dst.stride, NULL, 0, NULL, 0,
-                      bw, bh);
+    vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, NULL, 0,
+                      NULL, 0, bw, bh);
   }
 
   mbmi->mode          = best_mode;
@@ -670,20 +719,11 @@
   // threshold.
   if (!x->skip && best_rd > inter_mode_thresh &&
       bsize <= cpi->sf.max_intra_bsize) {
-    int i, j;
-    const int width  = num_4x4_blocks_wide_lookup[bsize];
-    const int height = num_4x4_blocks_high_lookup[bsize];
-
-    int rate2 = 0;
-    int64_t dist2 = 0;
-    const int dst_stride = cpi->sf.reuse_inter_pred_sby ? bw : pd->dst.stride;
-    const int src_stride = p->src.stride;
-    int block_idx = 0;
-
-    TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize],
-                              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-    const BLOCK_SIZE bsize_tx = txsize_to_bsize[tmp_tx_size];
-    const int step = 1 << tmp_tx_size;
+    PREDICTION_MODE this_mode;
+    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
+    const TX_SIZE intra_tx_size =
+        MIN(max_txsize_lookup[bsize],
+            tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
 
     if (cpi->sf.reuse_inter_pred_sby) {
       pd->dst.buf = tmp[0].data;
@@ -691,44 +731,26 @@
     }
 
     for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
-      uint8_t *const src_buf_base = p->src.buf;
-      uint8_t *const dst_buf_base = pd->dst.buf;
-      for (j = 0; j < height; j += step) {
-        for (i = 0; i < width; i += step) {
-          p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
-          pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
-          // Use source buffer as an approximation for the fully reconstructed
-          // buffer
-          vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize),
-                                  tmp_tx_size, this_mode,
-                                  p->src.buf, src_stride,
-                                  pd->dst.buf, dst_stride,
-                                  i, j, 0);
-          model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
-          rate2 += rate;
-          dist2 += dist;
-          ++block_idx;
-        }
-      }
-      p->src.buf = src_buf_base;
-      pd->dst.buf = dst_buf_base;
-
-      rate = rate2;
-      dist = dist2;
-
+      const TX_SIZE saved_tx_size = mbmi->tx_size;
+      args.mode = this_mode;
+      args.rate = 0;
+      args.dist = 0;
+      mbmi->tx_size = intra_tx_size;
+      vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
+                                             estimate_block_intra, &args);
+      mbmi->tx_size = saved_tx_size;
+      rate = args.rate;
+      dist = args.dist;
       rate += cpi->mbmode_cost[this_mode];
       rate += intra_cost_penalty;
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
 
-      if (cpi->sf.reuse_inter_pred_sby)
-        pd->dst = orig_dst;
-
       if (this_rd + intra_mode_cost < best_rd) {
         best_rd = this_rd;
         *returnrate = rate;
         *returndistortion = dist;
         mbmi->mode = this_mode;
-        mbmi->tx_size = tmp_tx_size;
+        mbmi->tx_size = intra_tx_size;
         mbmi->ref_frame[0] = INTRA_FRAME;
         mbmi->uv_mode = this_mode;
         mbmi->mv[0].as_int = INVALID_MV;
@@ -736,7 +758,7 @@
         x->skip_txfm[0] = skip_txfm;
       }
     }
+    if (cpi->sf.reuse_inter_pred_sby)
+      pd->dst = orig_dst;
   }
-
-  return INT64_MAX;
 }
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index 49c6feb..97aeca7 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -17,19 +17,13 @@
 extern "C" {
 #endif
 
-typedef struct {
-  uint8_t *data;
-  int stride;
-  int in_use;
-} PRED_BUFFER;
-
-int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            const struct TileInfo *const tile,
-                            int mi_row, int mi_col,
-                            int *returnrate,
-                            int64_t *returndistortion,
-                            BLOCK_SIZE bsize,
-                            PICK_MODE_CONTEXT *ctx);
+void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                         const struct TileInfo *const tile,
+                         int mi_row, int mi_col,
+                         int *returnrate,
+                         int64_t *returndistortion,
+                         BLOCK_SIZE bsize,
+                         PICK_MODE_CONTEXT *ctx);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index eababdb..d49eb95 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -19,9 +19,9 @@
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 
-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
                      const int16_t *round_ptr, const int16_t quant,
-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                      const int16_t dequant_ptr, uint16_t *eob_ptr) {
   const int rc = 0;
   const int coeff = coeff_ptr[rc];
@@ -40,9 +40,9 @@
   *eob_ptr = eob + 1;
 }
 
-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                            const int16_t *round_ptr, const int16_t quant,
-                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
   const int rc = 0;
   const int coeff = coeff_ptr[rc];
@@ -62,11 +62,11 @@
   *eob_ptr = eob + 1;
 }
 
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
                        const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                        const int16_t *dequant_ptr,
                        int zbin_oq_value, uint16_t *eob_ptr,
                        const int16_t *scan, const int16_t *iscan) {
@@ -78,13 +78,13 @@
   (void)zbin_oq_value;
   (void)iscan;
 
-  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
     // Quantization pass: quantize and dequantize each coefficient in scan
     // order, tracking the index of the last nonzero coefficient (eob).
-    for (i = 0; i < count; i++) {
+    for (i = 0; i < n_coeffs; i++) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
@@ -105,12 +105,12 @@
 
 // TODO(jingning) Refactor this file and combine functions with similar
 // operations.
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block,
                              const int16_t *zbin_ptr, const int16_t *round_ptr,
                              const int16_t *quant_ptr,
                              const int16_t *quant_shift_ptr,
-                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr,
                              int zbin_oq_value, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
@@ -120,8 +120,8 @@
   (void)zbin_oq_value;
   (void)iscan;
 
-  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
     for (i = 0; i < n_coeffs; i++) {
@@ -146,27 +146,27 @@
   *eob_ptr = eob + 1;
 }
 
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
+void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       int skip_block,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                       const int16_t *dequant_ptr,
                       int zbin_oq_value, uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan) {
-  int i, non_zero_count = (int)count, eob = -1;
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
   const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
                          zbin_ptr[1] + zbin_oq_value };
   const int nzbins[2] = { zbins[0] * -1,
                           zbins[1] * -1 };
   (void)iscan;
 
-  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
     // Pre-scan pass
-    for (i = (int)count - 1; i >= 0; i--) {
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
 
@@ -199,12 +199,12 @@
   *eob_ptr = eob + 1;
 }
 
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block,
                             const int16_t *zbin_ptr, const int16_t *round_ptr,
                             const int16_t *quant_ptr,
                             const int16_t *quant_shift_ptr,
-                            int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr,
                             int zbin_oq_value, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan) {
@@ -217,8 +217,8 @@
   int i, eob = -1;
   (void)iscan;
 
-  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
     // Pre-scan pass
@@ -280,13 +280,19 @@
   *shift = 1 << (16 - l);
 }
 
+static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
+  int quant = vp9_dc_quant(q, 0);
+  (void) bit_depth;
+  return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+}
+
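
For reference, vp9_init_quantizer feeds this factor into the zero-bin threshold; the shape of that derivation is roughly the following (sketch, using the ROUND_POWER_OF_TWO rounding helper from the vpx headers; the factor is in 1/128th units, so 64 means half a quantizer step):

    /* Sketch: turning a qzbin factor (64, 80 or 84, in 1/128 units) into a
     * per-coefficient zero-bin threshold. Not the verbatim library code. */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    static int zbin_from_factor(int qzbin_factor, int quant_step) {
      return ROUND_POWER_OF_TWO(qzbin_factor * quant_step, 7);
    }
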
 void vp9_init_quantizer(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   QUANTS *const quants = &cpi->quants;
   int i, q, quant;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
-    const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80);
+    const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
     const int qrounding_factor = q == 0 ? 64 : 48;
 
     for (i = 0; i < 2; ++i) {
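
The sizeof(*qcoeff_ptr) form in the memsets above is the companion to this retyping: it tracks the pointee automatically, so the clears stay correct whether tran_low_t is 16 or 32 bits wide. A minimal illustration (toy typedef, not the configurable one from vpx):

    #include <string.h>

    typedef int tran_low_t_example;  /* stand-in; widens in high-bit-depth builds */

    static void clear_coeffs(tran_low_t_example *qcoeff, int n) {
      /* sizeof(*qcoeff) follows the pointee type; a hard-coded
       * sizeof(int16_t) would under-clear once the type widens. */
      memset(qcoeff, 0, n * sizeof(*qcoeff));
    }
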
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 262529b..d7edb0b 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -37,17 +37,29 @@
   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
 } QUANTS;
 
-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
                      const int16_t *round_ptr, const int16_t quant_ptr,
-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                      const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                            const int16_t *round_ptr, const int16_t quant_ptr,
-                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t dequant_ptr, uint16_t *eob_ptr);
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+                          const int16_t *round_ptr, const int16_t quant_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_high_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                                const int16_t *round_ptr,
+                                const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
+                                tran_low_t *dqcoeff_ptr,
+                                const int16_t dequant_ptr, uint16_t *eob_ptr);
+#endif
+
 struct VP9_COMP;
 struct VP9Common;
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 290567e..b607c85 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -646,7 +646,6 @@
   int q;
 
   if (frame_is_intra_only(cm)) {
-    active_best_quality = rc->best_quality;
 
     // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
@@ -1208,7 +1207,7 @@
       ? INT_MAX : (int)(rc->starting_buffer_level / 2);
   } else {
     int kf_boost = 32;
-    double framerate = oxcf->framerate;
+    double framerate = cpi->framerate;
     if (svc->number_temporal_layers > 1 &&
         oxcf->rc_mode == VPX_CBR) {
       // Use the layer framerate for temporal layers CBR mode.
@@ -1236,7 +1235,7 @@
     cm->frame_type = KEY_FRAME;
     rc->source_alt_ref_active = 0;
 
-    if (is_spatial_svc(cpi)) {
+    if (is_two_pass_svc(cpi)) {
       cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1;
       cpi->ref_frame_flags &=
           (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
@@ -1248,7 +1247,7 @@
   } else {
     cm->frame_type = INTER_FRAME;
 
-    if (is_spatial_svc(cpi)) {
+    if (is_two_pass_svc(cpi)) {
       LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
       if (cpi->svc.spatial_layer_id == 0) {
         lc->is_key_frame = 0;
@@ -1364,7 +1363,7 @@
   RATE_CONTROL *const rc = &cpi->rc;
   int vbr_max_bits;
 
-  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / oxcf->framerate);
+  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
   rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth *
                                 oxcf->two_pass_vbrmin_section / 100);
 
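Replacing oxcf->framerate with cpi->framerate matters when layered encoding adjusts the effective rate at runtime; the per-frame budget itself is plain division. A worked example (illustrative numbers only):

    #include <stdio.h>

    int main(void) {
      const long long target_bandwidth = 2000000;  /* bits per second */
      const double framerate = 30.0;               /* effective frame rate */
      const int avg_frame_bandwidth = (int)(target_bandwidth / framerate);
      printf("%d bits/frame\n", avg_frame_bandwidth);  /* prints 66666 */
      return 0;
    }
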
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 4fc3e9e..b826ff4 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -364,20 +364,16 @@
                  int ref_frame, BLOCK_SIZE block_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  int_mv this_mv;
   int i;
   int zero_seen = 0;
   int best_index = 0;
   int best_sad = INT_MAX;
   int this_sad = INT_MAX;
   int max_mv = 0;
-
   uint8_t *src_y_ptr = x->plane[0].src.buf;
   uint8_t *ref_y_ptr;
-  int row_offset, col_offset;
-  int num_mv_refs = MAX_MV_REF_CANDIDATES +
+  const int num_mv_refs = MAX_MV_REF_CANDIDATES +
                     (cpi->sf.adaptive_motion_search &&
-                     cpi->common.show_frame &&
                      block_size < cpi->sf.max_partition_size);
 
   MV pred_mv[3];
@@ -387,19 +383,16 @@
 
   // Get the sad for each candidate reference mv.
   for (i = 0; i < num_mv_refs; ++i) {
-    this_mv.as_mv = pred_mv[i];
+    const MV *this_mv = &pred_mv[i];
 
-    max_mv = MAX(max_mv,
-                 MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
-    // Only need to check zero mv once.
-    if (!this_mv.as_int && zero_seen)
+    max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+    if (is_zero_mv(this_mv) && zero_seen)
       continue;
 
-    zero_seen = zero_seen || !this_mv.as_int;
+    zero_seen |= is_zero_mv(this_mv);
 
-    row_offset = this_mv.as_mv.row >> 3;
-    col_offset = this_mv.as_mv.col >> 3;
-    ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
+    ref_y_ptr =
+        &ref_y_buffer[ref_y_stride * (this_mv->row >> 3) + (this_mv->col >> 3)];
 
     // Find sad for current vector.
     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
@@ -462,7 +455,7 @@
 
   // Set baseline threshold values.
   for (i = 0; i < MAX_MODES; ++i)
-    rd->thresh_mult[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0;
+    rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;
 
   rd->thresh_mult[THR_NEARESTMV] = 0;
   rd->thresh_mult[THR_NEARESTG] = 0;
@@ -548,7 +541,7 @@
   int i;
 
   for (i = 0; i < MAX_REFS; ++i)
-    rd->thresh_mult_sub8x8[i] = is_best_mode(cpi->oxcf.mode)  ? -500 : 0;
+    rd->thresh_mult_sub8x8[i] = cpi->oxcf.mode == BEST ? -500 : 0;
 
   rd->thresh_mult_sub8x8[THR_LAST] += 2500;
   rd->thresh_mult_sub8x8[THR_GOLD] += 2500;
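
The loop rewrite above drops the int_mv union in favor of plain MV pointers, with is_zero_mv (vp9_mv.h) replacing the as_int comparison. A reduced sketch of the types involved (field-wise compare shown; the tree tests the packed word):

    #include <stdint.h>

    typedef struct { int16_t row, col; } MV;

    static int is_zero_mv_sketch(const MV *mv) {
      return mv->row == 0 && mv->col == 0;
    }
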
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index cc55dd7..9df85de 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -41,9 +41,14 @@
 #define RD_THRESH_MAX_FACT 64
 #define RD_THRESH_INC      1
 
-#define LAST_FRAME_MODE_MASK    0xFFEDCD60
-#define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
-#define ALT_REF_MODE_MASK       0xFFC648D0
+#define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << INTRA_FRAME))
+#define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << INTRA_FRAME))
+#define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << INTRA_FRAME))
+
+#define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | 0x01)
 
 #define MIN_EARLY_TERM_INDEX    3
 
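Rebuilding the masks from reference-frame bits makes them verifiable by inspection, unlike the old hex constants, which indexed the mode order table. With the usual frame enum (INTRA_FRAME = 0 through ALTREF_FRAME = 3), LAST_FRAME_MODE_MASK expands to 0x0D, i.e. "every reference except LAST". A quick check:

    #include <assert.h>

    enum { INTRA_FRAME = 0, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME };
    #define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
                                  (1 << INTRA_FRAME))

    int main(void) {
      assert(LAST_FRAME_MODE_MASK == 0x0D);             /* 0b1101 */
      assert(!(LAST_FRAME_MODE_MASK & (1 << LAST_FRAME)));
      return 0;
    }
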
@@ -171,30 +176,53 @@
   int64_t dist_sum = 0;
   const int ref = xd->mi[0]->mbmi.ref_frame[0];
   unsigned int sse;
+  unsigned int var = 0;
+  unsigned int sum_sse = 0;
   const int shift = 8;
+  int rate;
+  int64_t dist;
+
+  x->pred_sse[ref] = 0;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+    const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+    const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
+    int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+    int bh = 1 << (b_height_log2_lookup[bs] - b_height_log2_lookup[unit_size]);
+    int idx, idy;
+    int lw = b_width_log2_lookup[unit_size] + 2;
+    int lh = b_height_log2_lookup[unit_size] + 2;
 
-    const unsigned int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
-                                                pd->dst.buf, pd->dst.stride,
-                                                &sse);
+    sum_sse = 0;
 
-    if (!x->select_tx_size) {
-      if (sse < p->quant_thred[0] >> shift)
-        x->skip_txfm[i] = 1;
-      else if (var < p->quant_thred[1] >> shift)
-        x->skip_txfm[i] = 2;
-      else
-        x->skip_txfm[i] = 0;
+    for (idy = 0; idy < bh; ++idy) {
+      for (idx = 0; idx < bw; ++idx) {
+        uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
+        uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lw);
+        int block_idx = (idy << 1) + idx;
+
+        var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
+                                        dst, pd->dst.stride, &sse);
+        x->bsse[(i << 2) + block_idx] = sse;
+        sum_sse += sse;
+
+        if (!x->select_tx_size) {
+          if (x->bsse[(i << 2) + block_idx] < p->quant_thred[0] >> shift)
+            x->skip_txfm[(i << 2) + block_idx] = 1;
+          else if (var < p->quant_thred[1] >> shift)
+            x->skip_txfm[(i << 2) + block_idx] = 2;
+          else
+            x->skip_txfm[(i << 2) + block_idx] = 0;
+        }
+
+        if (i == 0)
+          x->pred_sse[ref] += sse;
+      }
     }
 
-    x->bsse[i] = sse;
-    if (i == 0)
-      x->pred_sse[ref] = sse;
-
     // Fast approximate the modelling function.
     if (cpi->oxcf.speed > 4) {
       int64_t rate;
@@ -210,9 +238,7 @@
       rate_sum += rate;
       dist_sum += dist;
     } else {
-      int rate;
-      int64_t dist;
-      vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
+      vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
                                    pd->dequant[1] >> 3, &rate, &dist);
       rate_sum += rate;
       dist_sum += dist;
@@ -223,7 +249,7 @@
   *out_dist_sum = dist_sum << 4;
 }
 
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
   int64_t error = 0, sqcoeff = 0;
@@ -262,7 +288,7 @@
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *band_count = &band_counts[tx_size][1];
   const int eob = p->eobs[block];
-  const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
                    x->token_costs[tx_size][type][is_inter_block(mbmi)];
   uint8_t token_cache[32 * 32];
@@ -332,8 +358,8 @@
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   int64_t this_sse;
   int shift = tx_size == TX_32X32 ? 0 : 2;
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
                                &this_sse) >> shift;
   args->sse  = this_sse >> shift;
@@ -372,17 +398,17 @@
   if (!is_inter_block(mbmi)) {
     vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
     dist_block(plane, block, tx_size, args);
-  } else {
-    if (x->skip_txfm[plane] == 0) {
+  } else if (max_txsize_lookup[plane_bsize] == tx_size) {
+    if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
       // full forward transform and quantization
       vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
       dist_block(plane, block, tx_size, args);
-    } else if (x->skip_txfm[plane] == 2) {
+    } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
       // compute DC coefficient
-      int16_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
-      int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+      tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
+      tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
       vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
-      args->sse  = x->bsse[plane] << 4;
+      args->sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
       args->dist = args->sse;
       if (!x->plane[plane].eobs[block])
         args->dist = args->sse - ((coeff[0] * coeff[0] -
@@ -390,9 +416,13 @@
     } else {
       // skip forward transform
       x->plane[plane].eobs[block] = 0;
-      args->sse  = x->bsse[plane] << 4;
+      args->sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
       args->dist = args->sse;
     }
+  } else {
+    // full forward transform and quantization
+    vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    dist_block(plane, block, tx_size, args);
   }
 
   rate_block(plane, block, plane_bsize, tx_size, args);
@@ -468,7 +498,6 @@
   txfm_rd_in_plane(x, rate, distortion, skip,
                    sse, ref_best_rd, 0, bs,
                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
-  cpi->tx_stepdown_count[0]++;
 }
 
 static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
@@ -490,24 +519,24 @@
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX}};
-  TX_SIZE n, m;
+  int n, m;
   int s0, s1;
   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
   int64_t best_rd = INT64_MAX;
-  TX_SIZE best_tx = TX_4X4;
+  TX_SIZE best_tx = max_tx_size;
 
   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
   assert(skip_prob > 0);
   s0 = vp9_cost_bit(skip_prob, 0);
   s1 = vp9_cost_bit(skip_prob, 1);
 
-  for (n = TX_4X4; n <= max_tx_size; n++) {
+  for (n = max_tx_size; n >= 0;  n--) {
     txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
                      &sse[n], ref_best_rd, 0, bs, n,
                      cpi->sf.use_fast_coef_costing);
     r[n][1] = r[n][0];
     if (r[n][0] < INT_MAX) {
-      for (m = 0; m <= n - (n == max_tx_size); m++) {
+      for (m = 0; m <= n - (n == (int)max_tx_size); m++) {
         if (m == n)
           r[n][1] += vp9_cost_zero(tx_probs[m]);
         else
@@ -523,6 +552,13 @@
       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
 
+    // Early termination in transform size search.
+    if (cpi->sf.tx_size_search_breakout &&
+        (rd[n][1] == INT64_MAX ||
+        (n < (int)max_tx_size && rd[n][1] > rd[n + 1][1]) ||
+        s[n] == 1))
+      break;
+
     if (rd[n][1] < best_rd) {
       best_tx = n;
       best_rd = rd[n][1];
@@ -544,60 +580,36 @@
 
   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
     tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
-    cpi->tx_stepdown_count[0]++;
   } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
     tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
-    cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
     tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
-    cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
   } else {
     tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
-    cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
   }
 }
 
-static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                  int64_t *distortion, int *skip,
-                                  int64_t *psse, BLOCK_SIZE bs,
-                                  int64_t txfm_cache[TX_MODES],
-                                  int64_t ref_best_rd) {
+static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                            int64_t *distortion, int *skip,
+                            int64_t *psse, BLOCK_SIZE bs,
+                            int64_t txfm_cache[TX_MODES],
+                            int64_t ref_best_rd) {
   MACROBLOCKD *xd = &x->e_mbd;
+  int64_t sse;
+  int64_t *ret_sse = psse ? psse : &sse;
 
   assert(bs == xd->mi[0]->mbmi.sb_type);
 
-  vp9_subtract_plane(x, bs, 0);
-
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
-    choose_largest_tx_size(cpi, x, rate, distortion, skip, psse, ref_best_rd,
+    choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
                            bs);
   } else {
-    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, psse,
+    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse,
                            txfm_cache, ref_best_rd, bs);
   }
 }
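
The merge of the intra and inter variants rests on the small ret_sse indirection above: callers that don't need SSE pass NULL and the function writes to a local instead. The idiom in isolation:

    #include <stdint.h>

    /* NULL-tolerant out-parameter: ret_sse is always safe to dereference. */
    static void compute(int64_t *psse) {
      int64_t sse;
      int64_t *ret_sse = psse ? psse : &sse;
      *ret_sse = 0;  /* stand-in for the real SSE accumulation */
    }
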
 
-static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                  int64_t *distortion, int *skip,
-                                  BLOCK_SIZE bs,
-                                  int64_t txfm_cache[TX_MODES],
-                                  int64_t ref_best_rd) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  int64_t sse;
-
-  assert(bs == xd->mi[0]->mbmi.sb_type);
-  if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) {
-    vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
-    choose_largest_tx_size(cpi, x, rate, distortion, skip, &sse, ref_best_rd,
-                           bs);
-  } else {
-    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, &sse,
-                           txfm_cache, ref_best_rd, bs);
-  }
-}
-
-
 static int conditional_skipintra(PREDICTION_MODE mode,
                                  PREDICTION_MODE best_intra_mode) {
   if (mode == D117_PRED &&
@@ -678,7 +690,7 @@
         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
         int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
                                                             p->src_diff);
-        int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         xd->mi[0]->bmi[block].as_mode = mode;
         vp9_predict_intra_block(xd, block, 1,
                                 TX_4X4, mode,
@@ -847,8 +859,8 @@
     }
     mic->mbmi.mode = mode;
 
-    intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-        &s, bsize, local_tx_cache, best_rd);
+    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+        &s, NULL, bsize, local_tx_cache, best_rd);
 
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -1125,7 +1137,7 @@
   for (idy = 0; idy < height / 4; ++idy) {
     for (idx = 0; idx < width / 4; ++idx) {
       int64_t ssz, rd, rd1, rd2;
-      int16_t* coeff;
+      tran_low_t* coeff;
 
       k += (idy * 2 + idx);
       coeff = BLOCK_OFFSET(p->coeff, k);
@@ -1217,11 +1229,9 @@
 // TODO(aconverse): Find out if this is still productive then clean up or remove
 static int check_best_zero_mv(
     const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
-    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-    int inter_mode_mask, int this_mode,
+    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int this_mode,
     const MV_REFERENCE_FRAME ref_frames[2]) {
-  if ((inter_mode_mask & (1 << ZEROMV)) &&
-      (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
+  if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
       (ref_frames[1] == NONE ||
        frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
@@ -1339,7 +1349,6 @@
           continue;
 
         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                inter_mode_mask,
                                 this_mode, mbmi->ref_frame))
           continue;
 
@@ -1358,13 +1367,14 @@
           int sadpb = x->sadperbit4;
           MV mvp_full;
           int max_mv;
+          int sad_list[5];
 
           /* Is the best so far sufficiently good that we can't justify doing
            * a new motion search. */
           if (best_rd < label_mv_thresh)
             break;
 
-          if (!is_best_mode(cpi->oxcf.mode)) {
+          if (cpi->oxcf.mode != BEST) {
             // use previous block's result as next block's MV predictor.
             if (i > 0) {
               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
@@ -1390,7 +1400,7 @@
           mvp_full.row = bsi->mvp.as_mv.row >> 3;
           mvp_full.col = bsi->mvp.as_mv.col >> 3;
 
-          if (cpi->sf.adaptive_motion_search && cm->show_frame) {
+          if (cpi->sf.adaptive_motion_search) {
             mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
             mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
             step_param = MAX(step_param, 8);
@@ -1401,12 +1411,14 @@
 
           vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
 
-          bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
-                                          sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
-                                          INT_MAX, 1);
+          bestsme = vp9_full_pixel_search(
+              cpi, x, bsize, &mvp_full, step_param, sadpb,
+              cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL,
+              &bsi->ref_mv[0]->as_mv, new_mv,
+              INT_MAX, 1);
 
           // Should we do a full search (best quality only)
-          if (is_best_mode(cpi->oxcf.mode)) {
+          if (cpi->oxcf.mode == BEST) {
             int_mv *const best_mv = &mi->bmi[i].as_mv[0];
             /* Check if mvp_full is within the range. */
             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
@@ -1415,6 +1427,7 @@
                                            sadpb, 16, &cpi->fn_ptr[bsize],
                                            &bsi->ref_mv[0]->as_mv,
                                            &best_mv->as_mv);
+            sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] = INT_MAX;
             if (thissme < bestsme) {
               bestsme = thissme;
               *new_mv = best_mv->as_mv;
@@ -1427,17 +1440,19 @@
 
           if (bestsme < INT_MAX) {
             int distortion;
-            cpi->find_fractional_mv_step(x,
-                                         new_mv,
-                                         &bsi->ref_mv[0]->as_mv,
-                                         cm->allow_high_precision_mv,
-                                         x->errorperbit, &cpi->fn_ptr[bsize],
-                                         cpi->sf.mv.subpel_force_stop,
-                                         cpi->sf.mv.subpel_iters_per_step,
-                                         x->nmvjointcost, x->mvcost,
-                                         &distortion,
-                                         &x->pred_sse[mbmi->ref_frame[0]],
-                                         NULL, 0, 0);
+            cpi->find_fractional_mv_step(
+                x,
+                new_mv,
+                &bsi->ref_mv[0]->as_mv,
+                cm->allow_high_precision_mv,
+                x->errorperbit, &cpi->fn_ptr[bsize],
+                cpi->sf.mv.subpel_force_stop,
+                cpi->sf.mv.subpel_iters_per_step,
+                cond_sad_list(cpi, sad_list),
+                x->nmvjointcost, x->mvcost,
+                &distortion,
+                &x->pred_sse[mbmi->ref_frame[0]],
+                NULL, 0, 0);
 
             // save motion search result for use in compound prediction
             seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
@@ -1694,12 +1709,14 @@
                          int mode_index,
                          int64_t comp_pred_diff[REFERENCE_MODES],
                          const int64_t tx_size_diff[TX_MODES],
-                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
+                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
+                         int skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
   ctx->skip = x->skip;
+  ctx->skippable = skippable;
   ctx->best_mode_index = mode_index;
   ctx->mic = *xd->mi[0];
   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
@@ -1765,6 +1782,7 @@
   int tmp_col_max = x->mv_col_max;
   int tmp_row_min = x->mv_row_min;
   int tmp_row_max = x->mv_row_max;
+  int sad_list[5];
 
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
@@ -1799,8 +1817,7 @@
     step_param = cpi->mv_step_param;
   }
 
-  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
-      cm->show_frame) {
+  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
     int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
                                                        b_width_log2(bsize)));
     step_param = MAX(step_param, boffset);
@@ -1837,6 +1854,7 @@
   mvp_full.row >>= 3;
 
   bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                                  cond_sad_list(cpi, sad_list),
                                   &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
 
   x->mv_col_min = tmp_col_min;
@@ -1852,13 +1870,14 @@
                                  &cpi->fn_ptr[bsize],
                                  cpi->sf.mv.subpel_force_stop,
                                  cpi->sf.mv.subpel_iters_per_step,
+                                 cond_sad_list(cpi, sad_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
   }
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
-  if (cpi->sf.adaptive_motion_search && cm->show_frame)
+  if (cpi->sf.adaptive_motion_search)
     x->pred_mv[ref] = tmp_mv->as_mv;
 
   if (scaled_ref_frame) {
@@ -1976,6 +1995,7 @@
           x->errorperbit,
           &cpi->fn_ptr[bsize],
           0, cpi->sf.mv.subpel_iters_per_step,
+          NULL,
           x->nmvjointcost, x->mvcost,
           &dis, &sse, second_pred,
           pw, ph);
@@ -2111,6 +2131,8 @@
                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
                                  int mi_row, int mi_col,
                                  int_mv single_newmv[MAX_REF_FRAMES],
+                                 INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
+                                 int (*single_skippable)[MAX_REF_FRAMES],
                                  int64_t *psse,
                                  const int64_t ref_best_rd) {
   VP9_COMMON *cm = &cpi->common;
@@ -2128,14 +2150,14 @@
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
   int pred_exists = 0;
   int intpel_mv;
-  int64_t rd, best_rd = INT64_MAX;
+  int64_t rd, tmp_rd, best_rd = INT64_MAX;
   int best_needs_copy = 0;
   uint8_t *orig_dst[MAX_MB_PLANE];
   int orig_dst_stride[MAX_MB_PLANE];
   int rs = 0;
   INTERP_FILTER best_filter = SWITCHABLE;
-  int skip_txfm[MAX_MB_PLANE] = {0};
-  int64_t bsse[MAX_MB_PLANE] = {0};
+  uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
+  int64_t bsse[MAX_MB_PLANE << 2] = {0};
 
   int bsl = mi_width_log2_lookup[bsize];
   int pred_filter_search = cpi->sf.cb_pred_filter_search ?
@@ -2157,6 +2179,12 @@
     if (frame_mv[refs[0]].as_int == INVALID_MV ||
         frame_mv[refs[1]].as_int == INVALID_MV)
       return INT64_MAX;
+
+    if (cpi->sf.adaptive_mode_search) {
+      if (single_filter[this_mode][refs[0]] ==
+          single_filter[this_mode][refs[1]])
+        best_filter = single_filter[this_mode][refs[0]];
+    }
   }
 
   if (this_mode == NEWMV) {
@@ -2218,6 +2246,10 @@
    * if the first is known */
   *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
 
+  if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
+      mbmi->mode != NEARESTMV)
+    return INT64_MAX;
+
   pred_exists = 0;
   // Are all MVs integer pel for Y and UV
   intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
@@ -2256,6 +2288,13 @@
         } else {
           int rate_sum = 0;
           int64_t dist_sum = 0;
+          if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
+              (cpi->sf.interp_filter_search_mask & (1 << i))) {
+            rate_sum = INT_MAX;
+            dist_sum = INT64_MAX;
+            continue;
+          }
+
           if ((cm->interp_filter == SWITCHABLE &&
                (!i || best_needs_copy)) ||
               (cm->interp_filter != SWITCHABLE &&
@@ -2306,6 +2345,7 @@
             (cm->interp_filter != SWITCHABLE &&
              cm->interp_filter == mbmi->interp_filter)) {
           pred_exists = 1;
+          tmp_rd = best_rd;
         }
       }
       restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2324,17 +2364,30 @@
         xd->plane[i].dst.stride = 64;
       }
     }
+    rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
   } else {
-    // Handles the special case when a filter that is not in the
-    // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-  }
-
-  if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
     int tmp_rate;
     int64_t tmp_dist;
+    // Handles the special case when a filter that is not in the
+    // switchable list (e.g. bilinear) is indicated at the frame level, or
+    // the skip condition holds.
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+    vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+    vpx_memcpy(bsse, x->bsse, sizeof(bsse));
+  }
+
+  if (!is_comp_pred)
+    single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+
+  if (cpi->sf.adaptive_mode_search)
+    if (is_comp_pred)
+      if (single_skippable[this_mode][refs[0]] &&
+          single_skippable[this_mode][refs[1]])
+        vpx_memset(skip_txfm, 1, sizeof(skip_txfm));
+
+  if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
     // if current pred_error modeled rd is substantially more than the best
     // so far, do not bother doing full rd
     if (rd / 2 > ref_best_rd) {
@@ -2344,7 +2397,7 @@
   }
 
   if (cm->interp_filter == SWITCHABLE)
-    *rate2 += vp9_get_switchable_rate(cpi);
+    *rate2 += rs;
 
   if (!is_comp_pred) {
     if (cpi->allow_encode_breakout)
@@ -2361,8 +2414,9 @@
     int64_t rdcosty = INT64_MAX;
 
     // Y cost and distortion
-    inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
-                          bsize, txfm_cache, ref_best_rd);
+    vp9_subtract_plane(x, bsize, 0);
+    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
+                    bsize, txfm_cache, ref_best_rd);
 
     if (*rate_y == INT_MAX) {
       *rate2 = INT_MAX;
@@ -2392,6 +2446,9 @@
     *skippable = skippable_y && skippable_uv;
   }
 
+  if (!is_comp_pred)
+    single_skippable[this_mode][refs[0]] = *skippable;
+
   restore_dst_buf(xd, orig_dst, orig_dst_stride);
   return this_rd;  // if 0, this will be re-calculated by caller
 }
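
The skip_txfm bookkeeping above grew from one flag per plane to one per max-size transform block (up to four per plane), hence the (plane << 2) + (block >> (tx_size << 1)) index: block counts 4x4 units, and tx_size << 1 is the log2 of how many 4x4 units one transform block spans. A worked index:

    #include <assert.h>

    int main(void) {
      const int plane = 0, tx_size = 3;  /* TX_32X32 */
      const int block = 128;             /* third 32x32 block, in 4x4 units */
      const int idx = (plane << 2) + (block >> (tx_size << 1));
      assert(idx == 2);                  /* slots 0..3 belong to plane 0 */
      return 0;
    }
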
@@ -2498,10 +2555,12 @@
   PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = mbmi->segment_id;
-  int comp_pred, i;
+  int comp_pred, i, k;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
+  INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
+  int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int64_t best_rd = best_rd_so_far;
@@ -2512,30 +2571,27 @@
   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode;
+  int best_mode_skippable = 0;
   int mode_index, best_mode_index = -1;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   vp9_prob comp_mode_p;
   int64_t best_intra_rd = INT64_MAX;
   int64_t best_inter_rd = INT64_MAX;
+  unsigned int best_pred_sse = UINT_MAX;
   PREDICTION_MODE best_intra_mode = DC_PRED;
   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
   PREDICTION_MODE mode_uv[TX_SIZES];
-  int64_t mode_distortions[MB_MODE_COUNT] = {-1};
   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
-  const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
-  const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
   int best_skip2 = 0;
-  int mode_skip_mask = 0;
+  uint8_t ref_frame_skip_mask[2] = { 0 };
+  uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
   int mode_skip_start = cpi->sf.mode_skip_start + 1;
   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
   const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
-  const int intra_y_mode_mask =
-      cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
-  int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
   vp9_zero(best_mbmode);
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -2552,6 +2608,12 @@
     rate_uv_intra[i] = INT_MAX;
   for (i = 0; i < MAX_REF_FRAMES; ++i)
     x->pred_sse[i] = INT_MAX;
+  for (i = 0; i < MB_MODE_COUNT; ++i) {
+    for (k = 0; k < MAX_REF_FRAMES; ++k) {
+      single_inter_filter[i][k] = SWITCHABLE;
+      single_skippable[i][k] = 0;
+    }
+  }
 
   *returnrate = INT_MAX;
 
@@ -2566,23 +2628,17 @@
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    // All modes from vp9_mode_order that use this frame as any ref
-    static const int ref_frame_mask_all[] = {
-        0x0, 0x123291, 0x25c444, 0x39b722
-    };
-    // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
-    // this frame as their primary ref
-    static const int ref_frame_mask_fixedmv[] = {
-        0x0, 0x121281, 0x24c404, 0x080102
-    };
     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
-      // Skip modes for missing references
-      mode_skip_mask |= ref_frame_mask_all[ref_frame];
+      // Skip checking missing references in both single and compound reference
+      // modes. Note that a mode will be skipped iff both reference frames
+      // are masked out.
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
     } else if (cpi->sf.reference_masking) {
       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
         // Skip fixed mv modes for poor references
         if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
-          mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
+          mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
           break;
         }
       }
@@ -2591,7 +2647,8 @@
     // then do nothing if the current ref frame is not allowed..
     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
-      mode_skip_mask |= ref_frame_mask_all[ref_frame];
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
     }
   }
 
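The two masks are consulted together in the mode loop below: a mode is dropped only when the first-reference bit and the second-reference bit are both set, so a mode survives as long as either of its references is still allowed (a second_ref_frame of NONE, i.e. -1, clamps to bit 0). The test in isolation (sketch):

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    static int mode_is_skipped(const unsigned char skip_mask[2],
                               int ref_frame, int second_ref_frame) {
      return (skip_mask[0] & (1 << ref_frame)) &&
             (skip_mask[1] & (1 << MAX(0, second_ref_frame)));
    }
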
@@ -2604,35 +2661,32 @@
     // an unfiltered alternative. We allow near/nearest as well
     // because they may result in zero-zero MVs but be cheaper.
     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-      mode_skip_mask =
-          ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
+      ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
+      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+      mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
-        mode_skip_mask |= (1 << THR_NEARA);
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
       if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
-        mode_skip_mask |= (1 << THR_NEARESTA);
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
     }
   }
 
-  // TODO(JBB): This is to make up for the fact that we don't have sad
-  // functions that work when the block size reads outside the umv.  We
-  // should fix this either by making the motion search just work on
-  // a representative block in the boundary ( first ) and then implement a
-  // function that does sads when inside the border..
-  if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
-    const int new_modes_mask =
-        (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
-        (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
-    mode_skip_mask |= new_modes_mask;
+  if (cpi->rc.is_src_frame_alt_ref) {
+    if (cpi->sf.alt_ref_search_fp) {
+      mode_skip_mask[ALTREF_FRAME] = 0;
+      ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+    }
   }
 
   if (bsize > cpi->sf.max_intra_bsize) {
-    const int all_intra_modes = (1 << THR_DC) | (1 << THR_TM) |
-        (1 << THR_H_PRED) | (1 << THR_V_PRED) | (1 << THR_D135_PRED) |
-        (1 << THR_D207_PRED) | (1 << THR_D153_PRED) | (1 << THR_D63_PRED) |
-        (1 << THR_D117_PRED) | (1 << THR_D45_PRED);
-    mode_skip_mask |= all_intra_modes;
+    ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+    ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
   }
 
+  mode_skip_mask[INTRA_FRAME] |=
+      ~(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -2647,20 +2701,26 @@
     int64_t total_sse = INT64_MAX;
     int early_term = 0;
 
+    this_mode = vp9_mode_order[mode_index].mode;
+    ref_frame = vp9_mode_order[mode_index].ref_frame[0];
+    second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
+
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
     if (mode_index == mode_skip_start && best_mode_index >= 0) {
-      switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
+      switch (best_mbmode.ref_frame[0]) {
         case INTRA_FRAME:
           break;
         case LAST_FRAME:
-          mode_skip_mask |= LAST_FRAME_MODE_MASK;
+          ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
           break;
         case GOLDEN_FRAME:
-          mode_skip_mask |= GOLDEN_FRAME_MODE_MASK;
+          ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
           break;
         case ALTREF_FRAME:
-          mode_skip_mask |= ALT_REF_MODE_MASK;
+          ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK;
           break;
         case NONE:
         case MAX_REF_FRAMES:
@@ -2668,7 +2728,12 @@
           break;
       }
     }
-    if (mode_skip_mask & (1 << mode_index))
+
+    if (ref_frame_skip_mask[0] & (1 << ref_frame) &&
+        ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame)))
+      continue;
+
+    if (mode_skip_mask[ref_frame] & (1 << this_mode))
       continue;
 
     // Test best rd so far against threshold for trying this mode.
@@ -2676,17 +2741,63 @@
                             rd_thresh_freq_fact[mode_index]))
       continue;
 
-    this_mode = vp9_mode_order[mode_index].mode;
-    ref_frame = vp9_mode_order[mode_index].ref_frame[0];
-    if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
-      continue;
-    second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
+    if (cpi->sf.motion_field_mode_search) {
+      const int mi_width  = MIN(num_8x8_blocks_wide_lookup[bsize],
+                                tile->mi_col_end - mi_col);
+      const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
+                                tile->mi_row_end - mi_row);
+      const int bsl = mi_width_log2(bsize);
+      int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
+          + get_chessboard_index(cm->current_video_frame)) & 0x1;
+      MB_MODE_INFO *ref_mbmi;
+      int const_motion = 1;
+      int skip_ref_frame = !cb_partition_search_ctrl;
+      MV_REFERENCE_FRAME rf = NONE;
+      int_mv ref_mv;
+      ref_mv.as_int = INVALID_MV;
+
+      if ((mi_row - 1) >= tile->mi_row_start) {
+        ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0];
+        rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0];
+        for (i = 0; i < mi_width; ++i) {
+          ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi;
+          const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
+                          (ref_frame == ref_mbmi->ref_frame[0]);
+          skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
+        }
+      }
+
+      if ((mi_col - 1) >= tile->mi_col_start) {
+        if (ref_mv.as_int == INVALID_MV)
+          ref_mv = xd->mi[-1]->mbmi.mv[0];
+        if (rf == NONE)
+          rf = xd->mi[-1]->mbmi.ref_frame[0];
+        for (i = 0; i < mi_height; ++i) {
+          ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi;
+          const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
+                          (ref_frame == ref_mbmi->ref_frame[0]);
+          skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
+        }
+      }
+
+      if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV)
+        if (rf > INTRA_FRAME)
+          if (ref_frame != rf)
+            continue;
+
+      if (const_motion)
+        if (this_mode == NEARMV || this_mode == ZEROMV)
+          continue;
+    }
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
+      if (!cm->allow_comp_inter_inter)
+        continue;
+
       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
           best_mode_index >=0 &&
-          vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
+          best_mbmode.ref_frame[0] == INTRA_FRAME)
         continue;
       if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
           ref_frame != best_inter_ref_frame &&
@@ -2699,8 +2810,10 @@
     }
 
     if (ref_frame == INTRA_FRAME) {
-      if (!(intra_y_mode_mask & (1 << this_mode)))
-        continue;
+      if (cpi->sf.adaptive_mode_search)
+        if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
+          continue;
+
       if (this_mode != DC_PRED) {
         // Disable intra modes other than DC_PRED for blocks with low variance
         // Threshold for intra skipping based on source variance
@@ -2714,7 +2827,7 @@
         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
             (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
           if (best_mode_index >= 0 &&
-              vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
+              best_mbmode.ref_frame[0] > INTRA_FRAME)
             continue;
         }
         if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
@@ -2725,7 +2838,7 @@
     } else {
       const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
       if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                              inter_mode_mask, this_mode, ref_frames))
+                              this_mode, ref_frames))
         continue;
     }
 
@@ -2737,6 +2850,8 @@
     // them for this frame.
     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
                                                           : cm->interp_filter;
+    mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
@@ -2752,8 +2867,8 @@
 
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
-      intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
-                            bsize, tx_cache, best_rd);
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                      NULL, bsize, tx_cache, best_rd);
 
       if (rate_y == INT_MAX)
         continue;
@@ -2783,7 +2898,8 @@
                                   &rate_uv, &distortion_uv,
                                   &disable_skip, frame_mv,
                                   mi_row, mi_col,
-                                  single_newmv, &total_sse, best_rd);
+                                  single_newmv, single_inter_filter,
+                                  single_skippable, &total_sse, best_rd);
       if (this_rd == INT64_MAX)
         continue;
 
@@ -2860,12 +2976,6 @@
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
     }
 
-    // Store the respective mode distortions for later use.
-    if (mode_distortions[this_mode] == -1
-        || distortion2 < mode_distortions[this_mode]) {
-      mode_distortions[this_mode] = distortion2;
-    }
-
     // Did this mode help, i.e. is it the new best mode?
     if (this_rd < best_rd || x->skip) {
       int max_plane = MAX_MB_PLANE;
@@ -2877,6 +2987,8 @@
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
           max_plane = 1;
+        } else {
+          best_pred_sse = x->pred_sse[ref_frame];
         }
 
         *returnrate = rate2;
@@ -2884,6 +2996,8 @@
         best_rd = this_rd;
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
+        best_mode_skippable = skippable;
+
         if (!x->select_tx_size)
           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
@@ -2983,13 +3097,35 @@
       break;
   }
 
+// The inter modes' rate costs are not calculated precisely in some cases,
+// so NEWMV is sometimes chosen even though its motion vector matches
+// NEARESTMV, NEARMV, or ZEROMV. Check for those cases here and correct the
+// mode decision.
+  if (best_mbmode.mode == NEWMV) {
+    const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
+        best_mbmode.ref_frame[1]};
+    int comp_pred_mode = refs[1] > INTRA_FRAME;
+
+    if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+        ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int) || !comp_pred_mode))
+      best_mbmode.mode = NEARESTMV;
+    else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+        ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int) || !comp_pred_mode))
+      best_mbmode.mode = NEARMV;
+    else if (best_mbmode.mv[0].as_int == 0 &&
+        ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode))
+      best_mbmode.mode = ZEROMV;
+  }
+
   if (best_mode_index < 0 || best_rd >= best_rd_so_far)
     return INT64_MAX;
 
   // If we used an estimate for the uv intra rd in the loop above...
   if (cpi->sf.use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
-    if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
+    if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
       TX_SIZE uv_tx_size;
       *mbmi = best_mbmode;
       uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
@@ -3039,9 +3175,8 @@
     vp9_zero(best_tx_diff);
   }
 
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-  store_coding_context(x, ctx, best_mode_index,
-                       best_pred_diff, best_tx_diff, best_filter_diff);
+  store_coding_context(x, ctx, best_mode_index, best_pred_diff,
+                       best_tx_diff, best_filter_diff, best_mode_skippable);
 
   return best_rd;
 }
@@ -3146,7 +3281,7 @@
   if (!x->select_tx_size)
     swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
   store_coding_context(x, ctx, THR_ZEROMV,
-                       best_pred_diff, best_tx_diff, best_filter_diff);
+                       best_pred_diff, best_tx_diff, best_filter_diff, 0);
 
   return this_rd;
 }
@@ -3283,6 +3418,9 @@
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
+      if (!cm->allow_comp_inter_inter)
+        continue;
+
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
         continue;
       // Do not allow compound prediction if the segment level reference frame
@@ -3316,14 +3454,12 @@
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
-            (int)ref_frame) {
+        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
       continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
     // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(seg, segment_id,
-                                      SEG_LVL_REF_FRAME)) {
+    } else if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
       // unless ARNR filtering is enabled in which case we want
       // an unfiltered alternative. We allow near/nearest as well
@@ -3749,9 +3885,8 @@
     vp9_zero(best_filter_diff);
   }
 
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   store_coding_context(x, ctx, best_ref_index,
-                       best_pred_diff, best_tx_diff, best_filter_diff);
+                       best_pred_diff, best_tx_diff, best_filter_diff, 0);
 
   return best_rd;
 }
diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c
index d062636..cee6ce1 100644
--- a/vp9/encoder/vp9_sad.c
+++ b/vp9/encoder/vp9_sad.c
@@ -14,6 +14,9 @@
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
 #include "vp9/encoder/vp9_variance.h"
 
 static INLINE unsigned int sad(const uint8_t *a, int a_stride,
@@ -131,3 +134,138 @@
 sadMxNxK(4, 4, 3)
 sadMxNxK(4, 4, 8)
 sadMxNx4D(4, 4)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int high_sad(const uint8_t *a8, int a_stride,
+                                    const uint8_t *b8, int b_stride,
+                                    int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sad;
+}
+
+static INLINE unsigned int high_sadb(const uint8_t *a8, int a_stride,
+                                     const uint16_t *b, int b_stride,
+                                     int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sad;
+}
+
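+// The macros below stamp out the per-block-size C entry points
+// (e.g. vp9_high_sad64x64_c) that mirror the 8-bit versions above.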
+#define high_sadMxN(m, n) \
+unsigned int vp9_high_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride) { \
+  return high_sad(src, src_stride, ref, ref_stride, m, n); \
+} \
+unsigned int vp9_high_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+                                           const uint8_t *ref, int ref_stride, \
+                                           const uint8_t *second_pred) { \
+  uint16_t comp_pred[m * n]; \
+  vp9_high_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  return high_sadb(src, src_stride, comp_pred, m, m, n); \
+}
+
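+// The xK variants score k successive horizontal offsets of one reference
+// row, while x4d scores four independent reference pointers.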
+#define high_sadMxNxK(m, n, k) \
+void vp9_high_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+                                     const uint8_t *ref, int ref_stride, \
+                                     unsigned int *sads) { \
+  int i; \
+  for (i = 0; i < k; ++i) \
+    sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
+}
+
+#define high_sadMxNx4D(m, n) \
+void vp9_high_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                                  const uint8_t *const refs[], \
+                                  int ref_stride, unsigned int *sads) { \
+  int i; \
+  for (i = 0; i < 4; ++i) \
+    sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
+}
+
+// 64x64
+high_sadMxN(64, 64)
+high_sadMxNxK(64, 64, 3)
+high_sadMxNxK(64, 64, 8)
+high_sadMxNx4D(64, 64)
+
+// 64x32
+high_sadMxN(64, 32)
+high_sadMxNx4D(64, 32)
+
+// 32x64
+high_sadMxN(32, 64)
+high_sadMxNx4D(32, 64)
+
+// 32x32
+high_sadMxN(32, 32)
+high_sadMxNxK(32, 32, 3)
+high_sadMxNxK(32, 32, 8)
+high_sadMxNx4D(32, 32)
+
+// 32x16
+high_sadMxN(32, 16)
+high_sadMxNx4D(32, 16)
+
+// 16x32
+high_sadMxN(16, 32)
+high_sadMxNx4D(16, 32)
+
+// 16x16
+high_sadMxN(16, 16)
+high_sadMxNxK(16, 16, 3)
+high_sadMxNxK(16, 16, 8)
+high_sadMxNx4D(16, 16)
+
+// 16x8
+high_sadMxN(16, 8)
+high_sadMxNxK(16, 8, 3)
+high_sadMxNxK(16, 8, 8)
+high_sadMxNx4D(16, 8)
+
+// 8x16
+high_sadMxN(8, 16)
+high_sadMxNxK(8, 16, 3)
+high_sadMxNxK(8, 16, 8)
+high_sadMxNx4D(8, 16)
+
+// 8x8
+high_sadMxN(8, 8)
+high_sadMxNxK(8, 8, 3)
+high_sadMxNxK(8, 8, 8)
+high_sadMxNx4D(8, 8)
+
+// 8x4
+high_sadMxN(8, 4)
+high_sadMxNxK(8, 4, 8)
+high_sadMxNx4D(8, 4)
+
+// 4x8
+high_sadMxN(4, 8)
+high_sadMxNxK(4, 8, 8)
+high_sadMxNx4D(4, 8)
+
+// 4x4
+high_sadMxN(4, 4)
+high_sadMxNxK(4, 4, 3)
+high_sadMxNxK(4, 4, 8)
+high_sadMxNx4D(4, 4)
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index a426453..75d86ce 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -13,54 +13,26 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_speed_features.h"
 
-enum {
-  INTRA_ALL       = (1 << DC_PRED) |
-                    (1 << V_PRED) | (1 << H_PRED) |
-                    (1 << D45_PRED) | (1 << D135_PRED) |
-                    (1 << D117_PRED) | (1 << D153_PRED) |
-                    (1 << D207_PRED) | (1 << D63_PRED) |
-                    (1 << TM_PRED),
-  INTRA_DC        = (1 << DC_PRED),
-  INTRA_DC_TM     = (1 << DC_PRED) | (1 << TM_PRED),
-  INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
-  INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
-                    (1 << H_PRED)
-};
+// Intra-only frames, golden frames (except alt ref overlays), and
+// alt ref frames tend to be coded at a higher than ambient quality.
+static int frame_is_boosted(const VP9_COMP *cpi) {
+  return frame_is_intra_only(&cpi->common) ||
+         cpi->refresh_alt_ref_frame ||
+         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) ||
+         vp9_is_upper_layer_key_frame(cpi);
+}
 
-enum {
-  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
-  INTER_NEAREST = (1 << NEARESTMV),
-  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV)
-};
-
-enum {
-  DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
-                              (1 << THR_COMP_LA) |
-                              (1 << THR_ALTR) |
-                              (1 << THR_GOLD) |
-                              (1 << THR_LAST),
-
-  DISABLE_ALL_SPLIT         = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
-
-  DISABLE_COMPOUND_SPLIT    = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
-
-  LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
-                              (1 << THR_COMP_LA) |
-                              (1 << THR_ALTR) |
-                              (1 << THR_GOLD)
-};
 
 static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
                                    SPEED_FEATURES *sf, int speed) {
+  const int boosted = frame_is_boosted(cpi);
+
   sf->adaptive_rd_thresh = 1;
-  sf->recode_loop = (speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW;
   sf->allow_skip_recode = 1;
 
   if (speed >= 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
     sf->less_rectangular_check  = 1;
-    sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
-                                                      : USE_LARGESTALL;
 
     if (MIN(cm->width, cm->height) >= 720)
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
@@ -80,19 +52,26 @@
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+
+    sf->tx_size_search_breakout = 1;
+
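+    // The breakout distortion budget scales with resolution: 720p and
+    // larger frames get a threshold four times higher than smaller ones.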
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+    else
+      sf->partition_search_breakout_dist_thr = (1 << 21);
+    sf->partition_search_breakout_rate_thr = 500;
   }
 
   if (speed >= 2) {
+    sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
+                                                      : USE_LARGESTALL;
+
     if (MIN(cm->width, cm->height) >= 720) {
-      sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
-      sf->last_partitioning_redo_frequency = 3;
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
                                               : DISABLE_ALL_INTER_SPLIT;
       sf->adaptive_pred_interp_filter = 0;
     } else {
       sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-      sf->last_partitioning_redo_frequency = 2;
-      sf->lf_motion_threshold = NO_MOTION_THRESHOLD;
     }
 
     sf->reference_masking = 1;
@@ -102,9 +81,13 @@
                                  FLAG_SKIP_INTRA_LOWVAR;
     sf->disable_filter_search_var_thresh = 100;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
-    sf->adjust_partitioning_from_last_frame = 1;
+    sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->partition_search_breakout_dist_thr = (1 << 24);
+    else
+      sf->partition_search_breakout_dist_thr = (1 << 22);
+    sf->partition_search_breakout_rate_thr = 700;
   }
 
   if (speed >= 3) {
@@ -112,21 +95,28 @@
                                                         : USE_LARGESTALL;
     if (MIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask = DISABLE_ALL_SPLIT;
-      sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1;
     } else {
+      sf->max_intra_bsize = BLOCK_32X32;
       sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
     }
-
     sf->adaptive_pred_interp_filter = 0;
+    sf->adaptive_mode_search = 1;
+    sf->cb_partition_search = !boosted;
     sf->cb_pred_filter_search = 1;
-
-    sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
-    sf->last_partitioning_redo_frequency = 3;
+    sf->alt_ref_search_fp = 1;
+    sf->motion_field_mode_search = !boosted;
     sf->recode_loop = ALLOW_RECODE_KFMAXBW;
     sf->adaptive_rd_thresh = 3;
     sf->mode_skip_start = 6;
-    sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
-    sf->use_fast_coef_costing = 1;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
+    sf->adaptive_interp_filter_search = 1;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->partition_search_breakout_dist_thr = (1 << 25);
+    else
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+    sf->partition_search_breakout_rate_thr = 1000;
   }
 
   if (speed >= 4) {
@@ -137,8 +127,15 @@
     sf->mode_search_skip_flags |= FLAG_SKIP_COMP_REFMISMATCH |
                                   FLAG_EARLY_TERMINATE;
     sf->disable_filter_search_var_thresh = 200;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
     sf->use_lp32x32fdct = 1;
+    sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+    sf->use_fast_coef_costing = 1;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->partition_search_breakout_dist_thr = (1 << 26);
+    else
+      sf->partition_search_breakout_dist_thr = (1 << 24);
+    sf->partition_search_breakout_rate_thr = 1500;
   }
 
   if (speed >= 5) {
@@ -162,8 +159,8 @@
 static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
                                  int speed, vp9e_tune_content content) {
   VP9_COMMON *const cm = &cpi->common;
-  const int frames_since_key =
-      cm->frame_type == KEY_FRAME ? 0 : cpi->rc.frames_since_key;
+  const int is_keyframe = cm->frame_type == KEY_FRAME;
+  const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
   sf->static_segmentation = 0;
   sf->adaptive_rd_thresh = 1;
   sf->use_fast_coef_costing = 1;
@@ -181,6 +178,7 @@
       sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
 
     sf->use_rd_breakout = 1;
+
     sf->adaptive_motion_search = 1;
     sf->adaptive_pred_interp_filter = 1;
     sf->mv.auto_mv_step_size = 1;
@@ -258,17 +256,16 @@
   }
 
   if (speed >= 5) {
-    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
-    sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?
-        RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
+    sf->use_quant_fp = !is_keyframe;
+    sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX
+                                                  : STRICT_NEIGHBORING_MIN_MAX;
     sf->max_partition_size = BLOCK_32X32;
     sf->min_partition_size = BLOCK_8X8;
     sf->partition_check =
         (frames_since_key % sf->last_partitioning_redo_frequency == 1);
-    sf->force_frame_boost = cm->frame_type == KEY_FRAME ||
-        (frames_since_key %
-            (sf->last_partitioning_redo_frequency << 1) == 1);
-    sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15;
+    sf->force_frame_boost = is_keyframe ||
+        (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
+    sf->max_delta_qindex = is_keyframe ? 20 : 15;
     sf->partition_search_type = REFERENCE_PARTITION;
     sf->use_nonrd_pick_mode = 1;
     sf->allow_skip_recode = 0;
@@ -286,8 +283,7 @@
     sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
     sf->search_type_check_frequency = 50;
 
-    sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ?
-        USE_LARGESTALL : USE_TX_8X8;
+    sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
 
     // This feature is only enabled when partition search is disabled.
     sf->reuse_inter_pred_sby = 1;
@@ -297,6 +293,7 @@
 
     sf->mv.reduce_first_step_size = 1;
   }
+
   if (speed >= 7) {
     sf->mv.search_method = FAST_DIAMOND;
     sf->mv.fullpel_search_step_param = 10;
@@ -305,10 +302,12 @@
         800 : 300;
     sf->elevate_newmv_thresh = 2500;
   }
+
   if (speed >= 12) {
     sf->elevate_newmv_thresh = 4000;
     sf->mv.subpel_force_stop = 2;
   }
+
   if (speed >= 13) {
     int i;
     sf->max_intra_bsize = BLOCK_32X32;
@@ -341,8 +340,11 @@
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
   sf->adaptive_pred_interp_filter = 0;
+  sf->adaptive_mode_search = 0;
   sf->cb_pred_filter_search = 0;
   sf->cb_partition_search = 0;
+  sf->motion_field_mode_search = 0;
+  sf->alt_ref_search_fp = 0;
   sf->use_quant_fp = 0;
   sf->reference_masking = 0;
   sf->partition_search_type = SEARCH_PARTITION;
@@ -358,8 +360,9 @@
   sf->mode_search_skip_flags = 0;
   sf->force_frame_boost = 0;
   sf->max_delta_qindex = 0;
-  sf->disable_split_var_thresh = 0;
   sf->disable_filter_search_var_thresh = 0;
+  sf->adaptive_interp_filter_search = 0;
+
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
     sf->intra_uv_mode_mask[i] = INTRA_ALL;
@@ -386,21 +389,20 @@
   // Recode loop tolerance %.
   sf->recode_tolerance = 25;
   sf->default_interp_filter = SWITCHABLE;
+  sf->tx_size_search_breakout = 0;
+  sf->partition_search_breakout_dist_thr = 0;
+  sf->partition_search_breakout_rate_thr = 0;
 
-  switch (oxcf->mode) {
-    case ONE_PASS_BEST:
-    case TWO_PASS_SECOND_BEST:  // This is the best quality mode.
-      cpi->diamond_search_sad = vp9_full_range_search;
-      break;
-    case TWO_PASS_FIRST:
-    case ONE_PASS_GOOD:
-    case TWO_PASS_SECOND_GOOD:
-      set_good_speed_feature(cpi, cm, sf, oxcf->speed);
-      break;
-    case REALTIME:
-      set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
-      break;
-  }
+  if (oxcf->mode == REALTIME)
+    set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
+  else if (oxcf->mode == GOOD)
+    set_good_speed_feature(cpi, cm, sf, oxcf->speed);
+
+  cpi->full_search_sad = vp9_full_search_sad;
+  cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search
+                                               : vp9_diamond_search_sad;
+  cpi->refining_search_sad = vp9_refining_search_sad;
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
@@ -415,6 +417,8 @@
 
   if (sf->mv.subpel_search_method == SUBPEL_TREE) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
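+    // The pruned variant consults the whole-pel SAD costs gathered during
+    // the integer search to skip unlikely refinement directions.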
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
   }
 
   cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index de731ce..0f49154 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -17,6 +17,44 @@
 extern "C" {
 #endif
 
+enum {
+  INTRA_ALL       = (1 << DC_PRED) |
+                    (1 << V_PRED) | (1 << H_PRED) |
+                    (1 << D45_PRED) | (1 << D135_PRED) |
+                    (1 << D117_PRED) | (1 << D153_PRED) |
+                    (1 << D207_PRED) | (1 << D63_PRED) |
+                    (1 << TM_PRED),
+  INTRA_DC        = (1 << DC_PRED),
+  INTRA_DC_TM     = (1 << DC_PRED) | (1 << TM_PRED),
+  INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+  INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
+                    (1 << H_PRED)
+};
+
+enum {
+  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
+};
+
+enum {
+  DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD) |
+                              (1 << THR_LAST),
+
+  DISABLE_ALL_SPLIT         = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+  DISABLE_COMPOUND_SPLIT    = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+  LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD)
+};
+
 typedef enum {
   DIAMOND = 0,
   NSTEP = 1,
@@ -40,6 +78,7 @@
 
 typedef enum {
   SUBPEL_TREE = 0,
+  SUBPEL_TREE_PRUNED = 1,
   // Other methods to come
 } SUBPEL_SEARCH_METHODS;
 
@@ -63,7 +102,8 @@
 typedef enum {
   NOT_IN_USE = 0,
   RELAXED_NEIGHBORING_MIN_MAX = 1,
-  STRICT_NEIGHBORING_MIN_MAX = 2
+  CONSTRAIN_NEIGHBORING_MIN_MAX = 2,
+  STRICT_NEIGHBORING_MIN_MAX = 3
 } AUTO_MIN_MAX_MODE;
 
 typedef enum {
@@ -102,6 +142,12 @@
 } MODE_SEARCH_SKIP_LOGIC;
 
 typedef enum {
+  FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP,
+  FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+  FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP,
+} INTERP_FILTER_MASK;
+
+typedef enum {
   // Search partitions using RD/NONRD criterion
   SEARCH_PARTITION = 0,
 
@@ -283,11 +329,18 @@
   // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
   int adaptive_pred_interp_filter;
 
+  // Adaptive prediction mode search
+  int adaptive_mode_search;
+
   // Chessboard pattern prediction filter type search
   int cb_pred_filter_search;
 
   int cb_partition_search;
 
+  int motion_field_mode_search;
+
+  int alt_ref_search_fp;
+
   // Fast quantization process path
   int use_quant_fp;
 
@@ -307,9 +360,6 @@
   // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
   unsigned int mode_search_skip_flags;
 
-  // A source variance threshold below which the split mode is disabled
-  unsigned int disable_split_var_thresh;
-
   // A source variance threshold below which filter search is disabled
   // Choose a very large value (UINT_MAX) to use 8-tap always
   unsigned int disable_filter_search_var_thresh;
@@ -374,6 +424,20 @@
 
   // default interp filter choice
   INTERP_FILTER default_interp_filter;
+
+  // Early termination in transform size search, which only applies when
+  // tx_size_search_method is USE_FULL_RD.
+  int tx_size_search_breakout;
+
+  // Adaptive interp_filter search that allows skipping certain filter types.
+  int adaptive_interp_filter_search;
+
+  // Mask used to skip evaluation of certain interp_filter types.
+  INTERP_FILTER_MASK interp_filter_search_mask;
+
+  // Partition search early breakout thresholds.
+  int64_t partition_search_breakout_dist_thr;
+  int partition_search_breakout_rate_thr;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_ssim.c b/vp9/encoder/vp9_ssim.c
index 026e6a8..8435640 100644
--- a/vp9/encoder/vp9_ssim.c
+++ b/vp9/encoder/vp9_ssim.c
@@ -95,7 +95,7 @@
   return ssim_total;
 }
 double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                     int lumamask, double *weight) {
+                     double *weight) {
   double a, b, c;
   double ssimv;
 
diff --git a/vp9/encoder/vp9_ssim.h b/vp9/encoder/vp9_ssim.h
index a581c2c..d1dd1b7 100644
--- a/vp9/encoder/vp9_ssim.h
+++ b/vp9/encoder/vp9_ssim.h
@@ -18,7 +18,7 @@
 #include "vpx_scale/yv12config.h"
 
 double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                     int lumamask, double *weight);
+                     double *weight);
 
 double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                       double *ssim_y, double *ssim_u, double *ssim_v);
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index bf949c4..7545d87 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -19,12 +19,12 @@
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   int layer;
   int layer_end;
-  int alt_ref_idx = svc->number_spatial_layers;
+  int alt_ref_idx = svc->number_spatial_layers * svc->number_temporal_layers;
 
   svc->spatial_layer_id = 0;
   svc->temporal_layer_id = 0;
 
-  if (svc->number_temporal_layers > 1) {
+  if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
     layer_end = svc->number_temporal_layers;
   } else {
     layer_end = svc->number_spatial_layers;
@@ -36,6 +36,8 @@
     int i;
     lc->current_video_frame_in_layer = 0;
     lc->layer_size = 0;
+    lc->frames_from_key_frame = 0;
+    lc->last_frame_type = FRAME_TYPES;
     lrc->ni_av_qi = oxcf->worst_allowed_q;
     lrc->total_actual_bits = 0;
     lrc->total_target_vs_actual = 0;
@@ -50,7 +52,7 @@
       lrc->rate_correction_factors[i] = 1.0;
     }
 
-    if (svc->number_temporal_layers > 1) {
+    if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
       lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
       lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
       lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
@@ -69,13 +71,14 @@
       lc->gold_ref_idx = -1;
     }
 
-    lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms),
-                                    lc->target_bandwidth, 1000);
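+    // Convert the starting level from milliseconds to bits at this
+    // layer's target bandwidth.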
+    lrc->buffer_level = oxcf->starting_buffer_level_ms *
+                            lc->target_bandwidth / 1000;
     lrc->bits_off_target = lrc->buffer_level;
   }
 
   // Still have extra buffer for base layer golden frame
-  if (svc->number_spatial_layers > 1 && alt_ref_idx < REF_FRAMES)
+  if (!(svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) &&
+      alt_ref_idx < REF_FRAMES)
     svc->layer_context[0].gold_ref_idx = alt_ref_idx;
 }
 
@@ -89,7 +92,7 @@
   int layer_end;
   float bitrate_alloc = 1.0;
 
-  if (svc->number_temporal_layers > 1) {
+  if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
     layer_end = svc->number_temporal_layers;
   } else {
     layer_end = svc->number_spatial_layers;
@@ -99,7 +102,7 @@
     LAYER_CONTEXT *const lc = &svc->layer_context[layer];
     RATE_CONTROL *const lrc = &lc->rc;
 
-    if (svc->number_temporal_layers > 1) {
+    if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
       lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
     } else {
       lc->target_bandwidth = oxcf->ss_target_bitrate[layer];
@@ -115,10 +118,10 @@
     lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
     lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
     // Update framerate-related quantities.
-    if (svc->number_temporal_layers > 1) {
-      lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
+    if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
+      lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer];
     } else {
-      lc->framerate = oxcf->framerate;
+      lc->framerate = cpi->framerate;
     }
     lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
     lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
@@ -128,20 +131,20 @@
   }
 }
 
-static LAYER_CONTEXT *get_layer_context(SVC *svc) {
-  return svc->number_temporal_layers > 1 ?
-         &svc->layer_context[svc->temporal_layer_id] :
-         &svc->layer_context[svc->spatial_layer_id];
+static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) {
+  return (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ?
+         &cpi->svc.layer_context[cpi->svc.temporal_layer_id] :
+         &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
 }
 
 void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  LAYER_CONTEXT *const lc = get_layer_context(svc);
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
   RATE_CONTROL *const lrc = &lc->rc;
   const int layer = svc->temporal_layer_id;
 
-  lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
+  lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer];
   lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
   lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
   // Update the average layer frame size (non-cumulative per-frame-bw).
@@ -149,7 +152,7 @@
     lc->avg_frame_size = lrc->avg_frame_bandwidth;
   } else {
     const double prev_layer_framerate =
-        oxcf->framerate / oxcf->ts_rate_decimator[layer - 1];
+        cpi->framerate / oxcf->ts_rate_decimator[layer - 1];
     const int prev_layer_target_bandwidth = oxcf->ts_target_bitrate[layer - 1];
     lc->avg_frame_size =
         (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
@@ -159,7 +162,7 @@
 
 void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
   RATE_CONTROL *const lrc = &lc->rc;
 
   lc->framerate = framerate;
@@ -172,7 +175,7 @@
 }
 
 void vp9_restore_layer_context(VP9_COMP *const cpi) {
-  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
   const int old_frame_since_key = cpi->rc.frames_since_key;
   const int old_frame_to_key = cpi->rc.frames_to_key;
 
@@ -190,7 +193,7 @@
 
 void vp9_save_layer_context(VP9_COMP *const cpi) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
 
   lc->rc = cpi->rc;
   lc->twopass = cpi->twopass;
@@ -214,15 +217,17 @@
   svc->spatial_layer_id = 0;
 }
 
-void vp9_inc_frame_in_layer(SVC *svc) {
-  LAYER_CONTEXT *const lc = (svc->number_temporal_layers > 1)
-      ? &svc->layer_context[svc->temporal_layer_id]
-      : &svc->layer_context[svc->spatial_layer_id];
+void vp9_inc_frame_in_layer(VP9_COMP *const cpi) {
+  LAYER_CONTEXT *const lc =
+      (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ?
+      &cpi->svc.layer_context[cpi->svc.temporal_layer_id] :
+      &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
   ++lc->current_video_frame_in_layer;
+  ++lc->frames_from_key_frame;
 }
 
 int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
-  return is_spatial_svc(cpi) &&
+  return is_two_pass_svc(cpi) &&
          cpi->svc.spatial_layer_id > 0 &&
          cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame;
 }
@@ -257,6 +262,7 @@
   int layer_id;
   vpx_svc_parameters_t *layer_param;
   LAYER_CONTEXT *lc;
+  int count = 1 << (cpi->svc.number_temporal_layers - 1);
 
   // Find the next layer to be encoded
   for (layer_id = 0; layer_id < cpi->svc.number_spatial_layers; ++layer_id) {
@@ -274,17 +280,36 @@
 
   lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
 
-  cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
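+  // Derive the temporal layer from the frame index: in a dyadic hierarchy
+  // of T layers, frame n belongs to the lowest layer whose period
+  // (1 << (T - 1 - id)) divides n.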
+  cpi->svc.temporal_layer_id = 0;
+  while ((lc->current_video_frame_in_layer % count) != 0) {
+    ++cpi->svc.temporal_layer_id;
+    count >>= 1;
+  }
 
-  if (cpi->svc.spatial_layer_id < 1)
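+  // Reference buffers form a spatial-by-temporal grid: the last-frame
+  // slot for layer pair (s, t) lives at s * number_temporal_layers + t.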
+  cpi->lst_fb_idx =
+      cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
+      cpi->svc.temporal_layer_id;
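+  // Until every temporal layer has coded a frame after the key frame,
+  // the per-layer last-frame buffers are stale, so LAST is not used.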
+  if (lc->frames_from_key_frame < cpi->svc.number_temporal_layers)
+    cpi->ref_frame_flags &= ~VP9_LAST_FLAG;
+
+  if (cpi->svc.spatial_layer_id == 0) {
+    if (cpi->svc.temporal_layer_id == 0)
       cpi->gld_fb_idx = lc->gold_ref_idx >= 0 ?
                         lc->gold_ref_idx : cpi->lst_fb_idx;
-  else
-    cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1;
+    else
+      cpi->gld_fb_idx = cpi->lst_fb_idx - 1;
+  } else {
+    if (cpi->svc.temporal_layer_id == 0)
+      cpi->gld_fb_idx = cpi->svc.spatial_layer_id -
+                        cpi->svc.number_temporal_layers;
+    else
+      cpi->gld_fb_idx = cpi->lst_fb_idx - 1;
+  }
 
   if (lc->current_video_frame_in_layer == 0) {
     if (cpi->svc.spatial_layer_id >= 2) {
-      cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
+      cpi->alt_fb_idx =
+          cpi->svc.spatial_layer_id - 2 * cpi->svc.number_temporal_layers;
     } else {
       cpi->alt_fb_idx = cpi->lst_fb_idx;
       cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
@@ -306,7 +331,8 @@
             lc_lower->alt_ref_source != NULL)
           cpi->alt_fb_idx = lc_lower->alt_ref_idx;
         else if (cpi->svc.spatial_layer_id >= 2)
-          cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
+          cpi->alt_fb_idx =
+              cpi->svc.spatial_layer_id - 2 * cpi->svc.number_temporal_layers;
         else
           cpi->alt_fb_idx = cpi->lst_fb_idx;
       }
@@ -325,7 +351,7 @@
 
   vp9_set_high_precision_mv(cpi, 1);
 
-  cpi->alt_ref_source = get_layer_context(&cpi->svc)->alt_ref_source;
+  cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source;
 
   return 0;
 }
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 801449b..d180d1a 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -25,15 +25,18 @@
   double framerate;
   int avg_frame_size;
   TWO_PASS twopass;
-  struct vpx_fixed_buf rc_twopass_stats_in;
+  vpx_fixed_buf_t rc_twopass_stats_in;
   unsigned int current_video_frame_in_layer;
   int is_key_frame;
+  int frames_from_key_frame;
+  FRAME_TYPE last_frame_type;
   vpx_svc_parameters_t svc_params_received;
   struct lookahead_entry  *alt_ref_source;
   int alt_ref_idx;
   int gold_ref_idx;
   int has_alt_frame;
   size_t layer_size;
+  struct vpx_psnr_pkt psnr_pkt;
 } LAYER_CONTEXT;
 
 typedef struct {
@@ -80,7 +83,7 @@
 void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
 
 // Increment number of video frames in layer
-void vp9_inc_frame_in_layer(SVC *svc);
+void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);
 
 // Check if current layer is key frame in spatial upper layer
 int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi);
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index ce3b311..18a6a91 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -145,6 +145,7 @@
   int bestsme = INT_MAX;
   int distortion;
   unsigned int sse;
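+  // Scratch SAD costs (center point plus four neighbors) that the motion
+  // search fills in when the speed features make use of them.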
+  int sad_list[5];
 
   MV best_ref_mv1 = {0, 0};
   MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
@@ -168,6 +169,7 @@
 
   // Ignore mv costing by sending NULL pointer instead of cost arrays
   vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+                 cond_sad_list(cpi, sad_list),
                  &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
 
   // Ignore mv costing by sending NULL pointer instead of cost array
@@ -177,6 +179,7 @@
                                          x->errorperbit,
                                          &cpi->fn_ptr[BLOCK_16X16],
                                          0, mv_sf->subpel_iters_per_step,
+                                         cond_sad_list(cpi, sad_list),
                                          NULL, NULL,
                                          &distortion, &sse, NULL, 0, 0);
 
@@ -188,6 +191,7 @@
 }
 
 static void temporal_filter_iterate_c(VP9_COMP *cpi,
+                                      YV12_BUFFER_CONFIG **frames,
                                       int frame_count,
                                       int alt_ref_index,
                                       int strength,
@@ -203,7 +207,7 @@
   DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3);
   DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3);
   MACROBLOCKD *mbd = &cpi->mb.e_mbd;
-  YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+  YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
   uint8_t *dst1, *dst2;
   DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor, 16 * 16 * 3);
   const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
@@ -247,7 +251,7 @@
         const int thresh_low  = 10000;
         const int thresh_high = 20000;
 
-        if (cpi->frames[frame] == NULL)
+        if (frames[frame] == NULL)
           continue;
 
         mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
@@ -258,9 +262,9 @@
         } else {
           // Find best match in this frame by MC
           int err = temporal_filter_find_matching_mb_c(cpi,
-              cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
-              cpi->frames[frame]->y_buffer + mb_y_offset,
-              cpi->frames[frame]->y_stride);
+              frames[alt_ref_index]->y_buffer + mb_y_offset,
+              frames[frame]->y_buffer + mb_y_offset,
+              frames[frame]->y_stride);
 
           // Assign higher weight to matching MB if its error
           // score is lower. If not applying MC default behavior
@@ -272,10 +276,10 @@
         if (filter_weight != 0) {
           // Construct the predictors
           temporal_filter_predictors_mb_c(mbd,
-              cpi->frames[frame]->y_buffer + mb_y_offset,
-              cpi->frames[frame]->u_buffer + mb_uv_offset,
-              cpi->frames[frame]->v_buffer + mb_uv_offset,
-              cpi->frames[frame]->y_stride,
+              frames[frame]->y_buffer + mb_y_offset,
+              frames[frame]->u_buffer + mb_uv_offset,
+              frames[frame]->v_buffer + mb_uv_offset,
+              frames[frame]->y_stride,
               mb_uv_width, mb_uv_height,
               mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
               mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
@@ -357,12 +361,14 @@
 
 // Apply buffer limits and context specific adjustments to arnr filter.
 static void adjust_arnr_filter(VP9_COMP *cpi,
-                               int distance, int group_boost) {
+                               int distance, int group_boost,
+                               int *arnr_frames, int *arnr_strength) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const int frames_after_arf =
-            vp9_lookahead_depth(cpi->lookahead) - distance - 1;
+      vp9_lookahead_depth(cpi->lookahead) - distance - 1;
   int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
   int frames_bwd;
-  int q;
+  int q, frames, strength;
 
   // Define the forward and backwards filter limits for this arnr group.
   if (frames_fwd > frames_after_arf)
@@ -375,10 +381,10 @@
   // For even length filter there is one more frame backward
   // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
   if (frames_bwd < distance)
-    frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
+    frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
 
   // Set the baseline active filter size.
-  cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+  frames = frames_bwd + 1 + frames_fwd;
 
   // Adjust the strength based on active max q.
   if (cpi->common.current_video_frame > 1)
@@ -388,29 +394,33 @@
     q = ((int)vp9_convert_qindex_to_q(
         cpi->rc.avg_frame_qindex[KEY_FRAME]));
   if (q > 16) {
-    cpi->active_arnr_strength = cpi->oxcf.arnr_strength;
+    strength = oxcf->arnr_strength;
   } else {
-    cpi->active_arnr_strength = cpi->oxcf.arnr_strength - ((16 - q) / 2);
-    if (cpi->active_arnr_strength < 0)
-      cpi->active_arnr_strength = 0;
+    strength = oxcf->arnr_strength - ((16 - q) / 2);
+    if (strength < 0)
+      strength = 0;
   }
 
   // Adjust number of frames in filter and strength based on gf boost level.
-  if (cpi->active_arnr_frames > (group_boost / 150)) {
-    cpi->active_arnr_frames = (group_boost / 150);
-    cpi->active_arnr_frames += !(cpi->active_arnr_frames & 1);
+  if (frames > group_boost / 150) {
+    frames = group_boost / 150;
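+    // Bump an even count up to odd so the ARF frame stays centered.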
+    frames += !(frames & 1);
   }
-  if (cpi->active_arnr_strength > (group_boost / 300)) {
-    cpi->active_arnr_strength = (group_boost / 300);
+
+  if (strength > group_boost / 300) {
+    strength = group_boost / 300;
   }
 
   // Adjustments for second level arf in multi arf case.
   if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) {
-      cpi->active_arnr_strength >>= 1;
+      strength >>= 1;
     }
   }
+
+  *arnr_frames = frames;
+  *arnr_strength = strength;
 }
 
 void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
@@ -423,26 +433,24 @@
   int frames_to_blur_backward;
   int frames_to_blur_forward;
   struct scale_factors sf;
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = {NULL};
 
   // Apply context specific adjustments to the arnr filter parameters.
-  adjust_arnr_filter(cpi, distance, rc->gfu_boost);
-  strength = cpi->active_arnr_strength;
-  frames_to_blur = cpi->active_arnr_frames;
+  adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
   frames_to_blur_backward = (frames_to_blur / 2);
   frames_to_blur_forward = ((frames_to_blur - 1) / 2);
   start_frame = distance + frames_to_blur_forward;
 
   // Setup frame pointers, NULL indicates frame not included in filter.
-  vp9_zero(cpi->frames);
   for (frame = 0; frame < frames_to_blur; ++frame) {
     const int which_buffer = start_frame - frame;
     struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
                                                      which_buffer);
-    cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
+    frames[frames_to_blur - 1 - frame] = &buf->img;
   }
 
   // Setup scaling factors. Scaling on each of the arnr frames is not supported
-  if (is_spatial_svc(cpi)) {
+  if (is_two_pass_svc(cpi)) {
+    // In spatial svc the scaling factors might be less than 1/2. So we will use
     // non-normative scaling.
     int frame_used = 0;
@@ -453,19 +461,21 @@
                                       get_frame_new_buffer(cm)->y_crop_height);
 
     for (frame = 0; frame < frames_to_blur; ++frame) {
-      if (cm->mi_cols * MI_SIZE != cpi->frames[frame]->y_width ||
-          cm->mi_rows * MI_SIZE != cpi->frames[frame]->y_height) {
+      if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
+          cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
         if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
                                      cm->width, cm->height,
                                      cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                     cm->use_highbitdepth,
+#endif
                                      VP9_ENC_BORDER_IN_PIXELS, NULL, NULL,
                                      NULL))
           vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                              "Failed to reallocate alt_ref_buffer");
 
-        cpi->frames[frame] =
-            vp9_scale_if_required(cm, cpi->frames[frame],
-                                  &cpi->svc.scaled_frames[frame_used]);
+        frames[frame] = vp9_scale_if_required(cm, frames[frame],
+                            &cpi->svc.scaled_frames[frame_used]);
         ++frame_used;
       }
     }
@@ -476,6 +486,6 @@
                                       cm->width, cm->height);
   }
 
-  temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
-                            strength, &sf);
+  temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+                            frames_to_blur_backward, strength, &sf);
 }
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 6068b85..263883f 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -212,7 +212,7 @@
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
-  const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;
   const scan_order *so;
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index eb5ae2e..c97f93f 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -103,8 +103,9 @@
 unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
 
-  for (i = 0; i < 256; i++)
+  for (i = 0; i < 256; ++i) {
     sum += src_ptr[i] * src_ptr[i];
+  }
 
   return sum;
 }
@@ -266,3 +267,375 @@
     ref += ref_stride;
   }
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_variance64(const uint8_t *a8, int  a_stride,
+                            const uint8_t *b8, int  b_stride,
+                            int w, int h, uint64_t *sse,
+                            uint64_t *sum) {
+  int i, j;
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+void high_variance(const uint8_t *a8, int  a_stride,
+                   const uint8_t *b8, int  b_stride,
+                   int w, int h, unsigned int *sse,
+                   int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (unsigned int)sse_long;
+  *sum = (int)sum_long;
+}
+
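+// The 10- and 12-bit variants renormalize to an 8-bit scale: sums shrink
+// by 2^(bd - 8) and squared sums by 2^(2 * (bd - 8)), hence the rounded
+// shifts of (2, 4) and (4, 8) below.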
+void high_10_variance(const uint8_t *a8, int  a_stride,
+                      const uint8_t *b8, int  b_stride,
+                      int w, int h, unsigned int *sse,
+                      int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+void high_12_variance(const uint8_t *a8, int  a_stride,
+                      const uint8_t *b8, int  b_stride,
+                      int w, int h, unsigned int *sse,
+                      int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+static void high_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const int16_t *vp9_filter) {
+  unsigned int i, j;
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      output_ptr[j] =
+          ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+                             (int)src_ptr[pixel_step] * vp9_filter[1],
+                             FILTER_BITS);
+
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+static void high_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const int16_t *vp9_filter) {
+  unsigned int  i, j;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      output_ptr[j] =
+          ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+                             (int)src_ptr[pixel_step] * vp9_filter[1],
+                             FILTER_BITS);
+      src_ptr++;
+    }
+
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
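+// Each variance is SSE - sum^2 / N, with the square taken in 64 bits so
+// sum * sum cannot overflow on the larger block sizes.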
+#define HIGH_VAR(W, H) \
+unsigned int vp9_high_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                            const uint8_t *b, int b_stride, \
+                                            unsigned int *sse) { \
+  int sum; \
+  high_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vp9_high_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                               const uint8_t *b, int b_stride, \
+                                               unsigned int *sse) { \
+  int sum; \
+  high_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vp9_high_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                               const uint8_t *b, int b_stride, \
+                                               unsigned int *sse) { \
+  int sum; \
+  high_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define HIGH_SUBPIX_VAR(W, H) \
+unsigned int vp9_high_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+                                        dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_10_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+                                           dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_12_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+                                           dst_stride, sse); \
+}
+
+#define HIGH_SUBPIX_AVG_VAR(W, H) \
+unsigned int vp9_high_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+                         W); \
+\
+  return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                        dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_10_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+                         W); \
+\
+  return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                           dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_12_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+                         W); \
+\
+  return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                           dst_stride, sse); \
+}
+
+#define HIGH_GET_VAR(S) \
+void vp9_high_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                  const uint8_t *ref, int ref_stride, \
+                                  unsigned int *sse, int *sum) { \
+  high_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_high_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                     const uint8_t *ref, int ref_stride, \
+                                     unsigned int *sse, int *sum) { \
+  high_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_high_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                     const uint8_t *ref, int ref_stride, \
+                                     unsigned int *sse, int *sum) { \
+  high_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+}
+
+#define HIGH_MSE(W, H) \
+unsigned int vp9_high_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       unsigned int *sse) { \
+  int sum; \
+  high_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+unsigned int vp9_high_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref, int ref_stride, \
+                                          unsigned int *sse) { \
+  int sum; \
+  high_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+unsigned int vp9_high_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref, int ref_stride, \
+                                          unsigned int *sse) { \
+  int sum; \
+  high_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+}
+
+HIGH_GET_VAR(8)
+HIGH_GET_VAR(16)
+
+HIGH_MSE(16, 16)
+HIGH_MSE(16, 8)
+HIGH_MSE(8, 16)
+HIGH_MSE(8, 8)
+
+HIGH_VAR(4, 4)
+HIGH_SUBPIX_VAR(4, 4)
+HIGH_SUBPIX_AVG_VAR(4, 4)
+
+HIGH_VAR(4, 8)
+HIGH_SUBPIX_VAR(4, 8)
+HIGH_SUBPIX_AVG_VAR(4, 8)
+
+HIGH_VAR(8, 4)
+HIGH_SUBPIX_VAR(8, 4)
+HIGH_SUBPIX_AVG_VAR(8, 4)
+
+HIGH_VAR(8, 8)
+HIGH_SUBPIX_VAR(8, 8)
+HIGH_SUBPIX_AVG_VAR(8, 8)
+
+HIGH_VAR(8, 16)
+HIGH_SUBPIX_VAR(8, 16)
+HIGH_SUBPIX_AVG_VAR(8, 16)
+
+HIGH_VAR(16, 8)
+HIGH_SUBPIX_VAR(16, 8)
+HIGH_SUBPIX_AVG_VAR(16, 8)
+
+HIGH_VAR(16, 16)
+HIGH_SUBPIX_VAR(16, 16)
+HIGH_SUBPIX_AVG_VAR(16, 16)
+
+HIGH_VAR(16, 32)
+HIGH_SUBPIX_VAR(16, 32)
+HIGH_SUBPIX_AVG_VAR(16, 32)
+
+HIGH_VAR(32, 16)
+HIGH_SUBPIX_VAR(32, 16)
+HIGH_SUBPIX_AVG_VAR(32, 16)
+
+HIGH_VAR(32, 32)
+HIGH_SUBPIX_VAR(32, 32)
+HIGH_SUBPIX_AVG_VAR(32, 32)
+
+HIGH_VAR(32, 64)
+HIGH_SUBPIX_VAR(32, 64)
+HIGH_SUBPIX_AVG_VAR(32, 64)
+
+HIGH_VAR(64, 32)
+HIGH_SUBPIX_VAR(64, 32)
+HIGH_SUBPIX_AVG_VAR(64, 32)
+
+HIGH_VAR(64, 64)
+HIGH_SUBPIX_VAR(64, 64)
+HIGH_SUBPIX_AVG_VAR(64, 64)
+
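+// Rounded average of prediction and reference, written as a packed
+// uint16_t block of the given width.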
+void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+                            int width, int height, const uint8_t *ref8,
+                            int ref_stride) {
+  int i, j;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 4a194b7..c51d08d 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -22,6 +22,23 @@
               int  w, int  h,
               unsigned int *sse, int *sum);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_variance(const uint8_t *a8, int a_stride,
+                   const uint8_t *b8, int b_stride,
+                   int w, int h,
+                   unsigned int *sse, int *sum);
+
+void high_10_variance(const uint8_t *a8, int a_stride,
+                      const uint8_t *b8, int b_stride,
+                      int w, int h,
+                      unsigned int *sse, int *sum);
+
+void high_12_variance(const uint8_t *a8, int a_stride,
+                      const uint8_t *b8, int b_stride,
+                      int w, int h,
+                      unsigned int *sse, int *sum);
+#endif
+
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
                                     const uint8_t *ref_ptr,
@@ -81,6 +98,11 @@
 void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
                        int height, const uint8_t *ref, int ref_stride);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride);
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 487deef..e799951 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,6 +12,8 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
 
+#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+
 void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
   __m128i in0, in1;
   __m128i tmp;
@@ -780,58 +782,6 @@
   _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
 }
 
-// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-  // 00 10 01 11 02 12 03 13
-  // 20 30 21 31 22 32 23 33
-  // 04 14 05 15 06 16 07 17
-  // 24 34 25 35 26 36 27 37
-  // 40 50 41 51 42 52 43 53
-  // 60 70 61 71 62 72 63 73
-  // 44 54 45 55 46 56 47 57
-  // 64 74 65 75 66 76 67 77
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-  // 00 10 20 30 01 11 21 31
-  // 40 50 60 70 41 51 61 71
-  // 02 12 22 32 03 13 23 33
-  // 42 52 62 72 43 53 63 73
-  // 04 14 24 34 05 15 25 35
-  // 44 54 64 74 45 55 65 75
-  // 06 16 26 36 07 17 27 37
-  // 46 56 66 76 47 57 67 77
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-  // 00 10 20 30 40 50 60 70
-  // 01 11 21 31 41 51 61 71
-  // 02 12 22 32 42 52 62 72
-  // 03 13 23 33 43 53 63 73
-  // 04 14 24 34 44 54 64 74
-  // 05 15 25 35 45 55 65 75
-  // 06 16 26 36 46 56 66 76
-  // 07 17 27 37 47 57 67 77
-}
-
 void fdct8_sse2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -1953,23 +1903,6 @@
   write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
 }
 
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
   // perform rounding operations
   right_shift_8x8(res0, 2);
@@ -2891,11 +2824,11 @@
 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2
 #define FDCT32x32_HIGH_PRECISION 0
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
-#undef  FDCT32x32_2D
 #undef  FDCT32x32_HIGH_PRECISION
+#undef  FDCT32x32_2D
 
 #define FDCT32x32_2D vp9_fdct32x32_sse2
 #define FDCT32x32_HIGH_PRECISION 1
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
-#undef  FDCT32x32_2D
 #undef  FDCT32x32_HIGH_PRECISION
+#undef  FDCT32x32_2D
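
Two things happen in the hunks above: the removed transpose helpers are now picked up from the newly included vp9/common/x86/vp9_idct_intrin_sse2.h, and the trailing #undef pairs are reordered so the macros are torn down in reverse order of definition. The surrounding pattern compiles vp9_dct32x32_sse2.c twice under different macro bindings; a minimal sketch of that include-as-template technique, with hypothetical file and function names:

    /* fdct_body.inc -- template body; expects FN_NAME and HIGH_PRECISION
     * to be #defined by the including file. */
    unsigned int FN_NAME(unsigned int x) {
      return HIGH_PRECISION ? x : x >> 2;
    }

    /* user.c -- instantiate the body twice under different bindings. */
    #define FN_NAME fdct_rd
    #define HIGH_PRECISION 0
    #include "fdct_body.inc"
    #undef HIGH_PRECISION
    #undef FN_NAME

    #define FN_NAME fdct_hp
    #define HIGH_PRECISION 1
    #include "fdct_body.inc"
    #undef HIGH_PRECISION
    #undef FN_NAME
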
diff --git a/vp9/encoder/x86/vp9_sad_mmx.asm b/vp9/encoder/x86/vp9_sad_mmx.asm
deleted file mode 100644
index 32fdd23..0000000
--- a/vp9/encoder/x86/vp9_sad_mmx.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vp9_sad16x16_mmx) PRIVATE
-global sym(vp9_sad8x16_mmx) PRIVATE
-global sym(vp9_sad8x8_mmx) PRIVATE
-global sym(vp9_sad4x4_mmx) PRIVATE
-global sym(vp9_sad16x8_mmx) PRIVATE
-
-;unsigned int vp9_sad16x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad16x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpcklbw       mm2,        mm6
-
-        punpckhbw       mm1,        mm6
-        punpckhbw       mm3,        mm6
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-
-        lea             rsi,        [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm1
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_mmx_loop
-
-
-        movq            mm0,        mm7
-
-        punpcklwd       mm0,        mm6
-        punpckhwd       mm7,        mm6
-
-        paddw           mm0,        mm7
-        movq            mm7,        mm0
-
-
-        psrlq           mm0,        32
-        paddw           mm7,        mm0
-
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad8x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad8x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        paddw           mm7,        mm2
-        cmp             rsi,        rcx
-
-        jne             .x8x16sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad8x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x8sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        paddw           mm0,        mm2
-
-        lea             rsi,       [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,       mm0
-        cmp             rsi,        rcx
-
-        jne             .x8x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad4x4_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        pxor            mm3,        mm3
-
-        punpcklbw       mm0,        mm3
-        punpckhbw       mm2,        mm3
-
-        paddw           mm0,        mm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movd            mm4,        DWORD PTR [rsi]
-        movd            mm5,        DWORD PTR [rdi]
-
-        movd            mm6,        DWORD PTR [rsi+rax]
-        movd            mm7,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm4,        mm6
-        punpcklbw       mm5,        mm7
-
-        movq            mm6,        mm4
-        psubusb         mm4,        mm5
-
-        psubusb         mm5,        mm6
-        por             mm4,        mm5
-
-        movq            mm5,        mm4
-        punpcklbw       mm4,        mm3
-
-        punpckhbw       mm5,        mm3
-        paddw           mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            mm1,        mm0
-
-        punpcklwd       mm0,        mm3
-        punpckhwd       mm1,        mm3
-
-        paddw           mm0,        mm1
-        movq            mm1,        mm0
-
-        psrlq           mm0,        32
-        paddw           mm0,        mm1
-
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad16x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad16x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x8sad_mmx_loop:
-
-        movq            mm0,       [rsi]
-        movq            mm1,       [rdi]
-
-        movq            mm2,        [rsi+8]
-        movq            mm3,        [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpckhbw       mm1,        mm6
-
-        punpcklbw       mm2,        mm6
-        punpckhbw       mm3,        mm6
-
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-        paddw           mm0,        mm1
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
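
All five deleted entry points computed the same quantity at fixed block sizes (16x16, 8x16, 8x8, 4x4, 16x8). For reference, a plain-C statement of that quantity, the sum of absolute differences between a source block and a reference block; names here are illustrative, not part of the tree:

    #include <stdint.h>
    #include <stdlib.h>

    static unsigned int sad_wxh(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                int w, int h) {
      unsigned int sad = 0;
      int i, j;
      for (i = 0; i < h; ++i) {
        for (j = 0; j < w; ++j)
          sad += abs(src[j] - ref[j]);  /* the psubusb/por pair above is a
                                           branch-free 8-bit abs-diff */
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }
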
diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
index 9aa4da9..a441cad 100644
--- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
@@ -333,7 +333,7 @@
     if (y_offset == 0) {
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
        // expand each byte to 2 bytes
@@ -347,7 +347,7 @@
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
         AVG_NEXT_SRC(src_reg, src_stride)
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
        // expand each byte to 2 bytes
@@ -369,7 +369,7 @@
         MERGE_NEXT_SRC(src_reg, src_stride)
         FILTER_SRC(filter)
         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         MERGE_WITH_SRC(src_reg, zero_reg)
@@ -385,7 +385,7 @@
       for (i = 0; i < height ; i++) {
         LOAD_SRC_DST
         AVG_NEXT_SRC(src_reg, 1)
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         sec+= sec_stride;
         // expand each byte to 2 bytes
@@ -409,7 +409,7 @@
         AVG_NEXT_SRC(src_reg, 1)
         // average between previous average to current average
         src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
         sec+= sec_stride;
         // expand each byte to 2 bytes
@@ -437,7 +437,7 @@
         MERGE_WITH_SRC(src_avg, src_reg)
         FILTER_SRC(filter)
         src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_avg = _mm256_avg_epu8(src_avg, sec_reg);
         // expand each byte to 2 bytes
         MERGE_WITH_SRC(src_avg, zero_reg)
@@ -459,7 +459,7 @@
         MERGE_NEXT_SRC(src_reg, 1)
         FILTER_SRC(filter)
         src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_reg = _mm256_avg_epu8(src_reg, sec_reg);
         MERGE_WITH_SRC(src_reg, zero_reg)
         sec+= sec_stride;
@@ -487,7 +487,7 @@
         src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
         // average between previous pack to the current
         src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
         sec+= sec_stride;
         MERGE_WITH_SRC(src_pack, zero_reg)
@@ -524,7 +524,7 @@
         // filter the source
         FILTER_SRC(yfilter)
         src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
         src_pack = _mm256_avg_epu8(src_pack, sec_reg);
         MERGE_WITH_SRC(src_pack, zero_reg)
         src_pack = src_reg;
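
Every hunk in this file is the same one-token change: the aligned 256-bit load of the second predictor is replaced with its unaligned form. _mm256_load_si256 requires a 32-byte-aligned address and may fault otherwise; _mm256_loadu_si256 accepts any address. Since sec advances by a caller-supplied sec_stride, its alignment cannot be assumed. A minimal illustration (the helper name is hypothetical):

    #include <immintrin.h>
    #include <stdint.h>

    static __m256i load_second_pred(const uint8_t *sec) {
      /* sec + i * sec_stride need not be 32-byte aligned, so use the
       * unaligned load; the aligned form can fault on such addresses. */
      return _mm256_loadu_si256((const __m256i *)sec);
    }
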
diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index 7f81f46..ea09b95 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c
@@ -12,67 +12,39 @@
 #include "vp9/encoder/vp9_variance.h"
 #include "vpx_ports/mem.h"
 
-typedef void (*get_var_avx2) (
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse, int *sum);
 
-void vp9_get16x16var_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
 
-void vp9_get32x32var_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
 
-unsigned int vp9_sub_pixel_variance32xh_avx2
-(
-  const uint8_t *src,
-  int src_stride,
-  int x_offset,
-  int y_offset,
-  const uint8_t *dst,
-  int dst_stride,
-  int height,
-  unsigned int *sse
-);
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst, int dst_stride,
+                                             int height,
+                                             unsigned int *sse);
 
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2
-(
-  const uint8_t *src,
-  int src_stride,
-  int x_offset,
-  int y_offset,
-  const uint8_t *dst,
-  int dst_stride,
-  const uint8_t *sec,
-  int sec_stride,
-  int height,
-  unsigned int *sseptr
-);
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                                 int src_stride,
+                                                 int x_offset,
+                                                 int y_offset,
+                                                 const uint8_t *dst,
+                                                 int dst_stride,
+                                                 const uint8_t *sec,
+                                                 int sec_stride,
+                                                 int height,
+                                                 unsigned int *sseptr);
 
-static void variance_avx2(const unsigned char *src_ptr, int  source_stride,
-                        const unsigned char *ref_ptr, int  recon_stride,
-                        int  w, int  h, unsigned int *sse, int *sum,
-                        get_var_avx2 var_fn, int block_size) {
-  unsigned int sse0;
-  int sum0;
+static void variance_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          get_var_avx2 var_fn, int block_size) {
   int i, j;
 
   *sse = 0;
@@ -80,105 +52,68 @@
 
   for (i = 0; i < h; i += 16) {
     for (j = 0; j < w; j += block_size) {
-      // processing 16 rows horizontally each call
-      var_fn(src_ptr + source_stride * i + j, source_stride,
-             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
+      unsigned int sse0;
+      int sum0;
+      var_fn(&src[src_stride * i + j], src_stride,
+             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
       *sse += sse0;
       *sum += sum0;
     }
   }
 }
 
-unsigned int vp9_variance16x16_avx2
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
 
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
-                &var, &avg, vp9_get16x16var_avx2, 16);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp9_mse16x16_avx2(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0;
-  int sum0;
-  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                       &sum0);
-  *sse = sse0;
-  return sse0;
-}
-
-unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
+unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 10));
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
+                sse, &sum, vp9_get16x16var_avx2, 16);
+  return *sse - (((unsigned int)sum * sum) >> 8);
 }
 
-unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
-                                    unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 9));
+unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  int sum;
+  vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse;
 }
 
-
-unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
+unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 12));
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 9);
 }
 
-unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
-                                    int  source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  recon_stride,
+unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 10);
+}
 
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
-                &var, &avg, vp9_get32x32var_avx2, 32);
+unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 12);
+}
 
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 11));
+unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 11);
 }
 
 unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
@@ -187,22 +122,19 @@
                                               int y_offset,
                                               const uint8_t *dst,
                                               int dst_stride,
-                                              unsigned int *sse_ptr) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                           y_offset, dst, dst_stride,
-                                           64, &sse);
-  // processing the next 32 elements in parallel
+                                              unsigned int *sse) {
+  unsigned int sse1;
+  const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                  y_offset, dst, dst_stride,
+                                                  64, &sse1);
   unsigned int sse2;
-  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
-                                            x_offset, y_offset,
-                                            dst + 32, dst_stride,
-                                            64, &sse2);
-  se += se2;
-  sse += sse2;
-  *sse_ptr = sse;
-  return sse - (((int64_t)se * se) >> 12);
+  const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+                                                  x_offset, y_offset,
+                                                  dst + 32, dst_stride,
+                                                  64, &sse2);
+  const int se = se1 + se2;
+  *sse = sse1 + sse2;
+  return *sse - (((int64_t)se * se) >> 12);
 }
 
 unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
@@ -211,14 +143,11 @@
                                               int y_offset,
                                               const uint8_t *dst,
                                               int dst_stride,
-                                              unsigned int *sse_ptr) {
-  // processing 32 element in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                           y_offset, dst, dst_stride,
-                                           32, &sse);
-  *sse_ptr = sse;
-  return sse - (((int64_t)se * se) >> 10);
+                                              unsigned int *sse) {
+  const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                 y_offset, dst, dst_stride,
+                                                 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
 }
 
 unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
@@ -227,24 +156,22 @@
                                                   int y_offset,
                                                   const uint8_t *dst,
                                                   int dst_stride,
-                                                  unsigned int *sseptr,
+                                                  unsigned int *sse,
                                                   const uint8_t *sec) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-
-  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                               y_offset, dst, dst_stride,
-                                               sec, 64, 64, &sse);
+  unsigned int sse1;
+  const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                      y_offset, dst, dst_stride,
+                                                      sec, 64, 64, &sse1);
   unsigned int sse2;
-  // processing the next 32 elements in parallel
-  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
-                                                y_offset, dst + 32, dst_stride,
-                                                sec + 32, 64, 64, &sse2);
-  se += se2;
-  sse += sse2;
-  *sseptr = sse;
+  const int se2 =
+      vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+                                          y_offset, dst + 32, dst_stride,
+                                          sec + 32, 64, 64, &sse2);
+  const int se = se1 + se2;
 
-  return sse - (((int64_t)se * se) >> 12);
+  *sse = sse1 + sse2;
+
+  return *sse - (((int64_t)se * se) >> 12);
 }
 
 unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
@@ -253,15 +180,11 @@
                                                   int y_offset,
                                                   const uint8_t *dst,
                                                   int dst_stride,
-                                                  unsigned int *sseptr,
+                                                  unsigned int *sse,
                                                   const uint8_t *sec) {
   // processing 32 elements in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                                 y_offset, dst, dst_stride,
-                                                 sec, 32, 32, &sse);
-  *sseptr = sse;
-  return sse - (((int64_t)se * se) >> 10);
+  const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                     y_offset, dst, dst_stride,
+                                                     sec, 32, 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
 }
-
-
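
The rewrites above keep the calling convention: the return value is the block variance and the raw SSE comes back through the out-parameter. The shift constants fall out of the standard identity for an N-pixel block,

    variance = sse - (sum * sum) / N

and since N is a power of two the division is a shift: 16x16 has N = 256 (>> 8), 32x16 N = 512 (>> 9), 32x32 N = 1024 (>> 10), 64x32 N = 2048 (>> 11), 64x64 N = 4096 (>> 12). The 16x16 case can square sum in 32-bit arithmetic; the wider blocks cast to int64_t first because their worst-case |sum| (255 * N) squares past 32 bits. A usage sketch with hypothetical buffer names:

    unsigned int sse;
    const unsigned int var =
        vp9_variance32x32_avx2(src, src_stride, ref, ref_stride, &sse);
    /* var == sse - sum*sum/1024 over the block; vp9_mse16x16_avx2, by
     * contrast, returns *sse itself and discards sum. */
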
diff --git a/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
deleted file mode 100644
index 3501cf1..0000000
--- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ /dev/null
@@ -1,510 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
-global sym(vp9_get_mb_ss_mmx) PRIVATE
-sym(vp9_get_mb_ss_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 8
-    ; end prolog
-
-        mov         rax, arg(0) ;src_ptr
-        mov         rcx, 16
-        pxor        mm4, mm4
-
-.NEXTROW:
-        movq        mm0, [rax]
-        movq        mm1, [rax+8]
-        movq        mm2, [rax+16]
-        movq        mm3, [rax+24]
-        pmaddwd     mm0, mm0
-        pmaddwd     mm1, mm1
-        pmaddwd     mm2, mm2
-        pmaddwd     mm3, mm3
-
-        paddd       mm4, mm0
-        paddd       mm4, mm1
-        paddd       mm4, mm2
-        paddd       mm4, mm3
-
-        add         rax, 32
-        dec         rcx
-        ja          .NEXTROW
-        movq        QWORD PTR [rsp], mm4
-
-        ;return sum[0]+sum[1];
-        movsxd      rax, dword ptr [rsp]
-        movsxd      rcx, dword ptr [rsp+4]
-        add         rax, rcx
-
-
-    ; begin epilog
-    add rsp, 8
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_get8x8var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp9_get8x8var_mmx) PRIVATE
-sym(vp9_get8x8var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mm5
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-
-        ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 5
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        ;              movq        mm4, [rbx + rdx]
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 6
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 7
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 8
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int
-;vp9_get4x4var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp9_get4x4var_mmx) PRIVATE
-sym(vp9_get4x4var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mm5
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int
-;vp9_get4x4sse_cs_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride
-;)
-global sym(vp9_get4x4sse_cs_mmx) PRIVATE
-sym(vp9_get4x4sse_cs_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-        punpcklbw   mm1, mm6
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        movq        mm0,    mm7                 ;
-        psrlq       mm7,    32
-
-        paddd       mm0,    mm7
-        movq        rax,    mm0
-
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
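
As with the SAD file, a plain-C model of the per-block helper this deleted file implemented: vp9_get8x8var_mmx accumulated the sum of differences (XSum in the comments) and the sum of squared differences (XXSum) for an 8x8 block. Illustrative names only:

    #include <stdint.h>

    static void get_var_8x8(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            unsigned int *sse, int *sum) {
      int i, j;
      *sse = 0;
      *sum = 0;
      for (i = 0; i < 8; ++i) {
        for (j = 0; j < 8; ++j) {
          const int diff = src[j] - ref[j];
          *sum += diff;         /* XSum  */
          *sse += diff * diff;  /* XXSum */
        }
        src += src_stride;
        ref += ref_stride;
      }
    }
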
diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
deleted file mode 100644
index 4830412..0000000
--- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ /dev/null
@@ -1,401 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_sse2
-;(
-;    short *src_ptr
-;)
-global sym(vp9_get_mb_ss_sse2) PRIVATE
-sym(vp9_get_mb_ss_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 1
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        mov         rax, arg(0) ;[src_ptr]
-        mov         rcx, 8
-        pxor        xmm4, xmm4
-
-.NEXTROW:
-        movdqa      xmm0, [rax]
-        movdqa      xmm1, [rax+16]
-        movdqa      xmm2, [rax+32]
-        movdqa      xmm3, [rax+48]
-        pmaddwd     xmm0, xmm0
-        pmaddwd     xmm1, xmm1
-        pmaddwd     xmm2, xmm2
-        pmaddwd     xmm3, xmm3
-
-        paddd       xmm0, xmm1
-        paddd       xmm2, xmm3
-        paddd       xmm4, xmm0
-        paddd       xmm4, xmm2
-
-        add         rax, 0x40
-        dec         rcx
-        ja          .NEXTROW
-
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,8
-        paddd       xmm4,xmm3
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,4
-        paddd       xmm4,xmm3
-        movq        rax,xmm4
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_get16x16var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp9_get16x16var_sse2) PRIVATE
-sym(vp9_get16x16var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        ; Prefetch data
-        lea             rcx,    [rax+rax*2]
-        prefetcht0      [rsi]
-        prefetcht0      [rsi+rax]
-        prefetcht0      [rsi+rax*2]
-        prefetcht0      [rsi+rcx]
-        lea             rbx,    [rsi+rax*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-        prefetcht0      [rbx+rax*2]
-        prefetcht0      [rbx+rcx]
-
-        lea             rcx,    [rdx+rdx*2]
-        prefetcht0      [rdi]
-        prefetcht0      [rdi+rdx]
-        prefetcht0      [rdi+rdx*2]
-        prefetcht0      [rdi+rcx]
-        lea             rbx,    [rdi+rdx*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-        prefetcht0      [rbx+rdx*2]
-        prefetcht0      [rbx+rcx]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
-        mov         rcx,            16
-
-.var16loop:
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-
-        prefetcht0      [rsi+rax*8]
-        prefetcht0      [rdi+rdx*8]
-
-        movdqa      xmm3,           xmm1
-        movdqa      xmm4,           xmm2
-
-
-        punpcklbw   xmm1,           xmm0
-        punpckhbw   xmm3,           xmm0
-
-        punpcklbw   xmm2,           xmm0
-        punpckhbw   xmm4,           xmm0
-
-
-        psubw       xmm1,           xmm2
-        psubw       xmm3,           xmm4
-
-        paddw       xmm7,           xmm1
-        pmaddwd     xmm1,           xmm1
-
-        paddw       xmm7,           xmm3
-        pmaddwd     xmm3,           xmm3
-
-        paddd       xmm6,           xmm1
-        paddd       xmm6,           xmm3
-
-        add         rsi,            rax
-        add         rdi,            rdx
-
-        sub         rcx,            1
-        jnz         .var16loop
-
-
-        movdqa      xmm1,           xmm6
-        pxor        xmm6,           xmm6
-
-        pxor        xmm5,           xmm5
-        punpcklwd   xmm6,           xmm7
-
-        punpckhwd   xmm5,           xmm7
-        psrad       xmm5,           16
-
-        psrad       xmm6,           16
-        paddd       xmm6,           xmm5
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddd       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddd       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movd DWORD PTR [rax],       xmm7
-        movd DWORD PTR [rdi],       xmm1
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
-;unsigned int vp9_get8x8var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp9_get8x8var_sse2) PRIVATE
-sym(vp9_get8x8var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        movq        xmm1,           QWORD PTR [rsi]
-        movq        xmm2,           QWORD PTR [rdi]
-
-        punpcklbw   xmm1,           xmm0
-        punpcklbw   xmm2,           xmm0
-
-        psubsw      xmm1,           xmm2
-        paddw       xmm7,           xmm1
-
-        pmaddwd     xmm1,           xmm1
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax * 2]
-        movq        xmm3,           QWORD PTR[rdi + rdx * 2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movdqa      xmm6,           xmm7
-        punpcklwd   xmm6,           xmm0
-
-        punpckhwd   xmm7,           xmm0
-        movdqa      xmm2,           xmm1
-
-        paddw       xmm6,           xmm7
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddw       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddw       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movq        rdx,            xmm7
-        movsx       rcx,            dx
-
-        mov  dword ptr [rax],       ecx
-        movd DWORD PTR [rdi],       xmm1
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
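The file deleted above was the last raw-yasm SSE2 variance helper; its replacements land as intrinsics in vp9_variance_sse2.c further down this patch. For orientation, a scalar sketch of what vp9_get_mb_ss computes, assuming (as the 8-iteration, 64-byte-stride loop implies) an input of 256 int16 coefficients for one 16x16 macroblock, with 32-bit accumulation matching the SIMD code:

    #include <stdint.h>

    /* Scalar reference for vp9_get_mb_ss: sum of squares of 256 int16
     * coefficients, accumulated in 32 bits like the SIMD versions. */
    static unsigned int get_mb_ss_ref(const int16_t *src) {
      unsigned int ss = 0;
      int i;
      for (i = 0; i < 256; ++i)
        ss += (unsigned int)(src[i] * src[i]);
      return ss;
    }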
diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c
deleted file mode 100644
index ce1c832..0000000
--- a/vp9/encoder/x86/vp9_variance_mmx.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse, int *sum);
-
-unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *SSE, int *sum);
-
-unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride,
-                                 const uint8_t *ref, int ref_stride,
-                                 unsigned int *sse) {
-  int sum;
-  vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 4);
-}
-
-unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride,
-                                 const uint8_t *ref, int ref_stride,
-                                 unsigned int *sse) {
-  int sum;
-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 6);
-}
-
-unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse) {
-  unsigned int sse0, sse1, sse2, sse3;
-  int sum0, sum1, sum2, sum3;
-
-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
-                    ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
-                    ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
-
-  *sse = sse0 + sse1 + sse2 + sse3;
-  return *sse;
-}
-
-
-unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  unsigned int sse0, sse1, sse2, sse3;
-  int sum0, sum1, sum2, sum3, sum;
-
-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
-                    ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
-                    ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
-
-  *sse = sse0 + sse1 + sse2 + sse3;
-  sum = sum0 + sum1 + sum2 + sum3;
-  return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  unsigned int sse0, sse1;
-  int sum0, sum1, sum;
-
-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
-
-  *sse = sse0 + sse1;
-  sum = sum0 + sum1;
-  return *sse - (((unsigned int)sum * sum) >> 7);
-}
-
-
-unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  unsigned int sse0, sse1;
-  int sum0, sum1, sum;
-
-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
-                    ref + 8 * ref_stride, ref_stride, &sse1, &sum1);
-
-  *sse = sse0 + sse1;
-  sum = sum0 + sum1;
-  return *sse - (((unsigned int)sum * sum) >> 7);
-}
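Every wrapper in the file deleted above (and in its SSE2 replacement below) reduces to the identity var = SSE - Sum^2 / N, where N = w * h is a power of two, so the division becomes a shift: >> 4 for 4x4, >> 6 for 8x8, >> 7 for 8x16 and 16x8, >> 8 for 16x16. A minimal sketch of that shared final step:

    /* var = SSE - Sum^2 / N with N = 1 << log2_count.
     * E.g. 16x16: N = 256, hence the ">> 8" used above. */
    static unsigned int block_variance(unsigned int sse, int sum,
                                       int log2_count) {
      return sse - (((unsigned int)sum * sum) >> log2_count);
    }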
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index e935a23..b4d2b0a 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <emmintrin.h>  // SSE2
+
 #include "./vpx_config.h"
 
 #include "vp9/encoder/vp9_variance.h"
@@ -17,18 +19,137 @@
                                        const unsigned char *ref, int ref_stride,
                                        unsigned int *sse, int *sum);
 
-unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride,
-                               const unsigned char *ref, int ref_stride,
-                               unsigned int *sse, int *sum);
+unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
+  __m128i vsum = _mm_setzero_si128();
+  int i;
 
+  for (i = 0; i < 32; ++i) {
+    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+    src += 8;
+  }
 
-unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
-                                const unsigned char *ref, int ref_stride,
-                                unsigned int *sse, int *sum);
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  return _mm_cvtsi128_si32(vsum);
+}
 
-unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
-                                  unsigned int *sse, int *sum);
+#define READ64(p, stride, i) \
+  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
+      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+
+unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride,
+                                const uint8_t *ref, int ref_stride,
+                                unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
+  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
+  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
+  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
+  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+  const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+  // sum
+  __m128i vsum = _mm_add_epi16(diff0, diff1);
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+  // sse
+  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
+                       _mm_madd_epi16(diff1, diff1));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  *sse = _mm_cvtsi128_si32(vsum);
+
+  return 0;
+}
+
+unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
+                                const uint8_t *ref, int ref_stride,
+                                unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
+  __m128i vsse = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 8; i += 2) {
+    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(src + i * src_stride)), zero);
+    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(ref + i * ref_stride)), zero);
+    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(src + (i + 1) * src_stride)), zero);
+    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+    vsum = _mm_add_epi16(vsum, diff0);
+    vsum = _mm_add_epi16(vsum, diff1);
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+  }
+
+  // sum
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+  // sse
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+  *sse = _mm_cvtsi128_si32(vsse);
+
+  return 0;
+}
+
+unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
+                                  unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
+  __m128i vsse = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    const __m128i s = _mm_loadu_si128((const __m128i *)src);
+    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+
+    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+    vsum = _mm_add_epi16(vsum, diff0);
+    vsum = _mm_add_epi16(vsum, diff1);
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  // sum
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
+         (int16_t)_mm_extract_epi16(vsum, 1);
+
+  // sse
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+  *sse = _mm_cvtsi128_si32(vsse);
+
+  return 0;
+}
+
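The three kernels above share one reduction idiom: _mm_srli_si128 shifts the top half of the register onto the bottom half, so each add halves the number of live lanes. Note that vp9_get16x16var_sse2 stops one step early and adds lanes 0 and 1 in scalar code, because a full 16x16 sum of byte differences can reach +/-65280 and no longer fits an int16 lane. A standalone sketch of the idiom:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Horizontal sum of eight int16 lanes: fold the top half of the
     * register onto the bottom half until one lane holds the total. */
    static int hsum_epi16(__m128i v) {
      v = _mm_add_epi16(v, _mm_srli_si128(v, 8));  /* 8 live lanes -> 4 */
      v = _mm_add_epi16(v, _mm_srli_si128(v, 4));  /* 4 -> 2 */
      v = _mm_add_epi16(v, _mm_srli_si128(v, 2));  /* 2 -> 1 */
      return (int16_t)_mm_extract_epi16(v, 0);
    }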
 
 static void variance_sse2(const unsigned char *src, int src_stride,
                           const unsigned char *ref, int ref_stride,
@@ -55,8 +176,7 @@
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 4,
-                sse, &sum, vp9_get4x4var_mmx, 4);
+  vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
   return *sse - (((unsigned int)sum * sum) >> 4);
 }
 
@@ -65,7 +185,7 @@
                                   unsigned int *sse) {
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
-                sse, &sum, vp9_get4x4var_mmx, 4);
+                sse, &sum, vp9_get4x4var_sse2, 4);
   return *sse - (((unsigned int)sum * sum) >> 5);
 }
 
@@ -74,7 +194,7 @@
                                   unsigned int *sse) {
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
-                sse, &sum, vp9_get4x4var_mmx, 4);
+                sse, &sum, vp9_get4x4var_sse2, 4);
   return *sse - (((unsigned int)sum * sum) >> 5);
 }
 
@@ -82,8 +202,7 @@
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
-                sse, &sum, vp9_get8x8var_sse2, 8);
+  vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
   return *sse - (((unsigned int)sum * sum) >> 6);
 }
 
@@ -109,17 +228,8 @@
                                     const unsigned char *ref, int ref_stride,
                                     unsigned int *sse) {
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride,
-                               const unsigned char *ref, int ref_stride,
-                               unsigned int *sse) {
-  int sum;
   vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse;
+  return *sse - (((unsigned int)sum * sum) >> 8);
 }
 
 unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
@@ -176,6 +286,34 @@
   return *sse - (((int64_t)sum * sum) >> 11);
 }
 
+unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse) {
+  vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
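The four MSE wrappers above are deliberately thin: in this API the vp9_mseWxH functions return the raw SSE, so each one lets the matching variance kernel fill *sse and returns it unchanged. A hedged caller-side sketch (prototype as declared in this patch):

    #include <stdint.h>

    unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   unsigned int *sse);

    /* The return value equals *sse; the squared-mean correction that
     * variance subtracts is intentionally skipped for MSE. */
    static unsigned int mse_of_16x16_block(const uint8_t *src,
                                           const uint8_t *ref) {
      unsigned int sse;
      return vp9_mse16x16_sse2(src, 16, ref, 16, &sse);
    }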
 #define DECL(w, opt) \
 int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                         ptrdiff_t src_stride, \
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 8e3e885..90f0342 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -80,7 +80,6 @@
 VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 endif
 
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index bf8eec7..0851d8a 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -21,7 +21,6 @@
 #include "vp9/vp9_iface_common.h"
 
 struct vp9_extracfg {
-  struct vpx_codec_pkt_list *pkt_list;
   int                         cpu_used;  // available cpu percentage in 1/16
   unsigned int                enable_auto_alt_ref;
   unsigned int                noise_sensitivity;
@@ -31,7 +30,6 @@
   unsigned int                tile_rows;
   unsigned int                arnr_max_frames;
   unsigned int                arnr_strength;
-  unsigned int                arnr_type;
   vp8e_tuning                 tuning;
   unsigned int                cq_level;  // constrained quality level
   unsigned int                rc_max_intra_bitrate_pct;
@@ -39,41 +37,29 @@
   unsigned int                frame_parallel_decoding_mode;
   AQ_MODE                     aq_mode;
   unsigned int                frame_periodic_boost;
-  BIT_DEPTH                   bit_depth;
+  vpx_bit_depth_t             bit_depth;
   vp9e_tune_content           content;
 };
 
-struct extraconfig_map {
-  unsigned int usage;
-  struct vp9_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] = {
-  {
-    0,
-    { // NOLINT
-      NULL,
-      0,                          // cpu_used
-      1,                          // enable_auto_alt_ref
-      0,                          // noise_sensitivity
-      0,                          // sharpness
-      0,                          // static_thresh
-      0,                          // tile_columns
-      0,                          // tile_rows
-      7,                          // arnr_max_frames
-      5,                          // arnr_strength
-      3,                          // arnr_type
-      VP8_TUNE_PSNR,              // tuning
-      10,                         // cq_level
-      0,                          // rc_max_intra_bitrate_pct
-      0,                          // lossless
-      0,                          // frame_parallel_decoding_mode
-      NO_AQ,                      // aq_mode
-      0,                          // frame_periodic_delta_q
-      BITS_8,                     // Bit depth
-      VP9E_CONTENT_DEFAULT        // content
-    }
-  }
+static struct vp9_extracfg default_extra_cfg = {
+  0,                          // cpu_used
+  1,                          // enable_auto_alt_ref
+  0,                          // noise_sensitivity
+  0,                          // sharpness
+  0,                          // static_thresh
+  0,                          // tile_columns
+  0,                          // tile_rows
+  7,                          // arnr_max_frames
+  5,                          // arnr_strength
+  VP8_TUNE_PSNR,              // tuning
+  10,                         // cq_level
+  0,                          // rc_max_intra_bitrate_pct
+  0,                          // lossless
+  0,                          // frame_parallel_decoding_mode
+  NO_AQ,                      // aq_mode
+  0,                          // frame_periodic_boost
+  VPX_BITS_8,                 // Bit depth
+  VP9E_CONTENT_DEFAULT        // content
 };
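One caveat the new single default table keeps from the old map: the initializer is positional, so the trailing comments are the only guard tying values to struct vp9_extracfg fields, and reordering the struct silently shifts every default (the frame_periodic_boost entry shows how easily the labels drift). A hedged C99 sketch of the same defaults with designated initializers, which cannot misalign; whether C99 initializers are acceptable to every supported compiler is an assumption here:

    /* Sketch only; members not listed are zero-initialized, matching the
     * positional table above. */
    static const struct vp9_extracfg default_extra_cfg_sketch = {
      .cpu_used = 0,
      .enable_auto_alt_ref = 1,
      .arnr_max_frames = 7,
      .arnr_strength = 5,
      .tuning = VP8_TUNE_PSNR,
      .cq_level = 10,
      .aq_mode = NO_AQ,
      .bit_depth = VPX_BITS_8,
      .content = VP9E_CONTENT_DEFAULT,
    };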
 
 struct vpx_codec_alg_priv {
@@ -177,20 +163,8 @@
   }
 
   RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS);
-
-#if CONFIG_SPATIAL_SVC
-  if (cfg->ss_number_layers > 1) {
-    unsigned int i, alt_ref_sum = 0;
-    for (i = 0; i < cfg->ss_number_layers; ++i) {
-      if (cfg->ss_enable_auto_alt_ref[i])
-        ++alt_ref_sum;
-    }
-    if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers)
-      ERROR("Not enough ref buffers for svc alt ref frames");
-  }
-#endif
-
   RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
+
   if (cfg->ts_number_layers > 1) {
     unsigned int i;
     for (i = 1; i < cfg->ts_number_layers; ++i)
@@ -203,6 +177,28 @@
         ERROR("ts_rate_decimator factors are not powers of 2");
   }
 
+#if CONFIG_SPATIAL_SVC
+  if (cfg->ss_number_layers * cfg->ts_number_layers > REF_FRAMES)
+    ERROR("Too many layers. Maximum 8 layers could be set");
+
+  if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) &&
+      cfg->g_pass == VPX_RC_LAST_PASS) {
+    unsigned int i, alt_ref_sum = 0;
+    for (i = 0; i < cfg->ss_number_layers; ++i) {
+      if (cfg->ss_enable_auto_alt_ref[i])
+        ++alt_ref_sum;
+    }
+    if (alt_ref_sum >
+        REF_FRAMES - cfg->ss_number_layers * cfg->ts_number_layers)
+      ERROR("Not enough ref buffers for svc alt ref frames");
+    if ((cfg->ss_number_layers > 3 ||
+         cfg->ss_number_layers * cfg->ts_number_layers > 4) &&
+        cfg->g_error_resilient == 0)
+    ERROR("Multiple frame context are not supported for more than 3 spatial "
+          "layers or more than 4 spatial x temporal layers");
+  }
+#endif
+
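Worked numbers for the checks above, with REF_FRAMES equal to 8 in VP9: 2 spatial x 3 temporal layers occupy 6 of the 8 reference buffers, so at most 8 - 6 = 2 layers may enable auto alt-ref; and since 6 layers exceeds the 4-layer combined limit, error resilience must be enabled when multiple frame contexts are unavailable. Restated as assertions:

    #include <assert.h>

    static void svc_layer_budget_example(void) {
      const int kRefFrames = 8;   /* REF_FRAMES in VP9 */
      const int ss = 2, ts = 3;   /* spatial x temporal layers */
      const int alt_ref_sum = 2;  /* layers with auto alt-ref enabled */
      assert(ss * ts <= kRefFrames);                /* 6 <= 8: passes */
      assert(alt_ref_sum <= kRefFrames - ss * ts);  /* 2 <= 2: passes */
    }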
   // VP9 does not support a lower bound on the keyframe interval in
   // automatic keyframe placement mode.
   if (cfg->kf_mode != VPX_KF_DISABLED &&
@@ -219,8 +215,9 @@
   RANGE_CHECK_HI(extra_cfg, sharpness, 7);
   RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
   RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
-  RANGE_CHECK(extra_cfg, arnr_type, 1, 3);
   RANGE_CHECK(extra_cfg, cq_level, 0, 63);
+  RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12);
+  RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
   RANGE_CHECK(extra_cfg, content,
               VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1);
 
@@ -239,7 +236,7 @@
     if (cfg->rc_twopass_stats_in.sz % packet_sz)
       ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
 
-    if (cfg->ss_number_layers > 1) {
+    if (cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) {
       int i;
       unsigned int n_packets_per_layer[VPX_SS_MAX_LAYERS] = {0};
 
@@ -279,12 +276,16 @@
     }
   }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
+  if (cfg->g_profile > (unsigned int)PROFILE_1)
+    ERROR("Profile > 1 not supported in this build configuration");
+#endif
   if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
-      extra_cfg->bit_depth > BITS_8)
-    ERROR("High bit-depth not supported in profile < 2");
+      extra_cfg->bit_depth > VPX_BITS_8)
+    ERROR("Codec high bit-depth not supported in profile < 2");
   if (cfg->g_profile > (unsigned int)PROFILE_1 &&
-      extra_cfg->bit_depth == BITS_8)
-    ERROR("Bit-depth 8 not supported in profile > 1");
+      extra_cfg->bit_depth == VPX_BITS_8)
+    ERROR("Codec bit-depth 8 not supported in profile > 1");
 
   return VPX_CODEC_OK;
 }
@@ -316,6 +317,9 @@
     case VPX_IMG_FMT_I420: return 12;
     case VPX_IMG_FMT_I422: return 16;
     case VPX_IMG_FMT_I444: return 24;
+    case VPX_IMG_FMT_I42016: return 24;
+    case VPX_IMG_FMT_I42216: return 32;
+    case VPX_IMG_FMT_I44416: return 48;
     default: assert(0 && "Invalid image format"); break;
   }
   return 0;
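The new 16-bit entries follow from the sampling arithmetic: bits per pixel = luma bits + 2 chroma planes at 1/subsampling density, so I420 gives 8 + 2 + 2 = 12 and its 16-bit-storage variant doubles that to 24; 4:2:2 keeps half the chroma (16 -> 32) and 4:4:4 keeps all of it (24 -> 48). As a sketch:

    /* subsampling divisor: 4 for 4:2:0, 2 for 4:2:2, 1 for 4:4:4. */
    static int bits_per_pixel(int bits_per_sample, int subsampling) {
      return bits_per_sample + 2 * bits_per_sample / subsampling;
    }
    /* bits_per_pixel(8, 4)  == 12  (I420)
     * bits_per_pixel(16, 4) == 24  (I42016)
     * bits_per_pixel(16, 2) == 32  (I42216)
     * bits_per_pixel(16, 1) == 48  (I44416) */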
@@ -325,26 +329,27 @@
     VP9EncoderConfig *oxcf,
     const vpx_codec_enc_cfg_t *cfg,
     const struct vp9_extracfg *extra_cfg) {
+  const int is_vbr = cfg->rc_end_usage == VPX_VBR;
   oxcf->profile = cfg->g_profile;
   oxcf->width   = cfg->g_w;
   oxcf->height  = cfg->g_h;
   oxcf->bit_depth = extra_cfg->bit_depth;
+  oxcf->input_bit_depth = cfg->g_input_bit_depth;
   // guess a frame rate if out of whack, use 30
-  oxcf->framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
-  if (oxcf->framerate > 180)
-    oxcf->framerate = 30;
+  oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+  if (oxcf->init_framerate > 180)
+    oxcf->init_framerate = 30;
+
+  oxcf->mode = GOOD;
 
   switch (cfg->g_pass) {
     case VPX_RC_ONE_PASS:
-      oxcf->mode = ONE_PASS_GOOD;
       oxcf->pass = 0;
       break;
     case VPX_RC_FIRST_PASS:
-      oxcf->mode = TWO_PASS_FIRST;
       oxcf->pass = 1;
       break;
     case VPX_RC_LAST_PASS:
-      oxcf->mode = TWO_PASS_SECOND_BEST;
       oxcf->pass = 2;
       break;
   }
@@ -371,9 +376,9 @@
   oxcf->scaled_frame_width       = cfg->rc_scaled_width;
   oxcf->scaled_frame_height      = cfg->rc_scaled_height;
 
-  oxcf->maximum_buffer_size_ms   = cfg->rc_buf_sz;
-  oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
-  oxcf->optimal_buffer_level_ms  = cfg->rc_buf_optimal_sz;
+  oxcf->maximum_buffer_size_ms   = is_vbr ? 240000 : cfg->rc_buf_sz;
+  oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+  oxcf->optimal_buffer_level_ms  = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
 
   oxcf->drop_frames_water_mark   = cfg->rc_dropframe_thresh;
 
@@ -393,7 +398,6 @@
   oxcf->sharpness              =  extra_cfg->sharpness;
 
   oxcf->two_pass_stats_in      =  cfg->rc_twopass_stats_in;
-  oxcf->output_pkt_list        =  extra_cfg->pkt_list;
 
 #if CONFIG_FP_MB_STATS
   oxcf->firstpass_mb_stats_in  = cfg->rc_firstpass_mb_stats_in;
@@ -401,7 +405,6 @@
 
   oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
   oxcf->arnr_strength   = extra_cfg->arnr_strength;
-  oxcf->arnr_type       = extra_cfg->arnr_type;
 
   oxcf->tuning = extra_cfg->tuning;
   oxcf->content = extra_cfg->content;
@@ -428,6 +431,9 @@
     }
   } else if (oxcf->ss_number_layers == 1) {
     oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
+#if CONFIG_SPATIAL_SVC
+    oxcf->ss_play_alternate[0] = extra_cfg->enable_auto_alt_ref;
+#endif
   }
 
   oxcf->ts_number_layers = cfg->ts_number_layers;
@@ -597,9 +603,9 @@
 
 static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx,
                                           va_list args) {
-  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.arnr_type = CAST(VP8E_SET_ARNR_TYPE, args);
-  return update_extra_cfg(ctx, &extra_cfg);
+  (void)ctx;
+  (void)args;
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx,
@@ -659,52 +665,36 @@
   (void)data;
 
   if (ctx->priv == NULL) {
-    int i;
-    vpx_codec_enc_cfg_t *cfg;
-    struct vpx_codec_alg_priv *priv = calloc(1, sizeof(*priv));
-
+    vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
     if (priv == NULL)
       return VPX_CODEC_MEM_ERROR;
 
-    ctx->priv = &priv->base;
-    ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
-    ctx->priv->alg_priv = priv;
+    ctx->priv = (vpx_codec_priv_t *)priv;
     ctx->priv->init_flags = ctx->init_flags;
     ctx->priv->enc.total_encoders = 1;
 
     if (ctx->config.enc) {
       // Update the reference to the config structure to an internal copy.
-      ctx->priv->alg_priv->cfg = *ctx->config.enc;
-      ctx->config.enc = &ctx->priv->alg_priv->cfg;
+      priv->cfg = *ctx->config.enc;
+      ctx->config.enc = &priv->cfg;
     }
 
-    cfg = &ctx->priv->alg_priv->cfg;
-
-    // Select the extra vp6 configuration table based on the current
-    // usage value. If the current usage value isn't found, use the
-    // values for usage case 0.
-    for (i = 0;
-         extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-         ++i) {}
-
-    priv->extra_cfg = extracfg_map[i].cfg;
-    priv->extra_cfg.pkt_list = &priv->pkt_list.head;
-
+    priv->extra_cfg = default_extra_cfg;
     vp9_initialize_enc();
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
 
     if (res == VPX_CODEC_OK) {
-      VP9_COMP *cpi;
-      set_encoder_config(&ctx->priv->alg_priv->oxcf,
-                         &ctx->priv->alg_priv->cfg,
-                         &ctx->priv->alg_priv->extra_cfg);
-      cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
-      if (cpi == NULL)
+      set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+#if CONFIG_VP9_HIGHBITDEPTH
+      priv->oxcf.use_highbitdepth =
+          (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+#endif
+      priv->cpi = vp9_create_compressor(&priv->oxcf);
+      if (priv->cpi == NULL)
         res = VPX_CODEC_MEM_ERROR;
       else
-        ctx->priv->alg_priv->cpi = cpi;
+        priv->cpi->output_pkt_list = &priv->pkt_list.head;
     }
   }
 
@@ -714,35 +704,40 @@
 static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
   free(ctx->cx_data);
   vp9_remove_compressor(ctx->cpi);
-  free(ctx);
+  vpx_free(ctx);
   return VPX_CODEC_OK;
 }
 
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
                                     unsigned long duration,
                                     unsigned long deadline) {
-  // Use best quality mode if no deadline is given.
-  MODE new_qc = ONE_PASS_BEST;
+  MODE new_mode = BEST;
 
-  if (deadline) {
-    // Convert duration parameter from stream timebase to microseconds
-    const uint64_t duration_us = (uint64_t)duration * 1000000 *
-                               (uint64_t)ctx->cfg.g_timebase.num /
-                               (uint64_t)ctx->cfg.g_timebase.den;
+  switch (ctx->cfg.g_pass) {
+    case VPX_RC_ONE_PASS:
+      if (deadline > 0) {
+        const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg;
 
-    // If the deadline is more that the duration this frame is to be shown,
-    // use good quality mode. Otherwise use realtime mode.
-    new_qc = (deadline > duration_us) ? ONE_PASS_GOOD : REALTIME;
+        // Convert duration parameter from stream timebase to microseconds.
+        const uint64_t duration_us = (uint64_t)duration * 1000000 *
+            (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den;
+
+        // If the deadline is more than the duration this frame is to be shown,
+        // use good quality mode. Otherwise use realtime mode.
+        new_mode = (deadline > duration_us) ? GOOD : REALTIME;
+      } else {
+        new_mode = BEST;
+      }
+      break;
+    case VPX_RC_FIRST_PASS:
+      break;
+    case VPX_RC_LAST_PASS:
+      new_mode = deadline > 0 ? GOOD : BEST;
+      break;
   }
 
-  if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
-    new_qc = TWO_PASS_FIRST;
-  else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
-    new_qc = (new_qc == ONE_PASS_BEST) ? TWO_PASS_SECOND_BEST
-                                          : TWO_PASS_SECOND_GOOD;
-
-  if (ctx->oxcf.mode != new_qc) {
-    ctx->oxcf.mode = new_qc;
+  if (ctx->oxcf.mode != new_mode) {
+    ctx->oxcf.mode = new_mode;
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
 }
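A worked instance of the deadline heuristic above: with g_timebase = {1, 30}, one frame spans 1 * 1000000 * 1 / 30 = 33333 microseconds, so a per-frame deadline above ~33 ms selects GOOD and a tighter one drops to REALTIME, while deadline == 0 (unconstrained) selects BEST. The conversion in isolation:

    #include <stdint.h>

    /* Stream-timebase ticks -> microseconds, as computed above. */
    static uint64_t duration_to_us(unsigned long duration,
                                   int tb_num, int tb_den) {
      return (uint64_t)duration * 1000000 * (uint64_t)tb_num /
             (uint64_t)tb_den;
    }
    /* duration_to_us(1, 1, 30) == 33333 */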
@@ -821,6 +816,23 @@
   return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
 }
 
+static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
+                                                   unsigned int lib_flags) {
+  vpx_codec_frame_flags_t flags = lib_flags << 16;
+
+  if (lib_flags & FRAMEFLAGS_KEY
+#if CONFIG_SPATIAL_SVC
+      || (is_two_pass_svc(cpi) && cpi->svc.layer_context[0].is_key_frame)
+#endif
+        )
+    flags |= VPX_FRAME_IS_KEY;
+
+  if (cpi->droppable)
+    flags |= VPX_FRAME_IS_DROPPABLE;
+
+  return flags;
+}
+
 static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
                                       const vpx_image_t *img,
                                       vpx_codec_pts_t pts,
@@ -828,18 +840,19 @@
                                       vpx_enc_frame_flags_t flags,
                                       unsigned long deadline) {
   vpx_codec_err_t res = VPX_CODEC_OK;
+  VP9_COMP *const cpi = ctx->cpi;
   const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
 
   if (img != NULL) {
     res = validate_img(ctx, img);
     // TODO(jzern) the checks related to cpi's validity should be treated as a
     // failure condition; encoder setup is currently done fully in init().
-    if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) {
+    if (res == VPX_CODEC_OK && cpi != NULL && ctx->cx_data == NULL) {
       // There's no codec control for multiple alt-refs so check the encoder
       // instance for its status to determine the compressed data size.
       ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
                         get_image_bps(img) / 8 *
-                        (ctx->cpi->multi_arf_allowed ? 8 : 2);
+                        (cpi->multi_arf_allowed ? 8 : 2);
       if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096;
 
       ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
@@ -859,7 +872,7 @@
     return VPX_CODEC_INVALID_PARAM;
   }
 
-  vp9_apply_encoding_flags(ctx->cpi, flags);
+  vp9_apply_encoding_flags(cpi, flags);
 
   // Handle fixed keyframe intervals
   if (ctx->cfg.kf_mode == VPX_KF_AUTO &&
@@ -871,7 +884,7 @@
   }
 
   // Initialize the encoder instance on the first frame.
-  if (res == VPX_CODEC_OK && ctx->cpi != NULL) {
+  if (res == VPX_CODEC_OK && cpi != NULL) {
     unsigned int lib_flags = 0;
     YV12_BUFFER_CONFIG sd;
     int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
@@ -882,16 +895,15 @@
 
     // Set up internal flags
     if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
-      ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+      cpi->b_calculate_psnr = 1;
 
     if (img != NULL) {
       res = image2yuvconfig(img, &sd);
 
       // Store the original flags in to the frame buffer. Will extract the
       // key frame flag when we actually encode this frame.
-      if (vp9_receive_raw_frame(ctx->cpi, flags,
+      if (vp9_receive_raw_frame(cpi, flags,
                                 &sd, dst_time_stamp, dst_end_time_stamp)) {
-        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
         res = update_error_state(ctx, &cpi->common.error);
       }
     }
@@ -916,22 +928,21 @@
     }
 
     while (cx_data_sz >= ctx->cx_data_sz / 2 &&
-           -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
+           -1 != vp9_get_compressed_data(cpi, &lib_flags, &size,
                                          cx_data, &dst_time_stamp,
                                          &dst_end_time_stamp, !img)) {
       if (size) {
-        VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
         vpx_codec_cx_pkt_t pkt;
 
 #if CONFIG_SPATIAL_SVC
-        if (is_spatial_svc(cpi))
+        if (is_two_pass_svc(cpi))
           cpi->svc.layer_context[cpi->svc.spatial_layer_id].layer_size += size;
 #endif
 
         // Pack invisible frames with the next visible frame
-        if (cpi->common.show_frame == 0
+        if (!cpi->common.show_frame
 #if CONFIG_SPATIAL_SVC
-            || (is_spatial_svc(cpi) &&
+            || (is_two_pass_svc(cpi) &&
                 cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
 #endif
             ) {
@@ -951,30 +962,7 @@
         pkt.data.frame.duration =
            (unsigned long)ticks_to_timebase_units(timebase,
                dst_end_time_stamp - dst_time_stamp);
-        pkt.data.frame.flags = lib_flags << 16;
-
-        if (lib_flags & FRAMEFLAGS_KEY
-#if CONFIG_SPATIAL_SVC
-            || (is_spatial_svc(cpi) &&
-                cpi->svc.layer_context[0].is_key_frame)
-#endif
-            )
-          pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
-
-        if (cpi->common.show_frame == 0) {
-          pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
-
-          // This timestamp should be as close as possible to the
-          // prior PTS so that if a decoder uses pts to schedule when
-          // to do this, we start right after last frame was decoded.
-          // Invisible frames have no duration.
-          pkt.data.frame.pts =
-              ticks_to_timebase_units(timebase, cpi->last_time_stamp_seen) + 1;
-          pkt.data.frame.duration = 0;
-        }
-
-        if (cpi->droppable)
-          pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+        pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
 
         if (ctx->pending_cx_data) {
           ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
@@ -996,15 +984,21 @@
         cx_data += size;
         cx_data_sz -= size;
 #if CONFIG_SPATIAL_SVC
-        if (is_spatial_svc(cpi)) {
-          vpx_codec_cx_pkt_t pkt = {0};
+        if (is_two_pass_svc(cpi)) {
+          vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
           int i;
-          pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+          vp9_zero(pkt_sizes);
+          vp9_zero(pkt_psnr);
+          pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+          pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR;
           for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
-            pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size;
-            cpi->svc.layer_context[i].layer_size = 0;
+            LAYER_CONTEXT *lc = &cpi->svc.layer_context[i];
+            pkt_sizes.data.layer_sizes[i] = lc->layer_size;
+            pkt_psnr.data.layer_psnr[i] = lc->psnr_pkt;
+            lc->layer_size = 0;
           }
-          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes);
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
         }
 #endif
       }
@@ -1285,6 +1279,9 @@
 
       320,                // g_width
       240,                // g_height
+      VPX_BITS_8,         // g_bit_depth
+      8,                  // g_input_bit_depth
+
       {1, 30},            // g_timebase
 
       0,                  // g_error_resilient
@@ -1345,16 +1342,19 @@
 CODEC_INTERFACE(vpx_codec_vp9_cx) = {
   "WebM Project VP9 Encoder" VERSION_STRING,
   VPX_CODEC_INTERNAL_ABI_VERSION,
+#if CONFIG_VP9_HIGHBITDEPTH
+  VPX_CODEC_CAP_HIGHBITDEPTH |
+#endif
   VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,  // vpx_codec_caps_t
   encoder_init,       // vpx_codec_init_fn_t
   encoder_destroy,    // vpx_codec_destroy_fn_t
   encoder_ctrl_maps,  // vpx_codec_ctrl_fn_map_t
   {  // NOLINT
-    NOT_IMPLEMENTED,  // vpx_codec_peek_si_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_si_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_decode_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_frame_get_fn_t
-    NOT_IMPLEMENTED   // vpx_codec_set_fb_fn_t
+    NULL,  // vpx_codec_peek_si_fn_t
+    NULL,  // vpx_codec_get_si_fn_t
+    NULL,  // vpx_codec_decode_fn_t
+    NULL,  // vpx_codec_frame_get_fn_t
+    NULL   // vpx_codec_set_fb_fn_t
   },
   {  // NOLINT
     1,                      // 1 cfg map
@@ -1362,8 +1362,8 @@
     encoder_encode,         // vpx_codec_encode_fn_t
     encoder_get_cxdata,     // vpx_codec_get_cx_data_fn_t
     encoder_set_config,     // vpx_codec_enc_config_set_fn_t
-    NOT_IMPLEMENTED,        // vpx_codec_get_global_headers_fn_t
+    NULL,                   // vpx_codec_get_global_headers_fn_t
     encoder_get_preview,    // vpx_codec_get_preview_frame_fn_t
-    NOT_IMPLEMENTED         // vpx_codec_enc_mr_get_mem_loc_fn_t
+    NULL                    // vpx_codec_enc_mr_get_mem_loc_fn_t
   }
 };
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 4372ac9..393c66e 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -58,29 +58,22 @@
   (void)data;
 
   if (!ctx->priv) {
-    vpx_codec_alg_priv_t *alg_priv = vpx_memalign(32, sizeof(*alg_priv));
-    if (alg_priv == NULL)
+    vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
+    if (priv == NULL)
       return VPX_CODEC_MEM_ERROR;
 
-    vp9_zero(*alg_priv);
-
-    ctx->priv = (vpx_codec_priv_t *)alg_priv;
-    ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
-    ctx->priv->alg_priv = alg_priv;
-    ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+    ctx->priv = (vpx_codec_priv_t *)priv;
     ctx->priv->init_flags = ctx->init_flags;
-    ctx->priv->alg_priv->flushed = 0;
-    ctx->priv->alg_priv->frame_parallel_decode =
-        (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
 
-    // Disable frame parallel decoding for now.
-    ctx->priv->alg_priv->frame_parallel_decode = 0;
+    priv->si.sz = sizeof(priv->si);
+    priv->flushed = 0;
+    priv->frame_parallel_decode =
+        (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
+    priv->frame_parallel_decode = 0;  // Disable for now
 
     if (ctx->config.dec) {
-      // Update the reference to the config structure to an internal copy.
-      ctx->priv->alg_priv->cfg = *ctx->config.dec;
-      ctx->config.dec = &ctx->priv->alg_priv->cfg;
+      priv->cfg = *ctx->config.dec;
+      ctx->config.dec = &priv->cfg;
     }
   }
 
@@ -332,81 +325,6 @@
   return VPX_CODEC_OK;
 }
 
-static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
-                                  void *decrypt_state,
-                                  const uint8_t *data) {
-  if (decrypt_cb) {
-    uint8_t marker;
-    decrypt_cb(decrypt_state, data, &marker, 1);
-    return marker;
-  }
-  return *data;
-}
-
-static vpx_codec_err_t parse_superframe_index(const uint8_t *data,
-                                              size_t data_sz,
-                                              uint32_t sizes[8], int *count,
-                                              vpx_decrypt_cb decrypt_cb,
-                                              void *decrypt_state) {
-  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
-  // it is a super frame index. If the last byte of real video compression
-  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
-  // not the associated matching marker byte at the front of the index we have
-  // an invalid bitstream and need to return an error.
-
-  uint8_t marker;
-
-  assert(data_sz);
-  marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
-  *count = 0;
-
-  if ((marker & 0xe0) == 0xc0) {
-    const uint32_t frames = (marker & 0x7) + 1;
-    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-    const size_t index_sz = 2 + mag * frames;
-
-    // This chunk is marked as having a superframe index but doesn't have
-    // enough data for it, thus it's an invalid superframe index.
-    if (data_sz < index_sz)
-      return VPX_CODEC_CORRUPT_FRAME;
-
-    {
-      const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
-                                          data + data_sz - index_sz);
-
-      // This chunk is marked as having a superframe index but doesn't have
-      // the matching marker byte at the front of the index therefore it's an
-      // invalid chunk.
-      if (marker != marker2)
-        return VPX_CODEC_CORRUPT_FRAME;
-    }
-
-    {
-      // Found a valid superframe index.
-      uint32_t i, j;
-      const uint8_t *x = &data[data_sz - index_sz + 1];
-
-      // Frames has a maximum of 8 and mag has a maximum of 4.
-      uint8_t clear_buffer[32];
-      assert(sizeof(clear_buffer) >= frames * mag);
-      if (decrypt_cb) {
-        decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
-        x = clear_buffer;
-      }
-
-      for (i = 0; i < frames; ++i) {
-        uint32_t this_sz = 0;
-
-        for (j = 0; j < mag; ++j)
-          this_sz |= (*x++) << (j * 8);
-        sizes[i] = this_sz;
-      }
-      *count = frames;
-    }
-  }
-  return VPX_CODEC_OK;
-}
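The parser moves to a shared vp9_parse_superframe_index (used below); the bitstream format is unchanged. Worked example of the 110MMFFF marker byte: 0xc9 gives frames = (0xc9 & 0x7) + 1 = 2, mag = ((0xc9 >> 3) & 0x3) + 1 = 2 bytes per size, and index_sz = 2 + 2 * 2 = 6. A sketch of the walk, minus the decrypt-callback plumbing the deleted version carried:

    #include <stddef.h>
    #include <stdint.h>

    /* Returns the frame count, 0 for a plain (non-super) frame, or -1 for
     * a truncated index or a missing leading marker copy. */
    static int parse_superframe_index_sketch(const uint8_t *data,
                                             size_t data_sz,
                                             uint32_t sizes[8]) {
      const uint8_t marker = data[data_sz - 1];
      const uint32_t frames = (marker & 0x7) + 1;
      const uint32_t mag = ((marker >> 3) & 0x3) + 1;
      const size_t index_sz = 2 + mag * frames;
      const uint8_t *x;
      uint32_t i, j;
      if ((marker & 0xe0) != 0xc0)
        return 0;
      if (data_sz < index_sz || data[data_sz - index_sz] != marker)
        return -1;
      x = data + data_sz - index_sz + 1;
      for (i = 0; i < frames; ++i) {
        uint32_t sz = 0;
        for (j = 0; j < mag; ++j)
          sz |= (uint32_t)*x++ << (j * 8);  /* little-endian size bytes */
        sizes[i] = sz;
      }
      return (int)frames;
    }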
-
 static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
                                       const uint8_t *data, unsigned int data_sz,
                                       void *user_priv, long deadline) {
@@ -424,8 +342,8 @@
   // Reset flushed when receiving a valid frame.
   ctx->flushed = 0;
 
-  res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
-                               ctx->decrypt_cb, ctx->decrypt_state);
+  res = vp9_parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
+                                   ctx->decrypt_cb, ctx->decrypt_state);
   if (res != VPX_CODEC_OK)
     return res;
 
@@ -519,6 +437,7 @@
     // call to get_frame.
     if (!(*iter)) {
       img = &ctx->img;
+      img->bit_depth = (int)ctx->pbi->common.bit_depth;
       *iter = img;
     }
   }
@@ -667,6 +586,23 @@
   }
 }
 
+static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  unsigned int *const bit_depth = va_arg(args, unsigned int *);
+
+  if (bit_depth) {
+    if (ctx->pbi) {
+      const VP9_COMMON *const cm = &ctx->pbi->common;
+      *bit_depth = cm->bit_depth;
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  } else {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+}
+
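A hedged caller-side sketch for the new control; VP9D_GET_BIT_DEPTH is wired into the ctrl map below, and the public headers named here are assumptions about the caller's includes:

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    /* Valid once a frame has been decoded (ctx->pbi exists); before that
     * the handler above returns VPX_CODEC_ERROR. */
    static void print_bit_depth(vpx_codec_ctx_t *decoder) {
      unsigned int bit_depth = 0;
      if (vpx_codec_control(decoder, VP9D_GET_BIT_DEPTH, &bit_depth) ==
          VPX_CODEC_OK)
        printf("bit depth: %u\n", bit_depth);
    }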
 static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
                                                   va_list args) {
   ctx->invert_tile_order = va_arg(args, int);
@@ -699,6 +635,7 @@
   {VP8D_GET_FRAME_CORRUPTED,      ctrl_get_frame_corrupted},
   {VP9_GET_REFERENCE,             ctrl_get_reference},
   {VP9D_GET_DISPLAY_SIZE,         ctrl_get_display_size},
+  {VP9D_GET_BIT_DEPTH,            ctrl_get_bit_depth},
 
   { -1, NULL},
 };
@@ -723,12 +660,12 @@
   },
   { // NOLINT
     0,
-    NOT_IMPLEMENTED,  // vpx_codec_enc_cfg_map_t
-    NOT_IMPLEMENTED,  // vpx_codec_encode_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_cx_data_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_enc_config_set_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_global_headers_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_preview_frame_fn_t
-    NOT_IMPLEMENTED   // vpx_codec_enc_mr_get_mem_loc_fn_t
+    NULL,  // vpx_codec_enc_cfg_map_t
+    NULL,  // vpx_codec_encode_fn_t
+    NULL,  // vpx_codec_get_cx_data_fn_t
+    NULL,  // vpx_codec_enc_config_set_fn_t
+    NULL,  // vpx_codec_get_global_headers_fn_t
+    NULL,  // vpx_codec_get_preview_frame_fn_t
+    NULL   // vpx_codec_enc_mr_get_mem_loc_fn_t
   }
 };
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index dc46c4e..e450f7b 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -93,10 +93,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
diff --git a/vpx/exports_enc b/vpx/exports_enc
index 07f0280..12e4578 100644
--- a/vpx/exports_enc
+++ b/vpx/exports_enc
@@ -8,17 +8,12 @@
 text vpx_codec_set_cx_data_buf
 text vpx_svc_dump_statistics
 text vpx_svc_encode
-text vpx_svc_get_buffer
 text vpx_svc_get_encode_frame_count
-text vpx_svc_get_frame_size
 text vpx_svc_get_message
 text vpx_svc_init
-text vpx_svc_is_keyframe
 text vpx_svc_release
 text vpx_svc_set_keyframe
 text vpx_svc_set_options
 text vpx_svc_set_quantizers
 text vpx_svc_set_scale_factors
 text vpx_svc_get_layer_resolution
-text vpx_svc_get_rc_stats_buffer_size
-text vpx_svc_get_rc_stats_buffer
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 3ca6504..cbfffd0 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -286,8 +286,6 @@
   vpx_codec_enc_cfg_t cfg;
 } vpx_codec_enc_cfg_map_t;
 
-#define NOT_IMPLEMENTED 0
-
 /*!\brief Decoder algorithm interface
  *
  * All decoders \ref MUST expose a variable of this type.
@@ -337,9 +335,6 @@
  * and the pointer cast to the proper type.
  */
 struct vpx_codec_priv {
-  unsigned int                    sz;
-  vpx_codec_iface_t              *iface;
-  struct vpx_codec_alg_priv      *alg_priv;
   const char                     *err_detail;
   vpx_codec_flags_t               init_flags;
   struct {
@@ -347,8 +342,7 @@
     vpx_codec_priv_cb_pair_t    put_slice_cb;
   } dec;
   struct {
-    int                         tbd;
-    struct vpx_fixed_buf        cx_data_dst_buf;
+    vpx_fixed_buf_t             cx_data_dst_buf;
     unsigned int                cx_data_pad_before;
     unsigned int                cx_data_pad_after;
     vpx_codec_cx_pkt_t          cx_data_pkt;
@@ -369,11 +363,11 @@
 
 #undef VPX_CTRL_USE_TYPE
 #define VPX_CTRL_USE_TYPE(id, typ) \
-  static typ id##__value(va_list args) {return va_arg(args, typ);} \
+  static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);}
 
 #undef VPX_CTRL_USE_TYPE_DEPRECATED
 #define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \
-  static typ id##__value(va_list args) {return va_arg(args, typ);} \
+  static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);}
 
 #define CAST(id, arg) id##__value(arg)
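Two fixes ride on this macro change: the old trailing backslash spliced whatever source line followed the macro invocation into the function definition, and VPX_INLINE keeps the per-control helpers from drawing unused-static-function warnings in translation units that never call them. One expansion now reads, approximately:

    /* VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) expands to roughly: */
    static VPX_INLINE int VP8E_SET_CPUUSED__value(va_list args) {
      return va_arg(args, int);
    }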
 
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 7828615..4c1c04a 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -15,12 +15,12 @@
 
 #include <assert.h>
 #include <math.h>
+#include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #define VPX_DISABLE_CTRL_TYPECHECKS 1
-#define VPX_CODEC_DISABLE_COMPAT 1
 #include "./vpx_config.h"
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
@@ -47,8 +47,33 @@
 #define OPTION_BUFFER_SIZE 256
 #define COMPONENTS 4  // psnr & sse statistics maintained for total, y, u, v
 
-static const char *DEFAULT_QUANTIZER_VALUES = "60,53,39,33,27";
-static const char *DEFAULT_SCALE_FACTORS = "4/16,5/16,7/16,11/16,16/16";
+static const int DEFAULT_QUANTIZER_VALUES[VPX_SS_MAX_LAYERS] = {
+  60, 53, 39, 33, 27
+};
+
+static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = {
+  4, 5, 7, 11, 16
+};
+
+static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = {
+  16, 16, 16, 16, 16
+};
+
+typedef enum {
+  QUANTIZER = 0,
+  BITRATE,
+  SCALE_FACTOR,
+  AUTO_ALT_REF,
+  ALL_OPTION_TYPES
+} LAYER_OPTION_TYPE;
+
+static const int option_max_values[ALL_OPTION_TYPES] = {
+  63, INT_MAX, INT_MAX, 1
+};
+
+static const int option_min_values[ALL_OPTION_TYPES] = {
+  0, 0, 1, 0
+};
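These tables replace the string defaults and centralize per-option validation: one min/max pair per LAYER_OPTION_TYPE (quantizer 0..63, bitrate 0..INT_MAX, scale-factor terms 1..INT_MAX, auto alt-ref 0..1). The range check that extract_option below builds on, as a sketch:

    /* Table-driven bounds check applied to each parsed layer option. */
    static int option_in_range(LAYER_OPTION_TYPE type, int value) {
      return value >= option_min_values[type] &&
             value <= option_max_values[type];
    }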
 
 // One encoded frame
 typedef struct FrameData {
@@ -68,6 +93,7 @@
   int scaling_factor_den[VPX_SS_MAX_LAYERS];
   int quantizer[VPX_SS_MAX_LAYERS];
   int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
+  int bitrates[VPX_SS_MAX_LAYERS];
 
   // accumulated statistics
   double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS];   // total/Y/U/V
@@ -81,71 +107,17 @@
 
   // state variables
   int encode_frame_count;
-  int frame_received;
+  int psnr_pkt_received;
   int frame_within_gop;
   int layers;
   int layer;
   int is_keyframe;
-
-  FrameData *frame_list;
-  FrameData *frame_temp;
-
-  char *rc_stats_buf;
-  size_t rc_stats_buf_size;
-  size_t rc_stats_buf_used;
+  int use_multiple_frame_contexts;
 
   char message_buffer[2048];
   vpx_codec_ctx_t *codec_ctx;
 } SvcInternal;
 
-// create FrameData from encoder output
-static struct FrameData *fd_create(void *buf, size_t size,
-                                   vpx_codec_frame_flags_t flags) {
-  struct FrameData *const frame_data =
-      (struct FrameData *)vpx_malloc(sizeof(*frame_data));
-  if (frame_data == NULL) {
-    return NULL;
-  }
-  frame_data->buf = vpx_malloc(size);
-  if (frame_data->buf == NULL) {
-    vpx_free(frame_data);
-    return NULL;
-  }
-  vpx_memcpy(frame_data->buf, buf, size);
-  frame_data->size = size;
-  frame_data->flags = flags;
-  return frame_data;
-}
-
-// free FrameData
-static void fd_free(struct FrameData *p) {
-  if (p) {
-    if (p->buf)
-      vpx_free(p->buf);
-    vpx_free(p);
-  }
-}
-
-// add FrameData to list
-static void fd_list_add(struct FrameData **list, struct FrameData *layer_data) {
-  struct FrameData **p = list;
-
-  while (*p != NULL) p = &(*p)->next;
-  *p = layer_data;
-  layer_data->next = NULL;
-}
-
-// free FrameData list
-static void fd_free_list(struct FrameData *list) {
-  struct FrameData *p = list;
-
-  while (p) {
-    list = list->next;
-    fd_free(p);
-    p = list;
-  }
-}
-
 static SvcInternal *get_svc_internal(SvcContext *svc_ctx) {
   if (svc_ctx == NULL) return NULL;
   if (svc_ctx->internal == NULL) {
@@ -196,158 +168,62 @@
   return retval;
 }
 
-static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
-                                              const char *quantizer_values) {
-  char *input_string;
-  char *token;
-  const char *delim = ",";
-  char *save_ptr;
-  int found = 0;
-  int i, q;
-  vpx_codec_err_t res = VPX_CODEC_OK;
-  SvcInternal *const si = get_svc_internal(svc_ctx);
+static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type,
+                                      char *input,
+                                      int *value0,
+                                      int *value1) {
+  if (type == SCALE_FACTOR) {
+    *value0 = strtol(input, &input, 10);
+    if (*input++ != '/')
+      return VPX_CODEC_INVALID_PARAM;
+    *value1 = strtol(input, &input, 10);
 
-  if (quantizer_values == NULL || strlen(quantizer_values) == 0) {
-    input_string = strdup(DEFAULT_QUANTIZER_VALUES);
+    if (*value0 < option_min_values[SCALE_FACTOR] ||
+        *value1 < option_min_values[SCALE_FACTOR] ||
+        *value0 > option_max_values[SCALE_FACTOR] ||
+        *value1 > option_max_values[SCALE_FACTOR])
+      return VPX_CODEC_INVALID_PARAM;
   } else {
-    input_string = strdup(quantizer_values);
+    *value0 = atoi(input);
+    if (*value0 < option_min_values[type] ||
+        *value0 > option_max_values[type])
+      return VPX_CODEC_INVALID_PARAM;
   }
-
-  token = strtok_r(input_string, delim, &save_ptr);
-  for (i = 0; i < svc_ctx->spatial_layers; ++i) {
-    if (token != NULL) {
-      q = atoi(token);
-      if (q <= 0 || q > 100) {
-        svc_log(svc_ctx, SVC_LOG_ERROR,
-                "svc-quantizer-values: invalid value %s\n", token);
-        res = VPX_CODEC_INVALID_PARAM;
-        break;
-      }
-      token = strtok_r(NULL, delim, &save_ptr);
-      found = i + 1;
-    } else {
-      q = 0;
-    }
-    si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
-  }
-  if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
-    svc_log(svc_ctx, SVC_LOG_ERROR,
-            "svc: quantizers: %d values required, but only %d specified\n",
-            svc_ctx->spatial_layers, found);
-    res = VPX_CODEC_INVALID_PARAM;
-  }
-  free(input_string);
-  return res;
+  return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t parse_auto_alt_ref(SvcContext *svc_ctx,
-                                          const char *alt_ref_options) {
-  char *input_string;
-  char *token;
-  const char *delim = ",";
-  char *save_ptr;
-  int found = 0, enabled = 0;
-  int i, value;
-  vpx_codec_err_t res = VPX_CODEC_OK;
-  SvcInternal *const si = get_svc_internal(svc_ctx);
-
-  if (alt_ref_options == NULL || strlen(alt_ref_options) == 0) {
-    return VPX_CODEC_INVALID_PARAM;
-  } else {
-    input_string = strdup(alt_ref_options);
-  }
-
-  token = strtok_r(input_string, delim, &save_ptr);
-  for (i = 0; i < svc_ctx->spatial_layers; ++i) {
-    if (token != NULL) {
-      value = atoi(token);
-      if (value < 0 || value > 1) {
-        svc_log(svc_ctx, SVC_LOG_ERROR,
-                "enable auto alt ref values: invalid value %s\n", token);
-        res = VPX_CODEC_INVALID_PARAM;
-        break;
-      }
-      token = strtok_r(NULL, delim, &save_ptr);
-      found = i + 1;
-    } else {
-      value = 0;
-    }
-    si->enable_auto_alt_ref[i] = value;
-    if (value > 0)
-      ++enabled;
-  }
-  if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
-    svc_log(svc_ctx, SVC_LOG_ERROR,
-            "svc: quantizers: %d values required, but only %d specified\n",
-            svc_ctx->spatial_layers, found);
-    res = VPX_CODEC_INVALID_PARAM;
-  }
-  if (enabled > REF_FRAMES - svc_ctx->spatial_layers) {
-    svc_log(svc_ctx, SVC_LOG_ERROR,
-            "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could"
-            "enabled auto alt reference frame, but % layers are enabled\n",
-            REF_FRAMES - svc_ctx->spatial_layers, enabled);
-    res = VPX_CODEC_INVALID_PARAM;
-  }
-  free(input_string);
-  return res;
-}
-
-static void log_invalid_scale_factor(SvcContext *svc_ctx, const char *value) {
-  svc_log(svc_ctx, SVC_LOG_ERROR, "svc scale-factors: invalid value %s\n",
-          value);
-}
-
-static vpx_codec_err_t parse_scale_factors(SvcContext *svc_ctx,
-                                           const char *scale_factors) {
-  char *input_string;
-  char *token;
-  const char *delim = ",";
-  char *save_ptr;
-  int found = 0;
+static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
+                                                       LAYER_OPTION_TYPE type,
+                                                       const char *input,
+                                                       int *option0,
+                                                       int *option1) {
   int i;
-  int64_t num, den;
   vpx_codec_err_t res = VPX_CODEC_OK;
-  SvcInternal *const si = get_svc_internal(svc_ctx);
+  char *input_string;
+  char *token;
+  const char *delim = ",";
+  char *save_ptr;
 
-  if (scale_factors == NULL || strlen(scale_factors) == 0) {
-    input_string = strdup(DEFAULT_SCALE_FACTORS);
-  } else {
-    input_string = strdup(scale_factors);
-  }
+  if (input == NULL || option0 == NULL ||
+      (option1 == NULL && type == SCALE_FACTOR))
+    return VPX_CODEC_INVALID_PARAM;
+
+  input_string = strdup(input);
   token = strtok_r(input_string, delim, &save_ptr);
   for (i = 0; i < svc_ctx->spatial_layers; ++i) {
-    num = den = 0;
     if (token != NULL) {
-      num = strtol(token, &token, 10);
-      if (num <= 0) {
-        log_invalid_scale_factor(svc_ctx, token);
-        res = VPX_CODEC_INVALID_PARAM;
+      res = extract_option(type, token, option0 + i, option1 + i);
+      if (res != VPX_CODEC_OK)
         break;
-      }
-      if (*token++ != '/') {
-        log_invalid_scale_factor(svc_ctx, token);
-        res = VPX_CODEC_INVALID_PARAM;
-        break;
-      }
-      den = strtol(token, &token, 10);
-      if (den <= 0) {
-        log_invalid_scale_factor(svc_ctx, token);
-        res = VPX_CODEC_INVALID_PARAM;
-        break;
-      }
       token = strtok_r(NULL, delim, &save_ptr);
-      found = i + 1;
+    } else {
+      break;
     }
-    si->scaling_factor_num[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] =
-        (int)num;
-    si->scaling_factor_den[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] =
-        (int)den;
   }
-  if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
+  if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) {
     svc_log(svc_ctx, SVC_LOG_ERROR,
-            "svc: scale-factors: %d values required, but only %d specified\n",
-            svc_ctx->spatial_layers, found);
+            "svc: layer params type: %d    %d values required, "
+            "but only %d specified\n", type, svc_ctx->spatial_layers, i);
     res = VPX_CODEC_INVALID_PARAM;
   }
   free(input_string);
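
The SCALE_FACTOR path of extract_option() above accepts tokens of the form
"num/den". A minimal standalone sketch of that token walk (demo code, not
part of the patch):

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  char token[] = "7/16";
  char *p = token;
  long num = strtol(p, &p, 10);  /* reads 7, leaves p on '/' */
  long den = 0;
  if (*p++ == '/')
    den = strtol(p, &p, 10);     /* reads 16 */
  printf("scale factor %ld/%ld\n", num, den);
  return 0;
}
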
@@ -366,7 +242,9 @@
   char *option_name;
   char *option_value;
   char *input_ptr;
+  SvcInternal *const si = get_svc_internal(svc_ctx);
   vpx_codec_err_t res = VPX_CODEC_OK;
+  int i, alt_ref_enabled = 0;
 
   if (options == NULL) return VPX_CODEC_OK;
   input_string = strdup(options);
@@ -382,17 +260,29 @@
       res = VPX_CODEC_INVALID_PARAM;
       break;
     }
-    if (strcmp("layers", option_name) == 0) {
+    if (strcmp("spatial-layers", option_name) == 0) {
       svc_ctx->spatial_layers = atoi(option_value);
+    } else if (strcmp("temporal-layers", option_name) == 0) {
+      svc_ctx->temporal_layers = atoi(option_value);
     } else if (strcmp("scale-factors", option_name) == 0) {
-      res = parse_scale_factors(svc_ctx, option_value);
+      res = parse_layer_options_from_string(svc_ctx, SCALE_FACTOR, option_value,
+                                            si->scaling_factor_num,
+                                            si->scaling_factor_den);
       if (res != VPX_CODEC_OK) break;
     } else if (strcmp("quantizers", option_name) == 0) {
-      res = parse_quantizer_values(svc_ctx, option_value);
+      res = parse_layer_options_from_string(svc_ctx, QUANTIZER, option_value,
+                                            si->quantizer, NULL);
       if (res != VPX_CODEC_OK) break;
     } else if (strcmp("auto-alt-refs", option_name) == 0) {
-      res = parse_auto_alt_ref(svc_ctx, option_value);
+      res = parse_layer_options_from_string(svc_ctx, AUTO_ALT_REF, option_value,
+                                            si->enable_auto_alt_ref, NULL);
       if (res != VPX_CODEC_OK) break;
+    } else if (strcmp("bitrates", option_name) == 0) {
+      res = parse_layer_options_from_string(svc_ctx, BITRATE, option_value,
+                                            si->bitrates, NULL);
+      if (res != VPX_CODEC_OK) break;
+    } else if (strcmp("multi-frame-contexts", option_name) == 0) {
+      si->use_multiple_frame_contexts = atoi(option_value);
     } else {
       svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
       res = VPX_CODEC_INVALID_PARAM;
@@ -401,6 +291,22 @@
     option_name = strtok_r(NULL, "=", &input_ptr);
   }
   free(input_string);
+
+  if (si->use_multiple_frame_contexts &&
+      (svc_ctx->spatial_layers > 3 ||
+       svc_ctx->spatial_layers * svc_ctx->temporal_layers > 4))
+    res = VPX_CODEC_INVALID_PARAM;
+
+  for (i = 0; i < svc_ctx->spatial_layers; ++i)
+    alt_ref_enabled += si->enable_auto_alt_ref[i];
+  if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) {
+    svc_log(svc_ctx, SVC_LOG_ERROR,
+            "svc: auto alt ref: at most %d (REF_FRAMES - layers) layers can "
+            "enable auto alt reference frames, but %d are enabled\n",
+            REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled);
+    res = VPX_CODEC_INVALID_PARAM;
+  }
+
   return res;
 }
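
An options string exercising the new keys might look as follows; this is
illustrative, since the separator handling lives in strtok_r calls whose
delimiter set falls outside this hunk:

const char *options =
    "spatial-layers=2 temporal-layers=2 "
    "scale-factors=8/16,16/16 bitrates=400,800 "
    "auto-alt-refs=0,0 multi-frame-contexts=1";
/* 2 spatial x 2 temporal layers keeps spatial*temporal <= 4, so
 * multi-frame-contexts=1 survives the validation added above. */
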
 
@@ -436,6 +342,39 @@
   return VPX_CODEC_OK;
 }
 
+static void assign_layer_bitrates(const SvcInternal *const si,
+                                  vpx_codec_enc_cfg_t *const enc_cfg) {
+  int i;
+
+  if (si->bitrates[0] != 0) {
+    enc_cfg->rc_target_bitrate = 0;
+    for (i = 0; i < si->layers; ++i) {
+      enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i];
+      enc_cfg->rc_target_bitrate += si->bitrates[i];
+    }
+  } else {
+    float total = 0;
+    float alloc_ratio[VPX_SS_MAX_LAYERS] = {0};
+
+    for (i = 0; i < si->layers; ++i) {
+      if (si->scaling_factor_den[i] > 0) {
+        alloc_ratio[i] = (float)(si->scaling_factor_num[i] * 1.0 /
+            si->scaling_factor_den[i]);
+
+        alloc_ratio[i] *= alloc_ratio[i];
+        total += alloc_ratio[i];
+      }
+    }
+
+    for (i = 0; i < si->layers; ++i) {
+      if (total > 0) {
+        enc_cfg->ss_target_bitrate[i] = (unsigned int)
+            (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total);
+      }
+    }
+  }
+}
+
 vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
                              vpx_codec_iface_t *iface,
                              vpx_codec_enc_cfg_t *enc_cfg) {
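
When no explicit bitrates are given, assign_layer_bitrates() above splits
rc_target_bitrate by relative picture area: each layer's weight is
(num/den)^2, normalized over all layers. A standalone worked example with
scale factors 4/16, 8/16, 16/16 and an assumed 1000 kbps target:

#include <stdio.h>

int main(void) {
  const int num[3] = {4, 8, 16}, den[3] = {16, 16, 16};
  const unsigned int target = 1000;
  float ratio[3], total = 0;
  int i;
  for (i = 0; i < 3; ++i) {
    ratio[i] = (float)num[i] / den[i];
    ratio[i] *= ratio[i];  /* area scales with the square of the factor */
    total += ratio[i];
  }
  for (i = 0; i < 3; ++i)  /* prints roughly 47, 190 and 761 kbps */
    printf("layer %d: %u kbps\n", i,
           (unsigned int)(target * ratio[i] / total));
  return 0;
}
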
@@ -469,55 +408,63 @@
     return VPX_CODEC_INVALID_PARAM;
   }
 
-  res = parse_quantizer_values(svc_ctx, si->quantizers);
-  if (res != VPX_CODEC_OK) return res;
+  for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
+    si->quantizer[i] = DEFAULT_QUANTIZER_VALUES[i];
+    si->scaling_factor_num[i] = DEFAULT_SCALE_FACTORS_NUM[i];
+    si->scaling_factor_den[i] = DEFAULT_SCALE_FACTORS_DEN[i];
+  }
 
-  res = parse_scale_factors(svc_ctx, si->scale_factors);
-  if (res != VPX_CODEC_OK) return res;
+  if (strlen(si->quantizers) > 0) {
+    res = parse_layer_options_from_string(svc_ctx, QUANTIZER, si->quantizers,
+                                          si->quantizer, NULL);
+    if (res != VPX_CODEC_OK)
+      return res;
+  }
+
+  if (strlen(si->scale_factors) > 0) {
+    res = parse_layer_options_from_string(svc_ctx, SCALE_FACTOR,
+                                          si->scale_factors,
+                                          si->scaling_factor_num,
+                                          si->scaling_factor_den);
+    if (res != VPX_CODEC_OK)
+      return res;
+  }
 
-  // Parse aggregate command line options. Options must start with
-  // "layers=xx" then followed by other options
+  // Parse aggregate command line options. Options must start with
+  // "spatial-layers=xx", followed by the other options.
   res = parse_options(svc_ctx, si->options);
   if (res != VPX_CODEC_OK) return res;
 
+  if (svc_ctx->spatial_layers < 1)
+    svc_ctx->spatial_layers = 1;
+  if (svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS)
+    svc_ctx->spatial_layers = VPX_SS_MAX_LAYERS;
+
+  if (svc_ctx->temporal_layers < 1)
+    svc_ctx->temporal_layers = 1;
+  if (svc_ctx->temporal_layers > VPX_TS_MAX_LAYERS)
+    svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS;
+
   si->layers = svc_ctx->spatial_layers;
 
-  // Assign target bitrate for each layer. We calculate the ratio
-  // from the resolution for now.
-  // TODO(Minghai): Optimize the mechanism of allocating bits after
-  // implementing svc two pass rate control.
-  if (si->layers > 1) {
-    float total = 0;
-    float alloc_ratio[VPX_SS_MAX_LAYERS] = {0};
-
-    assert(si->layers <= VPX_SS_MAX_LAYERS);
-    for (i = 0; i < si->layers; ++i) {
-      int pos = i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers;
-      if (pos < VPX_SS_MAX_LAYERS && si->scaling_factor_den[pos] > 0) {
-        alloc_ratio[i] = (float)(si->scaling_factor_num[pos] * 1.0 /
-            si->scaling_factor_den[pos]);
-
-        alloc_ratio[i] *= alloc_ratio[i];
-        total += alloc_ratio[i];
-      }
-    }
-
-    for (i = 0; i < si->layers; ++i) {
-      if (total > 0) {
-        enc_cfg->ss_target_bitrate[i] = (unsigned int)
-            (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total);
-      }
-    }
-  }
+  assign_layer_bitrates(si, enc_cfg);
 
 #if CONFIG_SPATIAL_SVC
   for (i = 0; i < si->layers; ++i)
     enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i];
 #endif
 
+  if (svc_ctx->temporal_layers > 1) {
+    for (i = 0; i < svc_ctx->temporal_layers; ++i) {
+      enc_cfg->ts_target_bitrate[i] = enc_cfg->rc_target_bitrate /
+                                      svc_ctx->temporal_layers;
+      enc_cfg->ts_rate_decimator[i] = 1 << (svc_ctx->temporal_layers - 1 - i);
+    }
+  }
+
   // modify encoder configuration
   enc_cfg->ss_number_layers = si->layers;
-  enc_cfg->ts_number_layers = 1;  // Temporal layers not used in this encoder.
+  enc_cfg->ts_number_layers = svc_ctx->temporal_layers;
 
   // TODO(ivanmaltz): determine if these values need to be set explicitly for
   // svc, or if the normal default/override mechanism can be used
@@ -534,7 +482,8 @@
   enc_cfg->rc_buf_initial_sz = 500;
   enc_cfg->rc_buf_optimal_sz = 600;
   enc_cfg->rc_buf_sz = 1000;
-  enc_cfg->g_error_resilient = 1;
+  if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0)
+    enc_cfg->g_error_resilient = 1;
 
   // Initialize codec
   res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR);
@@ -553,7 +502,7 @@
                                              int layer,
                                              unsigned int *width,
                                              unsigned int *height) {
-  int w, h, index, num, den;
+  int w, h, num, den;
   const SvcInternal *const si = get_const_svc_internal(svc_ctx);
 
   if (svc_ctx == NULL || si == NULL || width == NULL || height == NULL) {
@@ -561,9 +510,8 @@
   }
   if (layer < 0 || layer >= si->layers) return VPX_CODEC_INVALID_PARAM;
 
-  index = layer + VPX_SS_MAX_LAYERS - si->layers;
-  num = si->scaling_factor_num[index];
-  den = si->scaling_factor_den[index];
+  num = si->scaling_factor_num[layer];
+  den = si->scaling_factor_den[layer];
   if (num == 0 || den == 0) return VPX_CODEC_INVALID_PARAM;
 
   w = si->width * num / den;
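
With the VPX_SS_MAX_LAYERS index bias gone, layer 0 is simply the lowest
layer. A sketch of the dimension math, assuming the make-even rounding that
follows later in this function (not shown in the hunk):

/* hypothetical helper mirroring the w/h computation above */
static unsigned int layer_dim(unsigned int full, int num, int den) {
  unsigned int d = full * num / den;
  return (d % 2) ? d + 1 : d;  /* assumed: rounded up to the next even */
}
/* e.g. layer_dim(176, 7, 16) == 78, since 176 * 7 / 16 = 77 is odd */
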
@@ -581,7 +529,7 @@
 
 static void set_svc_parameters(SvcContext *svc_ctx,
                                vpx_codec_ctx_t *codec_ctx) {
-  int layer, layer_index;
+  int layer;
   vpx_svc_parameters_t svc_params;
   SvcInternal *const si = get_svc_internal(svc_ctx);
 
@@ -595,11 +543,10 @@
                                                    &svc_params.height)) {
     svc_log(svc_ctx, SVC_LOG_ERROR, "vpx_svc_get_layer_resolution failed\n");
   }
-  layer_index = layer + VPX_SS_MAX_LAYERS - si->layers;
 
   if (codec_ctx->config.enc->g_pass == VPX_RC_ONE_PASS) {
-    svc_params.min_quantizer = si->quantizer[layer_index];
-    svc_params.max_quantizer = si->quantizer[layer_index];
+    svc_params.min_quantizer = si->quantizer[layer];
+    svc_params.max_quantizer = si->quantizer[layer];
   } else {
     svc_params.min_quantizer = codec_ctx->config.enc->rc_min_quantizer;
     svc_params.max_quantizer = codec_ctx->config.enc->rc_max_quantizer;
@@ -619,14 +566,12 @@
   vpx_codec_err_t res;
   vpx_codec_iter_t iter;
   const vpx_codec_cx_pkt_t *cx_pkt;
-  int layer_for_psnr = 0;
   SvcInternal *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
 
   svc_log_reset(svc_ctx);
-  si->rc_stats_buf_used = 0;
 
   si->layers = svc_ctx->spatial_layers;
   if (si->encode_frame_count == 0) {
@@ -657,61 +602,37 @@
   iter = NULL;
   while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
     switch (cx_pkt->kind) {
-      case VPX_CODEC_CX_FRAME_PKT: {
-        fd_list_add(&si->frame_list, fd_create(cx_pkt->data.frame.buf,
-                                               cx_pkt->data.frame.sz,
-                                               cx_pkt->data.frame.flags));
-
-        svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, "
-                "pts: %d\n", si->frame_received,
-                (cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? 1 : 0,
-                (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
-
-        ++si->frame_received;
-        layer_for_psnr = 0;
-        break;
-      }
-      case VPX_CODEC_PSNR_PKT: {
-        int i;
-        svc_log(svc_ctx, SVC_LOG_DEBUG,
-                "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
-                "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                si->frame_received, layer_for_psnr,
-                cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
-                cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
-        svc_log(svc_ctx, SVC_LOG_DEBUG,
-                "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
-                "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                si->frame_received, layer_for_psnr,
-                cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1],
-                cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]);
-        for (i = 0; i < COMPONENTS; i++) {
-          si->psnr_sum[layer_for_psnr][i] += cx_pkt->data.psnr.psnr[i];
-          si->sse_sum[layer_for_psnr][i] += cx_pkt->data.psnr.sse[i];
-        }
-        ++layer_for_psnr;
-        break;
-      }
-      case VPX_CODEC_STATS_PKT: {
-        size_t new_size = si->rc_stats_buf_used +
-            cx_pkt->data.twopass_stats.sz;
-
-        if (new_size > si->rc_stats_buf_size) {
-          char *p = (char*)realloc(si->rc_stats_buf, new_size);
-          if (p == NULL) {
-            svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating stats buf\n");
-            return VPX_CODEC_MEM_ERROR;
-          }
-          si->rc_stats_buf = p;
-          si->rc_stats_buf_size = new_size;
-        }
-
-        memcpy(si->rc_stats_buf + si->rc_stats_buf_used,
-               cx_pkt->data.twopass_stats.buf, cx_pkt->data.twopass_stats.sz);
-        si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz;
-        break;
-      }
 #if CONFIG_SPATIAL_SVC
+      case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: {
+        int i;
+        for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+          int j;
+          svc_log(svc_ctx, SVC_LOG_DEBUG,
+                  "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
+                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                  si->psnr_pkt_received, i,
+                  cx_pkt->data.layer_psnr[i].psnr[0],
+                  cx_pkt->data.layer_psnr[i].psnr[1],
+                  cx_pkt->data.layer_psnr[i].psnr[2],
+                  cx_pkt->data.layer_psnr[i].psnr[3]);
+          svc_log(svc_ctx, SVC_LOG_DEBUG,
+                  "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
+                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                  si->psnr_pkt_received, i,
+                  cx_pkt->data.layer_psnr[i].sse[0],
+                  cx_pkt->data.layer_psnr[i].sse[1],
+                  cx_pkt->data.layer_psnr[i].sse[2],
+                  cx_pkt->data.layer_psnr[i].sse[3]);
+
+          for (j = 0; j < COMPONENTS; ++j) {
+            si->psnr_sum[i][j] +=
+                cx_pkt->data.layer_psnr[i].psnr[j];
+            si->sse_sum[i][j] += cx_pkt->data.layer_psnr[i].sse[j];
+          }
+        }
+        ++si->psnr_pkt_received;
+        break;
+      }
       case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: {
         int i;
         for (i = 0; i < si->layers; ++i)
@@ -739,41 +660,12 @@
   return si->message_buffer;
 }
 
-// We will maintain a list of output frame buffers since with lag_in_frame
-// we need to output all frame buffers at the end. vpx_svc_get_buffer() will
-// remove a frame buffer from the list the put it to a temporal pointer, which
-// will be removed at the next vpx_svc_get_buffer() or when closing encoder.
-void *vpx_svc_get_buffer(SvcContext *svc_ctx) {
-  SvcInternal *const si = get_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return NULL;
-
-  if (si->frame_temp)
-    fd_free(si->frame_temp);
-
-  si->frame_temp = si->frame_list;
-  si->frame_list = si->frame_list->next;
-
-  return si->frame_temp->buf;
-}
-
-size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
-  return si->frame_list->size;
-}
-
 int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx) {
   const SvcInternal *const si = get_const_svc_internal(svc_ctx);
   if (svc_ctx == NULL || si == NULL) return 0;
   return si->encode_frame_count;
 }
 
-int vpx_svc_is_keyframe(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
-  return (si->frame_list->flags & VPX_FRAME_IS_KEY) != 0;
-}
-
 void vpx_svc_set_keyframe(SvcContext *svc_ctx) {
   SvcInternal *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || si == NULL) return;
@@ -787,7 +679,7 @@
 
 // dump accumulated statistics and reset accumulated values
 const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
-  int number_of_frames, encode_frame_count;
+  int number_of_frames;
   int i, j;
   uint32_t bytes_total = 0;
   double scale[COMPONENTS];
@@ -800,12 +692,11 @@
 
   svc_log_reset(svc_ctx);
 
-  encode_frame_count = si->encode_frame_count;
-  if (si->encode_frame_count <= 0) return vpx_svc_get_message(svc_ctx);
+  number_of_frames = si->psnr_pkt_received;
+  if (number_of_frames <= 0) return vpx_svc_get_message(svc_ctx);
 
   svc_log(svc_ctx, SVC_LOG_INFO, "\n");
   for (i = 0; i < si->layers; ++i) {
-    number_of_frames = encode_frame_count;
 
     svc_log(svc_ctx, SVC_LOG_INFO,
             "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n",
@@ -853,26 +744,8 @@
   // SvcInternal if it was not already allocated
   si = (SvcInternal *)svc_ctx->internal;
   if (si != NULL) {
-    fd_free(si->frame_temp);
-    fd_free_list(si->frame_list);
-    if (si->rc_stats_buf) {
-      free(si->rc_stats_buf);
-    }
     free(si);
     svc_ctx->internal = NULL;
   }
 }
 
-size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return 0;
-  return si->rc_stats_buf_used;
-}
-
-char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return NULL;
-  return si->rc_stats_buf;
-}
-
-
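
With the frame list and the rc-stats buffer removed, callers no longer
drain output through vpx_svc_get_buffer() and vpx_svc_get_rc_stats_buffer();
encoded frames and two-pass stats come straight off the codec's packet
iterator. A minimal sketch of the new consumer loop, assuming an
initialized encoder context and eliding error handling:

#include "vpx/vpx_encoder.h"

static void drain_packets(vpx_codec_ctx_t *codec_ctx) {
  vpx_codec_iter_t iter = NULL;
  const vpx_codec_cx_pkt_t *pkt;
  while ((pkt = vpx_codec_get_cx_data(codec_ctx, &iter)) != NULL) {
    switch (pkt->kind) {
      case VPX_CODEC_CX_FRAME_PKT:
        /* write pkt->data.frame.buf / pkt->data.frame.sz to the output */
        break;
      case VPX_CODEC_STATS_PKT:
        /* append pkt->data.twopass_stats to the first-pass stats blob */
        break;
      default:
        break;
    }
  }
}
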
diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c
index d175eae..5a495ce 100644
--- a/vpx/src/vpx_codec.c
+++ b/vpx/src/vpx_codec.c
@@ -88,8 +88,7 @@
   else if (!ctx->iface || !ctx->priv)
     res = VPX_CODEC_ERROR;
   else {
-    if (ctx->priv->alg_priv)
-      ctx->iface->destroy(ctx->priv->alg_priv);
+    ctx->iface->destroy((vpx_codec_alg_priv_t *)ctx->priv);
 
     ctx->iface = NULL;
     ctx->name = NULL;
@@ -125,7 +124,7 @@
         va_list  ap;
 
         va_start(ap, ctrl_id);
-        res = entry->fn(ctx->priv->alg_priv, ap);
+        res = entry->fn((vpx_codec_alg_priv_t *)ctx->priv, ap);
         va_end(ap);
         break;
       }
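
The (vpx_codec_alg_priv_t *)ctx->priv casts above trade the removed
alg_priv pointer for a layout contract: each codec's private struct must
begin with the shared vpx_codec_priv base so the two pointers alias. A
hypothetical codec's struct, sketching that assumption:

struct vpx_codec_alg_priv {
  vpx_codec_priv_t base;  /* must stay the first member for the cast */
  /* ... codec-specific state follows ... */
};
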
diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c
index 4d22a08..802d8ed 100644
--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c
@@ -18,9 +18,13 @@
 
 #define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
 
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+  return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
 vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t      *ctx,
                                        vpx_codec_iface_t    *iface,
-                                       vpx_codec_dec_cfg_t  *cfg,
+                                       const vpx_codec_dec_cfg_t *cfg,
                                        vpx_codec_flags_t     flags,
                                        int                   ver) {
   vpx_codec_err_t res;
@@ -54,9 +58,6 @@
       ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
       vpx_codec_destroy(ctx);
     }
-
-    if (ctx->priv)
-      ctx->priv->iface = ctx->iface;
   }
 
   return SAVE_STATUS(ctx, res);
@@ -97,7 +98,7 @@
     si->w = 0;
     si->h = 0;
 
-    res = ctx->iface->dec.get_si(ctx->priv->alg_priv, si);
+    res = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
   }
 
   return SAVE_STATUS(ctx, res);
@@ -118,8 +119,8 @@
   else if (!ctx->iface || !ctx->priv)
     res = VPX_CODEC_ERROR;
   else {
-    res = ctx->iface->dec.decode(ctx->priv->alg_priv, data, data_sz,
-                                 user_priv, deadline);
+    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv,
+                                 deadline);
   }
 
   return SAVE_STATUS(ctx, res);
@@ -132,7 +133,7 @@
   if (!ctx || !iter || !ctx->iface || !ctx->priv)
     img = NULL;
   else
-    img = ctx->iface->dec.get_frame(ctx->priv->alg_priv, iter);
+    img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
 
   return img;
 }
@@ -188,7 +189,7 @@
              !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
     res = VPX_CODEC_ERROR;
   } else {
-    res = ctx->iface->dec.set_fb_fn(ctx->priv->alg_priv, cb_get, cb_release,
+    res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
                                     cb_priv);
   }
 
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index 6e18bd1..cd10c41 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -15,14 +15,18 @@
  */
 #include <limits.h>
 #include <string.h>
-#include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
 
 #define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
 
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+  return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
 vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t      *ctx,
                                        vpx_codec_iface_t    *iface,
-                                       vpx_codec_enc_cfg_t  *cfg,
+                                       const vpx_codec_enc_cfg_t *cfg,
                                        vpx_codec_flags_t     flags,
                                        int                   ver) {
   vpx_codec_err_t res;
@@ -53,9 +57,6 @@
       ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
       vpx_codec_destroy(ctx);
     }
-
-    if (ctx->priv)
-      ctx->priv->iface = ctx->iface;
   }
 
   return SAVE_STATUS(ctx, res);
@@ -135,9 +136,6 @@
           }
         }
 
-        if (ctx->priv)
-          ctx->priv->iface = ctx->iface;
-
         if (res)
           break;
 
@@ -222,7 +220,7 @@
     FLOATING_POINT_INIT();
 
     if (num_enc == 1)
-      res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+      res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts,
                                    duration, flags, deadline);
     else {
       /* Multi-resolution encoding:
@@ -236,7 +234,7 @@
       if (img) img += num_enc - 1;
 
       for (i = num_enc - 1; i >= 0; i--) {
-        if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+        if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts,
                                           duration, flags, deadline)))
           break;
 
@@ -265,7 +263,7 @@
     else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
       ctx->err = VPX_CODEC_INCAPABLE;
     else
-      pkt = ctx->iface->enc.get_cx_data(ctx->priv->alg_priv, iter);
+      pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter);
   }
 
   if (pkt && pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
@@ -333,7 +331,7 @@
     else if (!ctx->iface->enc.get_preview)
       ctx->err = VPX_CODEC_INCAPABLE;
     else
-      img = ctx->iface->enc.get_preview(ctx->priv->alg_priv);
+      img = ctx->iface->enc.get_preview(get_alg_priv(ctx));
   }
 
   return img;
@@ -351,7 +349,7 @@
     else if (!ctx->iface->enc.get_glob_hdrs)
       ctx->err = VPX_CODEC_INCAPABLE;
     else
-      buf = ctx->iface->enc.get_glob_hdrs(ctx->priv->alg_priv);
+      buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx));
   }
 
   return buf;
@@ -367,7 +365,7 @@
   else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
     res = VPX_CODEC_INCAPABLE;
   else
-    res = ctx->iface->enc.cfg_set(ctx->priv->alg_priv, cfg);
+    res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg);
 
   return SAVE_STATUS(ctx, res);
 }
diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c
index 8c7e3cf..e58b61e 100644
--- a/vpx/src/vpx_image.c
+++ b/vpx/src/vpx_image.c
@@ -8,38 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include <stdlib.h>
 #include <string.h>
+
 #include "vpx/vpx_image.h"
 #include "vpx/vpx_integer.h"
-
-#define ADDRESS_STORAGE_SIZE      sizeof(size_t)
-/*returns an addr aligned to the byte boundary specified by align*/
-#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
-
-/* Memalign code is copied from vpx_mem.c */
-static void *img_buf_memalign(size_t align, size_t size) {
-  void *addr,
-       * x = NULL;
-
-  addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE);
-
-  if (addr) {
-    x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
-    /* save the actual malloc address */
-    ((size_t *)x)[-1] = (size_t)addr;
-  }
-
-  return x;
-}
-
-static void img_buf_free(void *memblk) {
-  if (memblk) {
-    void *addr = (void *)(((size_t *)memblk)[-1]);
-    free(addr);
-  }
-}
+#include "vpx_mem/vpx_mem.h"
 
 static vpx_image_t *img_alloc_helper(vpx_image_t   *img,
                                      vpx_img_fmt_t  fmt,
@@ -172,7 +146,7 @@
     if (alloc_size != (size_t)alloc_size)
       goto fail;
 
-    img->img_data = img_buf_memalign(buf_align, (size_t)alloc_size);
+    img->img_data = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size);
     img->img_data_owner = 1;
   }
 
@@ -180,7 +154,7 @@
     goto fail;
 
   img->fmt = fmt;
-  img->bit_depth = (fmt & VPX_IMG_FMT_HIGH) ? 16 : 8;
+  img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
   img->w = w;
   img->h = h;
   img->x_chroma_shift = xcs;
@@ -296,7 +270,7 @@
 void vpx_img_free(vpx_image_t *img) {
   if (img) {
     if (img->img_data && img->img_data_owner)
-      img_buf_free(img->img_data);
+      vpx_free(img->img_data);
 
     if (img->self_allocd)
       free(img);
diff --git a/vpx/svc_context.h b/vpx/svc_context.h
index e0de263..0ed2f0f 100644
--- a/vpx/svc_context.h
+++ b/vpx/svc_context.h
@@ -31,7 +31,8 @@
 
 typedef struct {
   // public interface to svc_command options
-  int spatial_layers;               // number of layers
+  int spatial_layers;               // number of spatial layers
+  int temporal_layers;              // number of temporal layers
   SVC_LOG_LEVEL log_level;  // amount of information to display
   int log_print;  // when set, printf log messages instead of returning the
                   // message with svc_get_message
@@ -95,29 +96,6 @@
 const char *vpx_svc_get_message(const SvcContext *svc_ctx);
 
 /**
- * return size of encoded data to be returned by vpx_svc_get_buffer.
- * it needs to be called before vpx_svc_get_buffer.
- */
-size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx);
-
-/**
- * return buffer with encoded data. encoder will maintain a list of frame
- * buffers. each call of vpx_svc_get_buffer() will return one frame.
- */
-void *vpx_svc_get_buffer(SvcContext *svc_ctx);
-
-/**
- * return size of two pass rate control stats data to be returned by
- * vpx_svc_get_rc_stats_buffer
- */
-size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx);
-
-/**
- * return buffer two pass of rate control stats data
- */
-char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx);
-
-/**
  * return spatial resolution of the specified layer
  */
 vpx_codec_err_t vpx_svc_get_layer_resolution(const SvcContext *svc_ctx,
@@ -130,11 +108,6 @@
 int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx);
 
 /**
- * return 1 if last encoded frame was a keyframe
- */
-int vpx_svc_is_keyframe(const SvcContext *svc_ctx);
-
-/**
  * force the next frame to be a keyframe
  */
 void vpx_svc_set_keyframe(SvcContext *svc_ctx);
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index bd7f19c..379b306 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -75,6 +75,9 @@
   /** control function to get the display dimensions for the current frame. */
   VP9D_GET_DISPLAY_SIZE,
 
+  /** control function to get the bit depth of the stream. */
+  VP9D_GET_BIT_DEPTH,
+
   /** For testing. */
   VP9_INVERT_TILE_DECODE_ORDER,
 
@@ -118,6 +121,7 @@
 VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR,           vpx_decrypt_init *)
 VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR,           vpx_decrypt_init *)
 VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE,        int *)
+VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH,           unsigned int *)
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 
 /*! @} - end defgroup vp8_decoder */
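
Per the VPX_CTRL_USE_TYPE entry above, the new control takes an
unsigned int *. A usage sketch, assuming an initialized VP9 decoder
context:

#include <stdio.h>
#include "vpx/vp8dx.h"

static void print_bit_depth(vpx_codec_ctx_t *decoder) {
  unsigned int bit_depth = 0;
  if (vpx_codec_control(decoder, VP9D_GET_BIT_DEPTH, &bit_depth) ==
      VPX_CODEC_OK)
    printf("stream bit depth: %u\n", bit_depth);
}
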
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index 07df72a..b25308e 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -203,9 +203,11 @@
     const char              *err_detail;  /**< Detailed info, if available */
     vpx_codec_flags_t        init_flags;  /**< Flags passed at init time */
     union {
-      struct vpx_codec_dec_cfg  *dec;   /**< Decoder Configuration Pointer */
-      struct vpx_codec_enc_cfg  *enc;   /**< Encoder Configuration Pointer */
-      void                      *raw;
+      /**< Decoder Configuration Pointer */
+      const struct vpx_codec_dec_cfg *dec;
+      /**< Encoder Configuration Pointer */
+      const struct vpx_codec_enc_cfg *enc;
+      const void                     *raw;
     }                        config;      /**< Configuration pointer aliasing union */
     vpx_codec_priv_t        *priv;        /**< Algorithm private storage */
   } vpx_codec_ctx_t;
@@ -215,9 +217,9 @@
    * This enumeration determines the bit depth of the codec.
    */
   typedef enum vpx_bit_depth {
-    VPX_BITS_8,   /**< 8 bits  */
-    VPX_BITS_10,  /**< 10 bits */
-    VPX_BITS_12   /**< 12 bits */
+    VPX_BITS_8  =  8,  /**<  8 bits */
+    VPX_BITS_10 = 10,  /**< 10 bits */
+    VPX_BITS_12 = 12,  /**< 12 bits */
   } vpx_bit_depth_t;
 
   /*
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index 10b89fa..62fd919 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -135,7 +135,7 @@
    */
   vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t      *ctx,
                                          vpx_codec_iface_t    *iface,
-                                         vpx_codec_dec_cfg_t  *cfg,
+                                         const vpx_codec_dec_cfg_t *cfg,
                                          vpx_codec_flags_t     flags,
                                          int                   ver);
 
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 7dbbf2f..996fcb4 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -80,6 +80,9 @@
    */
 #define VPX_CODEC_CAP_OUTPUT_PARTITION  0x20000
 
+/*!\brief Can support input images at a bit depth greater than 8.
+ */
+#define VPX_CODEC_CAP_HIGHBITDEPTH  0x40000
 
   /*! \brief Initialization-time Feature Enabling
    *
@@ -91,6 +94,7 @@
 #define VPX_CODEC_USE_PSNR  0x10000 /**< Calculate PSNR on each frame */
 #define VPX_CODEC_USE_OUTPUT_PARTITION  0x20000 /**< Make the encoder output one
   partition at a time. */
+#define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
 
 
   /*!\brief Generic fixed size buffer structure
@@ -159,6 +163,7 @@
     VPX_CODEC_PSNR_PKT,        /**< PSNR statistics for this frame */
 #if CONFIG_SPATIAL_SVC
     VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/
+    VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/
 #endif
     VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions  */
   };
@@ -188,16 +193,17 @@
                                               has id 0.*/
 
       } frame;  /**< data for compressed frame packet */
-      struct vpx_fixed_buf twopass_stats;  /**< data for two-pass packet */
-      struct vpx_fixed_buf firstpass_mb_stats; /**< first pass mb packet */
+      vpx_fixed_buf_t twopass_stats;  /**< data for two-pass packet */
+      vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
       struct vpx_psnr_pkt {
         unsigned int samples[4];  /**< Number of samples, total/y/u/v */
         uint64_t     sse[4];      /**< sum squared error, total/y/u/v */
         double       psnr[4];     /**< PSNR, total/y/u/v */
       } psnr;                       /**< data for PSNR packet */
-      struct vpx_fixed_buf raw;     /**< data for arbitrary packets */
+      vpx_fixed_buf_t raw;     /**< data for arbitrary packets */
 #if CONFIG_SPATIAL_SVC
       size_t layer_sizes[VPX_SS_MAX_LAYERS];
+      struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS];
 #endif
 
       /* This packet size is fixed to allow codecs to extend this
@@ -324,6 +330,21 @@
      */
     unsigned int           g_h;
 
+    /*!\brief Bit-depth of the codec
+     *
+     * This value identifies the bit depth of the codec.
+     * Only certain bit depths are supported, as identified in the
+     * vpx_bit_depth_t enum.
+     */
+    vpx_bit_depth_t        g_bit_depth;
+
+    /*!\brief Bit-depth of the input frames
+     *
+     * This value identifies the bit depth of the input frames, in bits.
+     * Note that the frames passed as input to the encoder must have
+     * this bit-depth.
+     */
+    unsigned int           g_input_bit_depth;
 
     /*!\brief Stream timebase units
      *
@@ -452,14 +473,14 @@
      * A buffer containing all of the stats packets produced in the first
      * pass, concatenated.
      */
-    struct vpx_fixed_buf   rc_twopass_stats_in;
+    vpx_fixed_buf_t   rc_twopass_stats_in;
 
     /*!\brief first pass mb stats buffer.
      *
      * A buffer containing all of the first pass mb stats packets produced
      * in the first pass, concatenated.
      */
-    struct vpx_fixed_buf   rc_firstpass_mb_stats_in;
+    vpx_fixed_buf_t   rc_firstpass_mb_stats_in;
 
     /*!\brief Target data rate
      *
@@ -715,7 +736,7 @@
    */
   vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t      *ctx,
                                          vpx_codec_iface_t    *iface,
-                                         vpx_codec_enc_cfg_t  *cfg,
+                                         const vpx_codec_enc_cfg_t *cfg,
                                          vpx_codec_flags_t     flags,
                                          int                   ver);
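
A sketch of wiring the new high-bitdepth pieces together for a 10-bit
encode. The flag and field names come from this header; the failure mode
is an assumption, mirroring how other capability flags are enforced:

#include "vpx/vpx_encoder.h"

static vpx_codec_err_t init_10bit(vpx_codec_ctx_t *ctx,
                                  vpx_codec_iface_t *iface) {
  vpx_codec_enc_cfg_t cfg;
  vpx_codec_err_t res = vpx_codec_enc_config_default(iface, &cfg, 0);
  if (res != VPX_CODEC_OK) return res;
  cfg.g_bit_depth = VPX_BITS_10;  /* codec-internal bit depth */
  cfg.g_input_bit_depth = 10;     /* bit depth of the frames passed in */
  /* expected to fail with VPX_CODEC_INCAPABLE if the iface lacks
   * VPX_CODEC_CAP_HIGHBITDEPTH (assumption) */
  return vpx_codec_enc_init(ctx, iface, &cfg, VPX_CODEC_USE_HIGHBITDEPTH);
}
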
 
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index 7b04b70..ef6d1dd 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -31,10 +31,10 @@
 #define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/
 
 
-#define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format */
-#define VPX_IMG_FMT_UV_FLIP    0x200  /**< V plane precedes U plane in memory */
-#define VPX_IMG_FMT_HAS_ALPHA  0x400  /**< Image has an alpha channel component */
-#define VPX_IMG_FMT_HIGH       0x800  /**< Image uses 16bit framebuffer */
+#define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format. */
+#define VPX_IMG_FMT_UV_FLIP    0x200  /**< V plane precedes U in memory. */
+#define VPX_IMG_FMT_HAS_ALPHA  0x400  /**< Image has an alpha channel. */
+#define VPX_IMG_FMT_HIGHBITDEPTH 0x800  /**< Image uses 16bit framebuffer. */
 
   /*!\brief List of supported image formats */
   typedef enum vpx_img_fmt {
@@ -59,45 +59,11 @@
     VPX_IMG_FMT_I422    = VPX_IMG_FMT_PLANAR | 5,
     VPX_IMG_FMT_I444    = VPX_IMG_FMT_PLANAR | 6,
     VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7,
-    VPX_IMG_FMT_I42016    = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGH,
-    VPX_IMG_FMT_I42216    = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGH,
-    VPX_IMG_FMT_I44416    = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGH
+    VPX_IMG_FMT_I42016    = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH,
+    VPX_IMG_FMT_I42216    = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH,
+    VPX_IMG_FMT_I44416    = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH
   } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
-#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
-#define IMG_FMT_PLANAR         VPX_IMG_FMT_PLANAR     /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
-#define IMG_FMT_UV_FLIP        VPX_IMG_FMT_UV_FLIP    /**< \deprecated Use #VPX_IMG_FMT_UV_FLIP */
-#define IMG_FMT_HAS_ALPHA      VPX_IMG_FMT_HAS_ALPHA  /**< \deprecated Use #VPX_IMG_FMT_HAS_ALPHA */
-
-  /*!\brief Deprecated list of supported image formats
-   * \deprecated New code should use #vpx_img_fmt
-   */
-#define img_fmt   vpx_img_fmt
-  /*!\brief alias for enum img_fmt.
-   * \deprecated New code should use #vpx_img_fmt_t
-   */
-#define img_fmt_t vpx_img_fmt_t
-
-#define IMG_FMT_NONE       VPX_IMG_FMT_NONE       /**< \deprecated Use #VPX_IMG_FMT_NONE */
-#define IMG_FMT_RGB24      VPX_IMG_FMT_RGB24      /**< \deprecated Use #VPX_IMG_FMT_RGB24 */
-#define IMG_FMT_RGB32      VPX_IMG_FMT_RGB32      /**< \deprecated Use #VPX_IMG_FMT_RGB32 */
-#define IMG_FMT_RGB565     VPX_IMG_FMT_RGB565     /**< \deprecated Use #VPX_IMG_FMT_RGB565 */
-#define IMG_FMT_RGB555     VPX_IMG_FMT_RGB555     /**< \deprecated Use #VPX_IMG_FMT_RGB555 */
-#define IMG_FMT_UYVY       VPX_IMG_FMT_UYVY       /**< \deprecated Use #VPX_IMG_FMT_UYVY */
-#define IMG_FMT_YUY2       VPX_IMG_FMT_YUY2       /**< \deprecated Use #VPX_IMG_FMT_YUY2 */
-#define IMG_FMT_YVYU       VPX_IMG_FMT_YVYU       /**< \deprecated Use #VPX_IMG_FMT_YVYU */
-#define IMG_FMT_BGR24      VPX_IMG_FMT_BGR24      /**< \deprecated Use #VPX_IMG_FMT_BGR24 */
-#define IMG_FMT_RGB32_LE   VPX_IMG_FMT_RGB32_LE   /**< \deprecated Use #VPX_IMG_FMT_RGB32_LE */
-#define IMG_FMT_ARGB       VPX_IMG_FMT_ARGB       /**< \deprecated Use #VPX_IMG_FMT_ARGB */
-#define IMG_FMT_ARGB_LE    VPX_IMG_FMT_ARGB_LE    /**< \deprecated Use #VPX_IMG_FMT_ARGB_LE */
-#define IMG_FMT_RGB565_LE  VPX_IMG_FMT_RGB565_LE  /**< \deprecated Use #VPX_IMG_FMT_RGB565_LE */
-#define IMG_FMT_RGB555_LE  VPX_IMG_FMT_RGB555_LE  /**< \deprecated Use #VPX_IMG_FMT_RGB555_LE */
-#define IMG_FMT_YV12       VPX_IMG_FMT_YV12       /**< \deprecated Use #VPX_IMG_FMT_YV12 */
-#define IMG_FMT_I420       VPX_IMG_FMT_I420       /**< \deprecated Use #VPX_IMG_FMT_I420 */
-#define IMG_FMT_VPXYV12    VPX_IMG_FMT_VPXYV12    /**< \deprecated Use #VPX_IMG_FMT_VPXYV12 */
-#define IMG_FMT_VPXI420    VPX_IMG_FMT_VPXI420    /**< \deprecated Use #VPX_IMG_FMT_VPXI420 */
-#endif /* VPX_CODEC_DISABLE_COMPAT */
-
   /**\brief Image Descriptor */
   typedef struct vpx_image {
     vpx_img_fmt_t fmt; /**< Image Format */
@@ -121,13 +87,6 @@
 #define VPX_PLANE_U      1   /**< U (Chroma) plane */
 #define VPX_PLANE_V      2   /**< V (Chroma) plane */
 #define VPX_PLANE_ALPHA  3   /**< A (Transparency) plane */
-#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
-#define PLANE_PACKED     VPX_PLANE_PACKED
-#define PLANE_Y          VPX_PLANE_Y
-#define PLANE_U          VPX_PLANE_U
-#define PLANE_V          VPX_PLANE_V
-#define PLANE_ALPHA      VPX_PLANE_ALPHA
-#endif
     unsigned char *planes[4];  /**< pointer to the top left pixel for each plane */
     int      stride[4];  /**< stride between rows for each plane */
 
diff --git a/vpx_mem/vpx_mem.c b/vpx_mem/vpx_mem.c
index 059248b..da61642 100644
--- a/vpx_mem/vpx_mem.c
+++ b/vpx_mem/vpx_mem.c
@@ -16,6 +16,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "include/vpx_mem_intrnl.h"
+#include "vpx/vpx_integer.h"
 
 #if CONFIG_MEM_TRACKER
 #ifndef VPX_NO_GLOBALS
@@ -452,6 +453,29 @@
   return VPX_MEMSET_L(dest, val, length);
 }
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+void *vpx_memset16(void *dest, int val, size_t length) {
+  size_t i;
+  void *orig = dest;
+  uint16_t *dest16 = dest;
+#if CONFIG_MEM_CHECKS
+  if ((int)dest < 0x4000) {
+    _P(printf("WARNING: vpx_memset16 dest:0x%x val:%d len:%d\n",
+              (int)dest, val, (int)length);)
+
+#if defined(VXWORKS)
+    sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
+
+    vx_sleep(10000);
+#endif
+  }
+#endif
+  for (i = 0; i < length; i++)
+    *dest16++ = (uint16_t)val;
+  return orig;
+}
+#endif  // CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+
 void *vpx_memmove(void *dest, const void *src, size_t count) {
 #if CONFIG_MEM_CHECKS
 
diff --git a/vpx_mem/vpx_mem.h b/vpx_mem/vpx_mem.h
index 33686b2..e2391f4 100644
--- a/vpx_mem/vpx_mem.h
+++ b/vpx_mem/vpx_mem.h
@@ -73,6 +73,9 @@
 
   void *vpx_memcpy(void *dest, const void *src, size_t length);
   void *vpx_memset(void *dest, int val, size_t length);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  void *vpx_memset16(void *dest, int val, size_t length);
+#endif
   void *vpx_memmove(void *dest, const void *src, size_t count);
 
   /* special memory functions */
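
vpx_memset16() takes its length in uint16_t elements, not bytes, matching
the element-wise loop in the vpx_mem.c hunk above. A small usage sketch
with a hypothetical helper:

#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"

static void fill_row_10bit(uint16_t *row, int width) {
  vpx_memset16(row, 512, width);  /* 512 is mid-gray at 10-bit depth */
}
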
diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c
index fa0e030..f03feff 100644
--- a/vpx_ports/arm_cpudetect.c
+++ b/vpx_ports/arm_cpudetect.c
@@ -10,7 +10,8 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include "arm.h"
+#include "vpx_ports/arm.h"
+#include "./vpx_config.h"
 
 #ifdef WINAPI_FAMILY
 #include <winapifamily.h>
@@ -54,9 +55,9 @@
 #if HAVE_MEDIA
   flags |= HAS_MEDIA;
 #endif /* HAVE_MEDIA */
-#if HAVE_NEON
+#if HAVE_NEON || HAVE_NEON_ASM
   flags |= HAS_NEON;
-#endif /* HAVE_NEON */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
   return flags & mask;
 }
 
@@ -87,6 +88,7 @@
       /*Ignore exception.*/
     }
   }
+#endif /* HAVE_EDSP */
 #if HAVE_MEDIA
   if (mask & HAS_MEDIA)
     __try {
@@ -97,7 +99,8 @@
     /*Ignore exception.*/
   }
 }
-#if HAVE_NEON
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON || HAVE_NEON_ASM
 if (mask &HAS_NEON) {
   __try {
     /*VORR q0,q0,q0*/
@@ -107,9 +110,7 @@
     /*Ignore exception.*/
   }
 }
-#endif /* HAVE_NEON */
-#endif /* HAVE_MEDIA */
-#endif /* HAVE_EDSP */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
 return flags & mask;
 }
 
@@ -132,10 +133,10 @@
 #if HAVE_MEDIA
   flags |= HAS_MEDIA;
 #endif /* HAVE_MEDIA */
-#if HAVE_NEON
+#if HAVE_NEON || HAVE_NEON_ASM
   if (features & ANDROID_CPU_ARM_FEATURE_NEON)
     flags |= HAS_NEON;
-#endif /* HAVE_NEON */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
   return flags & mask;
 }
 
@@ -162,7 +163,7 @@
      */
     char buf[512];
     while (fgets(buf, 511, fin) != NULL) {
-#if HAVE_EDSP || HAVE_NEON
+#if HAVE_EDSP || HAVE_NEON || HAVE_NEON_ASM
       if (memcmp(buf, "Features", 8) == 0) {
         char *p;
 #if HAVE_EDSP
@@ -170,15 +171,15 @@
         if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
           flags |= HAS_EDSP;
         }
-#if HAVE_NEON
+#endif /* HAVE_EDSP */
+#if HAVE_NEON || HAVE_NEON_ASM
         p = strstr(buf, " neon");
         if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
           flags |= HAS_NEON;
         }
-#endif /* HAVE_NEON */
-#endif /* HAVE_EDSP */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
       }
-#endif /* HAVE_EDSP || HAVE_NEON */
+#endif /* HAVE_EDSP || HAVE_NEON || HAVE_NEON_ASM */
 #if HAVE_MEDIA
       if (memcmp(buf, "CPU architecture:", 17) == 0) {
         int version;
diff --git a/vpx_ports/vpx_timer.h b/vpx_ports/vpx_timer.h
index 870338b..dd98e29 100644
--- a/vpx_ports/vpx_timer.h
+++ b/vpx_ports/vpx_timer.h
@@ -11,6 +11,9 @@
 
 #ifndef VPX_PORTS_VPX_TIMER_H_
 #define VPX_PORTS_VPX_TIMER_H_
+
+#include "./vpx_config.h"
+
 #include "vpx/vpx_integer.h"
 
 #if CONFIG_OS_SUPPORT
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index 827bce7..70d7ac0 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -13,6 +13,9 @@
 #include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_mem/vpx_mem.h"
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
 
 /****************************************************************************
 *  Exports
@@ -136,7 +139,11 @@
 
 int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                              int width, int height,
-                             int ss_x, int ss_y, int border,
+                             int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                             int use_highbitdepth,
+#endif
+                             int border,
                              vpx_codec_frame_buffer_t *fb,
                              vpx_get_frame_buffer_cb_fn_t cb,
                              void *cb_priv) {
@@ -161,11 +168,21 @@
     const int alpha_border_h = border;
     const uint64_t alpha_plane_size = (alpha_height + 2 * alpha_border_h) *
                                       (uint64_t)alpha_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint64_t frame_size = (1 + use_highbitdepth) *
+        (yplane_size + 2 * uvplane_size + alpha_plane_size);
+#else
     const uint64_t frame_size = yplane_size + 2 * uvplane_size +
                                 alpha_plane_size;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#else
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint64_t frame_size =
+        (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
 #else
     const uint64_t frame_size = yplane_size + 2 * uvplane_size;
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_ALPHA
     if (cb != NULL) {
       const int align_addr_extra_size = 31;
       const uint64_t external_frame_size = frame_size + align_addr_extra_size;
@@ -231,11 +248,31 @@
     ybf->border = border;
     ybf->frame_size = (int)frame_size;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (use_highbitdepth) {
+      // Store uint16 addresses when using 16bit framebuffers
+      uint8_t *p = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
+      ybf->y_buffer = p + (border * y_stride) + border;
+      ybf->u_buffer = p + yplane_size +
+          (uv_border_h * uv_stride) + uv_border_w;
+      ybf->v_buffer = p + yplane_size + uvplane_size +
+          (uv_border_h * uv_stride) + uv_border_w;
+      ybf->flags = YV12_FLAG_HIGHBITDEPTH;
+    } else {
+      ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
+      ybf->u_buffer = ybf->buffer_alloc + yplane_size +
+          (uv_border_h * uv_stride) + uv_border_w;
+      ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
+          (uv_border_h * uv_stride) + uv_border_w;
+      ybf->flags = 0;
+    }
+#else
     ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
     ybf->u_buffer = ybf->buffer_alloc + yplane_size +
                     (uv_border_h * uv_stride) + uv_border_w;
     ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
                     (uv_border_h * uv_stride) + uv_border_w;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_ALPHA
     ybf->alpha_width = alpha_width;
@@ -252,11 +289,18 @@
 
 int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                            int width, int height,
-                           int ss_x, int ss_y, int border) {
+                           int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           int use_highbitdepth,
+#endif
+                           int border) {
   if (ybf) {
     vp9_free_frame_buffer(ybf);
-    return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border,
-                                    NULL, NULL, NULL);
+    return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                    use_highbitdepth,
+#endif
+                                    border, NULL, NULL, NULL);
   }
   return -2;
 }
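
A call-site sketch for the widened allocator signature; the conditional
argument mirrors how the parameter itself is compiled in and out above,
and the border constant is assumed from yv12config.h:

#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"

static int demo_alloc(void) {
  YV12_BUFFER_CONFIG buf;
  vpx_memset(&buf, 0, sizeof(buf));
  return vp9_alloc_frame_buffer(&buf, 1280, 720, /*ss_x=*/1, /*ss_y=*/1,
#if CONFIG_VP9_HIGHBITDEPTH
                                /*use_highbitdepth=*/1,
#endif
                                VP9BORDERINPIXELS);
}
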
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index 036a505..0485452 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -13,6 +13,9 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/yv12config.h"
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
 
 static void extend_plane(uint8_t *const src, int src_stride,
                          int width, int height,
@@ -55,6 +58,50 @@
   }
 }
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static void extend_plane_high(uint8_t *const src8, int src_stride,
+                              int width, int height,
+                              int extend_top, int extend_left,
+                              int extend_bottom, int extend_right) {
+  int i;
+  const int linesize = extend_left + extend_right + width;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
+  /* copy the left and right most columns out */
+  uint16_t *src_ptr1 = src;
+  uint16_t *src_ptr2 = src + width - 1;
+  uint16_t *dst_ptr1 = src - extend_left;
+  uint16_t *dst_ptr2 = src + width;
+
+  for (i = 0; i < height; ++i) {
+    vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
+    vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_stride;
+    src_ptr2 += src_stride;
+    dst_ptr1 += src_stride;
+    dst_ptr2 += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = src - extend_left;
+  src_ptr2 = src + src_stride * (height - 1) - extend_left;
+  dst_ptr1 = src + src_stride * -extend_top - extend_left;
+  dst_ptr2 = src + src_stride * height - extend_left;
+
+  for (i = 0; i < extend_top; ++i) {
+    vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+    dst_ptr1 += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; ++i) {
+    vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+    dst_ptr2 += src_stride;
+  }
+}
+#endif
+
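
extend_plane_high() receives the same uint8_t * plane pointers as the
8-bit path even though the storage is uint16_t: CONVERT_TO_SHORTPTR and
CONVERT_TO_BYTEPTR (from vp9_common.h, included above) translate between
the two address spaces. A sketch of that convention as inferred from the
call sites, so treat the shift-by-one encoding as an assumption:

/* a "byte pointer" is the real uint16_t address divided by two, letting
 * callers add offsets in pixel units before converting back */
#define CONVERT_TO_BYTEPTR(x)  ((uint8_t *)(((uintptr_t)(x)) >> 1))
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
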
 void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
   const int uv_border = ybf->border / 2;
 
@@ -64,6 +111,31 @@
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(
+        ybf->y_buffer, ybf->y_stride,
+        ybf->y_crop_width, ybf->y_crop_height,
+        ybf->border, ybf->border,
+        ybf->border + ybf->y_height - ybf->y_crop_height,
+        ybf->border + ybf->y_width - ybf->y_crop_width);
+
+    extend_plane_high(
+        ybf->u_buffer, ybf->uv_stride,
+        (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
+        ybf->border / 2, ybf->border / 2,
+        (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
+        (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+
+    extend_plane_high(
+        ybf->v_buffer, ybf->uv_stride,
+        (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
+        ybf->border / 2, ybf->border / 2,
+        (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
+        (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+    return;
+  }
+#endif
   extend_plane(ybf->y_buffer, ybf->y_stride,
                ybf->y_crop_width, ybf->y_crop_height,
                ybf->border, ybf->border,
@@ -99,6 +171,20 @@
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(ybf->y_buffer, ybf->y_stride,
+                      ybf->y_crop_width, ybf->y_crop_height,
+                      ext_size, ext_size,
+                      ext_size + ybf->y_height - ybf->y_crop_height,
+                      ext_size + ybf->y_width - ybf->y_crop_width);
+    extend_plane_high(ybf->u_buffer, ybf->uv_stride,
+                      c_w, c_h, c_et, c_el, c_eb, c_er);
+    extend_plane_high(ybf->v_buffer, ybf->uv_stride,
+                      c_w, c_h, c_et, c_el, c_eb, c_er);
+    return;
+  }
+#endif
   extend_plane(ybf->y_buffer, ybf->y_stride,
                ybf->y_crop_width, ybf->y_crop_height,
                ext_size, ext_size,
@@ -121,6 +207,14 @@
                        VP9INNERBORDERINPIXELS : ybf->border;
   extend_frame(ybf, inner_bw);
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
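+// Copies num 16-bit samples between buffers addressed via
+// CONVERT_TO_SHORTPTR().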
+void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  vpx_memcpy(dst, src, num * sizeof(uint16_t));
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_VP9
 
 // Copies the source image into the destination image and updates the
@@ -140,6 +234,40 @@
   assert(src_ybc->y_height == dst_ybc->y_height);
 #endif
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH);
+    for (row = 0; row < src_ybc->y_height; ++row) {
+      memcpy_short_addr(dst, src, src_ybc->y_width);
+      src += src_ybc->y_stride;
+      dst += dst_ybc->y_stride;
+    }
+
+    src = src_ybc->u_buffer;
+    dst = dst_ybc->u_buffer;
+
+    for (row = 0; row < src_ybc->uv_height; ++row) {
+      memcpy_short_addr(dst, src, src_ybc->uv_width);
+      src += src_ybc->uv_stride;
+      dst += dst_ybc->uv_stride;
+    }
+
+    src = src_ybc->v_buffer;
+    dst = dst_ybc->v_buffer;
+
+    for (row = 0; row < src_ybc->uv_height; ++row) {
+      memcpy_short_addr(dst, src, src_ybc->uv_width);
+      src += src_ybc->uv_stride;
+      dst += dst_ybc->uv_stride;
+    }
+
+    vp8_yv12_extend_frame_borders_c(dst_ybc);
+    return;
+  } else {
+    assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH));
+  }
+#endif
+
   for (row = 0; row < src_ybc->y_height; ++row) {
     vpx_memcpy(dst, src, src_ybc->y_width);
     src += src_ybc->y_stride;
@@ -173,6 +301,19 @@
   const uint8_t *src = src_ybc->y_buffer;
   uint8_t *dst = dst_ybc->y_buffer;
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+    for (row = 0; row < src_ybc->y_height; ++row) {
+      vpx_memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
+      src16 += src_ybc->y_stride;
+      dst16 += dst_ybc->y_stride;
+    }
+    return;
+  }
+#endif
+
   for (row = 0; row < src_ybc->y_height; ++row) {
     vpx_memcpy(dst, src, src_ybc->y_width);
     src += src_ybc->y_stride;
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index cdde75c..9ff764c 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -15,6 +15,7 @@
 extern "C" {
 #endif
 
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_frame_buffer.h"
 #include "vpx/vpx_integer.h"
 
@@ -50,11 +51,14 @@
   int buffer_alloc_sz;
   int border;
   int frame_size;
+  unsigned int bit_depth;
 
   int corrupted;
   int flags;
 } YV12_BUFFER_CONFIG;
 
+#define YV12_FLAG_HIGHBITDEPTH 1
+
 int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                 int width, int height, int border);
 int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
@@ -63,6 +67,9 @@
 
 int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                            int width, int height, int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           int use_highbitdepth,
+#endif
                            int border);
 
 // Updates the yv12 buffer config with the frame buffer. If cb is not
@@ -73,6 +80,9 @@
 // on failure.
 int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                              int width, int height, int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                             int use_highbitdepth,
+#endif
                              int border,
                              vpx_codec_frame_buffer_t *fb,
                              vpx_get_frame_buffer_cb_fn_t cb,
diff --git a/vpxdec.c b/vpxdec.c
index faee42a..091522f 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -15,13 +15,15 @@
 #include <string.h>
 #include <limits.h>
 
+#include "./vpx_config.h"
+
+#if CONFIG_LIBYUV
 #include "third_party/libyuv/include/libyuv/scale.h"
+#endif
 
 #include "./args.h"
 #include "./ivfdec.h"
 
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/vpx_timer.h"
@@ -87,12 +89,20 @@
 
 static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
                                         "Compute the MD5 sum of the decoded frame");
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static const arg_def_t outbitdeptharg = ARG_DEF(
+    NULL, "output-bit-depth", 1,
+    "Output bit-depth for decoded frames");
+#endif
 
 static const arg_def_t *all_args[] = {
   &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg,
   &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
   &threadsarg, &verbosearg, &scalearg, &fb_arg,
   &md5arg, &error_concealment, &continuearg,
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  &outbitdeptharg,
+#endif
   NULL
 };
 
@@ -123,8 +133,29 @@
 };
 #endif
 
+#if CONFIG_LIBYUV
 static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
                                   FilterModeEnum mode) {
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
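+  // Plane strides are stored in bytes, but libyuv's I420Scale_16() expects
+  // them in 16-bit samples, hence the divisions by 2 below.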
+  if (src->fmt == VPX_IMG_FMT_I42016) {
+    assert(dst->fmt == VPX_IMG_FMT_I42016);
+    return I420Scale_16((uint16_t*)src->planes[VPX_PLANE_Y],
+                        src->stride[VPX_PLANE_Y]/2,
+                        (uint16_t*)src->planes[VPX_PLANE_U],
+                        src->stride[VPX_PLANE_U]/2,
+                        (uint16_t*)src->planes[VPX_PLANE_V],
+                        src->stride[VPX_PLANE_V]/2,
+                        src->d_w, src->d_h,
+                        (uint16_t*)dst->planes[VPX_PLANE_Y],
+                        dst->stride[VPX_PLANE_Y]/2,
+                        (uint16_t*)dst->planes[VPX_PLANE_U],
+                        dst->stride[VPX_PLANE_U]/2,
+                        (uint16_t*)dst->planes[VPX_PLANE_V],
+                        dst->stride[VPX_PLANE_V]/2,
+                        dst->d_w, dst->d_h,
+                        mode);
+  }
+#endif
   assert(src->fmt == VPX_IMG_FMT_I420);
   assert(dst->fmt == VPX_IMG_FMT_I420);
   return I420Scale(src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y],
@@ -137,6 +168,7 @@
                    dst->d_w, dst->d_h,
                    mode);
 }
+#endif
 
 void usage_exit() {
   int i;
@@ -260,6 +292,11 @@
 static void write_image_file(const vpx_image_t *img, const int planes[3],
                              FILE *file) {
   int i, y;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  const int bytes_per_sample = ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+#else
+  const int bytes_per_sample = 1;
+#endif
 
   for (i = 0; i < 3; ++i) {
     const int plane = planes[i];
@@ -269,7 +306,7 @@
     const int h = vpx_img_plane_height(img, plane);
 
     for (y = 0; y < h; ++y) {
-      fwrite(buf, 1, w, file);
+      fwrite(buf, bytes_per_sample, w, file);
       buf += stride;
     }
   }
@@ -489,6 +526,178 @@
   }
 }
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
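+// Upshifts a high-bitdepth image into another of the same format by
+// input_shift bits, adding a rounding offset of 1 << (input_shift - 1).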
+static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                             int input_shift) {
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) : 0;
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt || input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w >>= src->x_chroma_shift;
+      h >>= src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+                                     y * src->stride[plane]);
+      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+                                     y * dst->stride[plane]);
+      for (x = 0; x < w; x++)
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+    }
+  }
+}
+
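+// Upshifts an 8-bit image into a high-bitdepth destination of the same
+// layout; dst->fmt must be src->fmt + VPX_IMG_FMT_HIGHBITDEPTH.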
+static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                            int input_shift) {
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) : 0;
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+      input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w >>= src->x_chroma_shift;
+      h >>= src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+                                     y * dst->stride[plane]);
+      for (x = 0; x < w; x++) {
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+      }
+    }
+  }
+}
+
+static void img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                        int input_shift) {
+  if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    high_img_upshift(dst, src, input_shift);
+  } else {
+    low_img_upshift(dst, src, input_shift);
+  }
+}
+
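+// Downshifts a high-bitdepth image into a destination of the same format,
+// discarding the low down_shift bits of each sample.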
+static void high_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                               int down_shift) {
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt || down_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w >>= src->x_chroma_shift;
+      h >>= src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+                                     y * src->stride[plane]);
+      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+                                     y * dst->stride[plane]);
+      for (x = 0; x < w; x++)
+        *p_dst++ = *p_src++ >> down_shift;
+    }
+  }
+}
+
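+// Downshifts a high-bitdepth image into an 8-bit destination;
+// src->fmt must be dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH.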
+static void low_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                              int down_shift) {
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+      down_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (dst->fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w >>= src->x_chroma_shift;
+      h >>= src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+                                     y * src->stride[plane]);
+      uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+      for (x = 0; x < w; x++) {
+        *p_dst++ = *p_src++ >> down_shift;
+      }
+    }
+  }
+}
+
+static void img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                          int down_shift) {
+  if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    high_img_downshift(dst, src, down_shift);
+  } else {
+    low_img_downshift(dst, src, down_shift);
+  }
+}
+#endif
+
 int main_loop(int argc, const char **argv_) {
   vpx_codec_ctx_t       decoder;
   char                  *fn = NULL;
@@ -513,6 +722,9 @@
   int                     opt_yv12 = 0;
   int                     opt_i420 = 0;
   vpx_codec_dec_cfg_t     cfg = {0, 0, 0};
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  int                     output_bit_depth = 0;
+#endif
 #if CONFIG_VP8_DECODER
   vp8_postproc_cfg_t      vp8_pp_cfg = {0};
   int                     vp8_dbg_color_ref_frame = 0;
@@ -524,6 +736,9 @@
   int                     dec_flags = 0;
   int                     do_scale = 0;
   vpx_image_t             *scaled_img = NULL;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  vpx_image_t             *img_shifted = NULL;
+#endif
   int                     frame_avail, got_data;
   int                     num_external_frame_buffers = 0;
   struct ExternalFrameBufferList ext_fb_list = {0, NULL};
@@ -538,7 +753,8 @@
   struct VpxDecInputContext input = {NULL, NULL};
   struct VpxInputContext vpx_input_ctx;
 #if CONFIG_WEBM_IO
-  struct WebmInputContext webm_ctx = {0};
+  struct WebmInputContext webm_ctx;
+  memset(&webm_ctx, 0, sizeof(webm_ctx));
   input.webm_ctx = &webm_ctx;
 #endif
   input.vpx_input_ctx = &vpx_input_ctx;
@@ -563,6 +779,9 @@
       use_y4m = 0;
       flipuv = 1;
       opt_yv12 = 1;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+      output_bit_depth = 8;  // For yv12, 8-bit output is assumed.
+#endif
     } else if (arg_match(&arg, &use_i420, argi)) {
       use_y4m = 0;
       flipuv = 0;
@@ -593,7 +812,13 @@
       do_scale = 1;
     else if (arg_match(&arg, &fb_arg, argi))
       num_external_frame_buffers = arg_parse_uint(&arg);
-
+    else if (arg_match(&arg, &continuearg, argi))
+      keep_going = 1;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+    else if (arg_match(&arg, &outbitdeptharg, argi)) {
+      output_bit_depth = arg_parse_uint(&arg);
+    }
+#endif
 #if CONFIG_VP8_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
       postproc = 1;
@@ -643,11 +868,8 @@
       }
     } else if (arg_match(&arg, &error_concealment, argi)) {
       ec_enabled = 1;
-    } else if (arg_match(&arg, &continuearg, argi)) {
-      keep_going = 1;
     }
-
-#endif
+#endif  // CONFIG_VP8_DECODER
     else
       argj++;
   }
@@ -883,7 +1105,7 @@
               display_height = display_size[1];
             }
           }
-          scaled_img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, display_width,
+          scaled_img = vpx_img_alloc(NULL, img->fmt, display_width,
                                      display_height, 16);
           scaled_img->bit_depth = img->bit_depth;
         }
@@ -901,6 +1123,33 @@
 #endif
         }
       }
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+      // Default to codec bit depth if output bit depth not set
+      if (!output_bit_depth) {
+        output_bit_depth = img->bit_depth;
+      }
+      // Shift up or down if necessary
+      if (output_bit_depth != img->bit_depth) {
+        if (!img_shifted) {
+          if (output_bit_depth == 8) {
+            img_shifted = vpx_img_alloc(
+                NULL, img->fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+                img->d_w, img->d_h, 16);
+          } else {
+            img_shifted = vpx_img_alloc(
+                NULL, img->fmt | VPX_IMG_FMT_HIGHBITDEPTH,
+                img->d_w, img->d_h, 16);
+          }
+          img_shifted->bit_depth = output_bit_depth;
+        }
+        if (output_bit_depth > img->bit_depth) {
+          img_upshift(img_shifted, img, output_bit_depth - img->bit_depth);
+        } else {
+          img_downshift(img_shifted, img, img->bit_depth - output_bit_depth);
+        }
+        img = img_shifted;
+      }
+#endif
 
       if (single_file) {
         if (use_y4m) {
@@ -1007,6 +1256,9 @@
     free(buf);
 
   if (scaled_img) vpx_img_free(scaled_img);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  if (img_shifted) vpx_img_free(img_shifted);
+#endif
 
   for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) {
     free(ext_fb_list.ext_fb[i].data);
diff --git a/vpxenc.c b/vpxenc.c
index 7e037a6..5afca24 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -19,12 +19,15 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
 #include "vpx/vpx_encoder.h"
 #if CONFIG_DECODERS
 #include "vpx/vpx_decoder.h"
 #endif
 
-#include "third_party/libyuv/include/libyuv/scale.h"
 #include "./args.h"
 #include "./ivfenc.h"
 #include "./tools_common.h"
@@ -197,6 +200,10 @@
     ARG_DEF(NULL, "experimental-bitstream", 0,
             "Allow experimental bitstream features.");
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static const arg_def_t test16bitinternalarg = ARG_DEF(
+    NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer");
+#endif
 
 static const arg_def_t *main_args[] = {
   &debugmode,
@@ -245,6 +252,9 @@
 #endif
   &timebase, &framerate,
   &error_resilient,
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  &test16bitinternalarg,
+#endif
   &lag_in_frames, NULL
 };
 
@@ -318,7 +328,7 @@
 static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
                                             "Noise sensitivity (frames to blur)");
 static const arg_def_t sharpness = ARG_DEF(NULL, "sharpness", 1,
-                                           "Filter sharpness (0-7)");
+                                           "Loop filter sharpness (0..7)");
 static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1,
                                                "Motion detection threshold");
 static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
@@ -326,11 +336,11 @@
 static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
                                              "Enable automatic alt reference frames");
 static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
-                                                "AltRef Max Frames");
+                                                "AltRef max frames (0..15)");
 static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1,
-                                               "AltRef Strength");
+                                               "AltRef filter strength (0..6)");
 static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1,
-                                           "AltRef Type");
+                                           "AltRef type");
 static const struct arg_enum_list tuning_enum[] = {
   {"psnr", VP8_TUNE_PSNR},
   {"ssim", VP8_TUNE_SSIM},
@@ -375,9 +385,26 @@
     "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
     "3: cyclic refresh)");
 static const arg_def_t frame_periodic_boost = ARG_DEF(
-    NULL, "frame_boost", 1,
+    NULL, "frame-boost", 1,
     "Enable frame periodic boost (0: off (default), 1: on)");
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static const struct arg_enum_list bitdepth_enum[] = {
+  {"8",  VPX_BITS_8},
+  {"10", VPX_BITS_10},
+  {"12", VPX_BITS_12},
+  {NULL, 0}
+};
+
+static const arg_def_t bitdeptharg   = ARG_DEF_ENUM("b", "bit-depth", 1,
+                                                    "Bit depth for codec "
+                                                    "(8 for version <=1, "
+                                                    "10 or 12 for version 2)",
+                                                    bitdepth_enum);
+static const arg_def_t inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1,
+                                               "Bit depth of input");
+#endif
+
 static const struct arg_enum_list tune_content_enum[] = {
   {"default", VP9E_CONTENT_DEFAULT},
   {"screen", VP9E_CONTENT_SCREEN},
@@ -392,6 +419,9 @@
   &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
   &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless,
   &frame_parallel_decoding, &aq_mode, &frame_periodic_boost, &tune_content,
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  &bitdeptharg, &inbitdeptharg,
+#endif
   NULL
 };
 static const int vp9_arg_ctrl_map[] = {
@@ -447,6 +477,102 @@
 }
 
 #define mmin(a, b)  ((a) < (b) ? (a) : (b))
+
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
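+// 16-bit counterpart of find_mismatch(): scans each plane in 64x64 blocks
+// and records the location and sample values of the first mismatch for
+// Y, U and V in yloc, uloc and vloc respectively.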
+static void find_mismatch_high(const vpx_image_t *const img1,
+                               const vpx_image_t *const img2,
+                               int yloc[4], int uloc[4], int vloc[4]) {
+  uint16_t *plane1, *plane2;
+  uint32_t stride1, stride2;
+  const uint32_t bsize = 64;
+  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  int match = 1;
+  uint32_t i, j;
+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+  plane1 = (uint16_t*)img1->planes[VPX_PLANE_Y];
+  plane2 = (uint16_t*)img2->planes[VPX_PLANE_Y];
+  stride1 = img1->stride[VPX_PLANE_Y]/2;
+  stride2 = img2->stride[VPX_PLANE_Y]/2;
+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+    for (j = 0; match && j < img1->d_w; j += bsize) {
+      int k, l;
+      const int si = mmin(i + bsize, img1->d_h) - i;
+      const int sj = mmin(j + bsize, img1->d_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            yloc[0] = i + k;
+            yloc[1] = j + l;
+            yloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            yloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+  plane1 = (uint16_t*)img1->planes[VPX_PLANE_U];
+  plane2 = (uint16_t*)img2->planes[VPX_PLANE_U];
+  stride1 = img1->stride[VPX_PLANE_U]/2;
+  stride2 = img2->stride[VPX_PLANE_U]/2;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h) - i;
+      const int sj = mmin(j + bsizex, c_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            uloc[0] = i + k;
+            uloc[1] = j + l;
+            uloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            uloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+  plane1 = (uint16_t*)img1->planes[VPX_PLANE_V];
+  plane2 = (uint16_t*)img2->planes[VPX_PLANE_V];
+  stride1 = img1->stride[VPX_PLANE_V]/2;
+  stride2 = img2->stride[VPX_PLANE_V]/2;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h) - i;
+      const int sj = mmin(j + bsizex, c_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            vloc[0] = i + k;
+            vloc[1] = j + l;
+            vloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            vloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+#endif
+
 static void find_mismatch(const vpx_image_t *const img1,
                           const vpx_image_t *const img2,
                           int yloc[4], int uloc[4], int vloc[4]) {
@@ -539,7 +665,8 @@
 
 static int compare_img(const vpx_image_t *const img1,
                        const vpx_image_t *const img2) {
-  const uint32_t c_w =
+  uint32_t l_w = img1->d_w;
+  uint32_t c_w =
       (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
   const uint32_t c_h =
       (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
@@ -549,11 +676,17 @@
   match &= (img1->fmt == img2->fmt);
   match &= (img1->d_w == img2->d_w);
   match &= (img1->d_h == img2->d_h);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
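+  // The memcmp() calls below compare bytes, so double the row widths when
+  // the planes hold 16-bit samples.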
+  if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    l_w *= 2;
+    c_w *= 2;
+  }
+#endif
 
   for (i = 0; i < img1->d_h; ++i)
     match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
                      img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
-                     img1->d_w) == 0);
+                     l_w) == 0);
 
   for (i = 0; i < c_h; ++i)
     match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
@@ -598,6 +731,10 @@
   int                       arg_ctrl_cnt;
   int                       write_webm;
   int                       have_kf_max_dist;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  // Whether to use 16-bit internal buffers.
+  int                       use_16bit_internal;
+#endif
 };
 
 
@@ -737,8 +874,9 @@
 #if CONFIG_VP9_ENCODER
     // Make default VP9 passes = 2 until there is a better quality 1-pass
     // encoder
-    global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
-                      global->deadline != VPX_DL_REALTIME) ? 2 : 1;
+    if (global->codec != NULL && global->codec->name != NULL)
+      global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
+                        global->deadline != VPX_DL_REALTIME) ? 2 : 1;
 #else
     global->passes = 1;
 #endif
@@ -806,8 +944,10 @@
   struct stream_state *stream;
 
   stream = calloc(1, sizeof(*stream));
-  if (!stream)
+  if (stream == NULL) {
     fatal("Failed to allocate new stream.");
+  }
+
   if (prev) {
     memcpy(stream, prev, sizeof(*stream));
     stream->index++;
@@ -867,6 +1007,9 @@
   static const int        *ctrl_args_map = NULL;
   struct stream_config    *config = &stream->config;
   int                      eos_mark_found = 0;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  int                      test_16bit_internal = 0;
+#endif
 
   // Handle codec specific options
   if (0) {
@@ -915,6 +1058,12 @@
       config->cfg.g_w = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &height, argi)) {
       config->cfg.g_h = arg_parse_uint(&arg);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &bitdeptharg, argi)) {
+      config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg);
+    } else if (arg_match(&arg, &inbitdeptharg, argi)) {
+      config->cfg.g_input_bit_depth = arg_parse_uint(&arg);
+#endif
 #if CONFIG_WEBM_IO
     } else if (arg_match(&arg, &stereo_mode, argi)) {
       config->stereo_fmt = arg_parse_enum_or_int(&arg);
@@ -982,6 +1131,12 @@
       config->have_kf_max_dist = 1;
     } else if (arg_match(&arg, &kf_disabled, argi)) {
       config->cfg.kf_mode = VPX_KF_DISABLED;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
+      if (strcmp(global->codec->name, "vp9") == 0) {
+        test_16bit_internal = 1;
+      }
+#endif
     } else {
       int i, match = 0;
       for (i = 0; ctrl_args[i]; i++) {
@@ -993,12 +1148,13 @@
           * instance of this control.
           */
           for (j = 0; j < config->arg_ctrl_cnt; j++)
-            if (config->arg_ctrls[j][0] == ctrl_args_map[i])
+            if (ctrl_args_map != NULL &&
+                config->arg_ctrls[j][0] == ctrl_args_map[i])
               break;
 
           /* Update/insert */
           assert(j < (int)ARG_CTRL_CNT_MAX);
-          if (j < (int)ARG_CTRL_CNT_MAX) {
+          if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) {
             config->arg_ctrls[j][0] = ctrl_args_map[i];
             config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg);
             if (j == config->arg_ctrl_cnt)
@@ -1011,6 +1167,12 @@
         argj++;
     }
   }
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  if (strcmp(global->codec->name, "vp9") == 0) {
+    config->use_16bit_internal = test_16bit_internal |
+                                 (config->cfg.g_profile > 1);
+  }
+#endif
   return eos_mark_found;
 }
 
@@ -1038,6 +1200,14 @@
           experimental_bitstream.long_name);
   }
 
+  // Check that the codec bit depth is no smaller than the input bit depth.
+  if (stream->config.cfg.g_input_bit_depth >
+      (unsigned int)stream->config.cfg.g_bit_depth) {
+    fatal("Stream %d: codec bit depth (%d) less than input bit depth (%d)",
+          stream->index, (int)stream->config.cfg.g_bit_depth,
+          stream->config.cfg.g_input_bit_depth);
+  }
+
   for (streami = stream; streami; streami = streami->next) {
     /* All streams require output files */
     if (!streami->config.out_fn)
@@ -1146,6 +1316,8 @@
   SHOW(g_profile);
   SHOW(g_w);
   SHOW(g_h);
+  SHOW(g_bit_depth);
+  SHOW(g_input_bit_depth);
   SHOW(g_timebase.num);
   SHOW(g_timebase.den);
   SHOW(g_error_resilient);
@@ -1278,6 +1450,9 @@
 
   flags |= global->show_psnr ? VPX_CODEC_USE_PSNR : 0;
   flags |= global->out_part ? VPX_CODEC_USE_OUTPUT_PARTITION : 0;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  flags |= stream->config.use_16bit_internal ? VPX_CODEC_USE_HIGHBITDEPTH : 0;
+#endif
 
   /* Construct Encoder Context */
   vpx_codec_enc_init(&stream->encoder, global->codec->codec_interface(),
@@ -1323,6 +1498,46 @@
                      / cfg->g_timebase.num / global->framerate.num;
 
   /* Scale if necessary */
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  if (img) {
+    if ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) &&
+        (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+      if (img->fmt != VPX_IMG_FMT_I42016) {
+        fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name);
+        exit(EXIT_FAILURE);
+      }
+#if CONFIG_LIBYUV
+      if (!stream->img) {
+        stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I42016,
+                                    cfg->g_w, cfg->g_h, 16);
+      }
+      I420Scale_16((uint16*)img->planes[VPX_PLANE_Y],
+                   img->stride[VPX_PLANE_Y]/2,
+                   (uint16*)img->planes[VPX_PLANE_U],
+                   img->stride[VPX_PLANE_U]/2,
+                   (uint16*)img->planes[VPX_PLANE_V],
+                   img->stride[VPX_PLANE_V]/2,
+                   img->d_w, img->d_h,
+                   (uint16*)stream->img->planes[VPX_PLANE_Y],
+                   stream->img->stride[VPX_PLANE_Y]/2,
+                   (uint16*)stream->img->planes[VPX_PLANE_U],
+                   stream->img->stride[VPX_PLANE_U]/2,
+                   (uint16*)stream->img->planes[VPX_PLANE_V],
+                   stream->img->stride[VPX_PLANE_V]/2,
+                   stream->img->d_w, stream->img->d_h,
+                   kFilterBox);
+      img = stream->img;
+#else
+      stream->encoder.err = 1;
+      ctx_exit_on_error(&stream->encoder,
+                        "Stream %d: Failed to encode frame.\n"
+                        "Scaling disabled in this configuration.\n"
+                        "To enable, configure with --enable-libyuv\n",
+                        stream->index);
+#endif
+    }
+  }
+#endif
   if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
     if (img->fmt != VPX_IMG_FMT_I420 && img->fmt != VPX_IMG_FMT_YV12) {
       fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
@@ -1501,6 +1716,131 @@
   return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0);
 }
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
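+// Encoder-side shift helpers. Unlike the vpxdec variants, these operate on
+// the allocated dimensions (w/h) rather than the display size (d_w/d_h).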
+static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                             int input_shift) {
+  // Note the offset is 1 less than half
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+  int plane;
+  if (dst->w != src->w || dst->h != src->h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt || input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->w;
+    int h = src->h;
+    int x, y;
+    if (plane) {
+      w >>= src->x_chroma_shift;
+      h >>= src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+                                     y * src->stride[plane]);
+      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+                                     y * dst->stride[plane]);
+      for (x = 0; x < w; x++)
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+    }
+  }
+}
+
+static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                            int input_shift) {
+  // Note the offset is 1 less than half
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+  int plane;
+  if (dst->w != src->w || dst->h != src->h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+      input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->w;
+    int h = src->h;
+    int x, y;
+    if (plane) {
+      w >>= src->x_chroma_shift;
+      h >>= src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+                                     y * dst->stride[plane]);
+      for (x = 0; x < w; x++) {
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+      }
+    }
+  }
+}
+
+static void img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                        int input_shift) {
+  if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    high_img_upshift(dst, src, input_shift);
+  } else {
+    low_img_upshift(dst, src, input_shift);
+  }
+}
+
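+// Casts 16-bit samples down to 8 bits by plain truncation; callers are
+// expected to pass images whose values already fit in 8 bits.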
+static void img_cast_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
+  int plane;
+  if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt ||
+      dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift) {
+    fatal("Unsupported image conversion");
+  }
+  switch (dst->fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w >>= src->x_chroma_shift;
+      h >>= src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+                                     y * src->stride[plane]);
+      uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+      for (x = 0; x < w; x++) {
+        *p_dst++ = *p_src++;
+      }
+    }
+  }
+}
+#endif
 
 static void test_decode(struct stream_state  *stream,
                         enum TestDecodeFatality fatal,
@@ -1527,20 +1867,44 @@
     vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc);
     vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec);
   } else {
-    struct vp9_ref_frame ref;
+    struct vp9_ref_frame ref_enc, ref_dec;
 
-    ref.idx = 0;
-    vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref);
-    enc_img = ref.img;
-    vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref);
-    dec_img = ref.img;
+    ref_enc.idx = 0;
+    ref_dec.idx = 0;
+    vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref_enc);
+    enc_img = ref_enc.img;
+    vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref_dec);
+    dec_img = ref_dec.img;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
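+    // If only one of the two reference images is high bitdepth, cast the
+    // 16-bit image down to 8 bits so the planes can be compared directly.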
+    if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) !=
+        (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) {
+      if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+        vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+                      enc_img.d_w, enc_img.d_h, 16);
+        img_cast_16_to_8(&enc_img, &ref_enc.img);
+      }
+      if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+        vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+                      dec_img.d_w, dec_img.d_h, 16);
+        img_cast_16_to_8(&dec_img, &ref_dec.img);
+      }
+    }
+#endif
   }
   ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
   ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
 
   if (!compare_img(&enc_img, &dec_img)) {
     int y[4], u[4], v[4];
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+    if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+      find_mismatch_high(&enc_img, &dec_img, y, u, v);
+    } else {
+      find_mismatch(&enc_img, &dec_img, y, u, v);
+    }
+#else
     find_mismatch(&enc_img, &dec_img, y, u, v);
+#endif
     stream->decoder.err = 1;
     warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
                           "Stream %d: Encode/decode mismatch on frame %d at"
@@ -1582,6 +1946,12 @@
 int main(int argc, const char **argv_) {
   int pass;
   vpx_image_t raw;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  vpx_image_t raw_shift;
+  int allocated_raw_shift = 0;
+  int use_16bit_internal = 0;
+  int input_shift = 0;
+#endif
   int frame_avail, got_data;
 
   struct VpxInputContext input;
@@ -1683,6 +2053,27 @@
     if (!input.width || !input.height)
       fatal("Specify stream dimensions with --width (-w) "
             " and --height (-h)");
+
+    /* If the input file does not specify a bit depth but the input-bit-depth
+     * parameter is given, assume that is the input bit depth. If the
+     * input-bit-depth parameter is absent, assume the input bit depth
+     * matches the codec bit depth.
+     */
+    if (!input.bit_depth) {
+      FOREACH_STREAM({
+        if (stream->config.cfg.g_input_bit_depth)
+          input.bit_depth = stream->config.cfg.g_input_bit_depth;
+        else
+          input.bit_depth = stream->config.cfg.g_input_bit_depth =
+              (int)stream->config.cfg.g_bit_depth;
+      });
+      if (input.bit_depth > 8) input.fmt |= VPX_IMG_FMT_HIGHBITDEPTH;
+    } else {
+      FOREACH_STREAM({
+        stream->config.cfg.g_input_bit_depth = input.bit_depth;
+      });
+    }
+
     FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
     FOREACH_STREAM(validate_stream_config(stream, &global));
 
@@ -1736,6 +2127,25 @@
     FOREACH_STREAM(open_output_file(stream, &global));
     FOREACH_STREAM(initialize_encoder(stream, &global));
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+    if (strcmp(global.codec->name, "vp9") == 0) {
+      // Check whether at least one stream uses 16-bit internal buffers.
+      // Currently assumes that the bit depths of all streams using
+      // highbitdepth are the same.
+      FOREACH_STREAM({
+        if (stream->config.use_16bit_internal) {
+          use_16bit_internal = 1;
+        }
+        if (stream->config.cfg.g_profile == 0) {
+          input_shift = 0;
+        } else {
+          input_shift = (int)stream->config.cfg.g_bit_depth -
+              stream->config.cfg.g_input_bit_depth;
+        }
+      });
+    }
+#endif
+
     frame_avail = 1;
     got_data = 0;
 
@@ -1773,10 +2183,45 @@
         frame_avail = 0;
 
       if (frames_in > global.skip_frames) {
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+        vpx_image_t *frame_to_encode;
+        if (input_shift || (use_16bit_internal && input.bit_depth == 8)) {
+          assert(use_16bit_internal);
+          // Input bit depth and stream bit depth do not match, so
+          // upshift the frame to the stream bit depth.
+          if (!allocated_raw_shift) {
+            vpx_img_alloc(&raw_shift, raw.fmt | VPX_IMG_FMT_HIGHBITDEPTH,
+                          input.width, input.height, 32);
+            allocated_raw_shift = 1;
+          }
+          img_upshift(&raw_shift, &raw, input_shift);
+          frame_to_encode = &raw_shift;
+        } else {
+          frame_to_encode = &raw;
+        }
+        vpx_usec_timer_start(&timer);
+        if (use_16bit_internal) {
+          assert(frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH);
+          FOREACH_STREAM({
+            if (stream->config.use_16bit_internal)
+              encode_frame(stream, &global,
+                           frame_avail ? frame_to_encode : NULL,
+                           frames_in);
+            else
+              assert(0);
+          });
+        } else {
+          assert((frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH) == 0);
+          FOREACH_STREAM(encode_frame(stream, &global,
+                                      frame_avail ? frame_to_encode : NULL,
+                                      frames_in));
+        }
+#else
         vpx_usec_timer_start(&timer);
         FOREACH_STREAM(encode_frame(stream, &global,
                                     frame_avail ? &raw : NULL,
                                     frames_in));
+#endif
         vpx_usec_timer_mark(&timer);
         cx_time += vpx_usec_timer_elapsed(&timer);
 
@@ -1785,7 +2230,8 @@
         got_data = 0;
         FOREACH_STREAM(get_cx_data(stream, &global, &got_data));
 
-        if (!got_data && input.length && !streams->frames_out) {
+        if (!got_data && input.length && streams != NULL &&
+            !streams->frames_out) {
           lagged_count = global.limit ? seen_frames : ftello(input.file);
         } else if (input.length) {
           int64_t remaining;
@@ -1893,6 +2339,10 @@
     });
 #endif
 
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+  if (allocated_raw_shift)
+    vpx_img_free(&raw_shift);
+#endif
   vpx_img_free(&raw);
   free(argv);
   free(streams);
diff --git a/y4minput.c b/y4minput.c
index 520c332..34ea96d 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -700,7 +700,7 @@
 
 int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
                    int only_420) {
-  char buffer[80];
+  char buffer[80] = {0};
   int  ret;
   int  i;
   /*Read until newline, or 80 cols, whichever happens first.*/
@@ -978,7 +978,9 @@
     _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
   else
     _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
-  _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
+
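+  /* Only allocate the aux buffer when the conversion actually needs one. */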
+  if (_y4m->aux_buf_sz > 0)
+    _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
   return 0;
 }
 
@@ -1039,12 +1041,12 @@
   c_w *= bytes_per_sample;
   c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
   c_sz = c_w * c_h;
-  _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] =
+  _img->stride[VPX_PLANE_Y] = _img->stride[VPX_PLANE_ALPHA] =
       _y4m->pic_w * bytes_per_sample;
-  _img->stride[PLANE_U] = _img->stride[PLANE_V] = c_w;
-  _img->planes[PLANE_Y] = _y4m->dst_buf;
-  _img->planes[PLANE_U] = _y4m->dst_buf + pic_sz;
-  _img->planes[PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
-  _img->planes[PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
+  _img->stride[VPX_PLANE_U] = _img->stride[VPX_PLANE_V] = c_w;
+  _img->planes[VPX_PLANE_Y] = _y4m->dst_buf;
+  _img->planes[VPX_PLANE_U] = _y4m->dst_buf + pic_sz;
+  _img->planes[VPX_PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  _img->planes[VPX_PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
   return 1;
 }