Merge "Set scaled img format correctly in vpxdec"

diff --git a/examples.mk b/examples.mk
index 91bd45a..537d8ff 100644
--- a/examples.mk
+++ b/examples.mk

@@ -31,6 +31,7 @@
                 third_party/libyuv/source/scale_common.cc \
                 third_party/libyuv/source/scale_mips.cc \
                 third_party/libyuv/source/scale_neon.cc \
+                third_party/libyuv/source/scale_neon64.cc \
                 third_party/libyuv/source/scale_posix.cc \
                 third_party/libyuv/source/scale_win.cc \
 

diff --git a/examples/set_maps.c b/examples/set_maps.c
index ff60d51..2ee5bca 100644
--- a/examples/set_maps.c
+++ b/examples/set_maps.c

@@ -42,6 +42,7 @@
 // Use the `simple_decoder` example to decode this sample, and observe
 // the change in the image at frames 22, 33, and 44.
 
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -177,9 +178,10 @@
   memset(&info, 0, sizeof(info));
 
   encoder = get_vpx_encoder_by_name(argv[1]);
-  if (!encoder)
+  if (encoder == NULL) {
     die("Unsupported codec.");
-
+  }
+  assert(encoder != NULL);
   info.codec_fourcc = encoder->fourcc;
   info.frame_width = strtol(argv[2], NULL, 0);
   info.frame_height = strtol(argv[3], NULL, 0);

diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index 369b1d8..76d5a28 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c

@@ -66,13 +66,14 @@
   exit(EXIT_FAILURE);
 }
 
-static void get_frame_stats(vpx_codec_ctx_t *ctx,
-                            const vpx_image_t *img,
-                            vpx_codec_pts_t pts,
-                            unsigned int duration,
-                            vpx_enc_frame_flags_t flags,
-                            unsigned int deadline,
-                            vpx_fixed_buf_t *stats) {
+static int get_frame_stats(vpx_codec_ctx_t *ctx,
+                           const vpx_image_t *img,
+                           vpx_codec_pts_t pts,
+                           unsigned int duration,
+                           vpx_enc_frame_flags_t flags,
+                           unsigned int deadline,
+                           vpx_fixed_buf_t *stats) {
+  int got_pkts = 0;
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
   const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
@@ -81,6 +82,8 @@
     die_codec(ctx, "Failed to get frame stats.");
 
   while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+
     if (pkt->kind == VPX_CODEC_STATS_PKT) {
       const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
       const size_t pkt_size = pkt->data.twopass_stats.sz;
@@ -89,15 +92,18 @@
       stats->sz += pkt_size;
     }
   }
+
+  return got_pkts;
 }
 
-static void encode_frame(vpx_codec_ctx_t *ctx,
-                         const vpx_image_t *img,
-                         vpx_codec_pts_t pts,
-                         unsigned int duration,
-                         vpx_enc_frame_flags_t flags,
-                         unsigned int deadline,
-                         VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *ctx,
+                        const vpx_image_t *img,
+                        vpx_codec_pts_t pts,
+                        unsigned int duration,
+                        vpx_enc_frame_flags_t flags,
+                        unsigned int deadline,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
   const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
@@ -106,6 +112,7 @@
     die_codec(ctx, "Failed to encode frame.");
 
   while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
     if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
       const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
 
@@ -117,19 +124,90 @@
       fflush(stdout);
     }
   }
+
+  return got_pkts;
+}
+
+static vpx_fixed_buf_t pass0(vpx_image_t *raw,
+                             FILE *infile,
+                             const VpxInterface *encoder,
+                             const vpx_codec_enc_cfg_t *cfg) {
+  vpx_codec_ctx_t codec;
+  int frame_count = 0;
+  vpx_fixed_buf_t stats = {NULL, 0};
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Calculate frame statistics.
+  while (vpx_img_read(raw, infile)) {
+    ++frame_count;
+    get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
+                    &stats);
+  }
+
+  // Flush encoder.
+  while (get_frame_stats(&codec, NULL, frame_count, 1, 0,
+                         VPX_DL_BEST_QUALITY, &stats)) {}
+
+  printf("Pass 0 complete. Processed %d frames.\n", frame_count);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  return stats;
+}
+
+static void pass1(vpx_image_t *raw,
+                  FILE *infile,
+                  const char *outfile_name,
+                  const VpxInterface *encoder,
+                  const vpx_codec_enc_cfg_t *cfg) {
+  VpxVideoInfo info = {
+    encoder->fourcc,
+    cfg->g_w,
+    cfg->g_h,
+    {cfg->g_timebase.num, cfg->g_timebase.den}
+  };
+  VpxVideoWriter *writer = NULL;
+  vpx_codec_ctx_t codec;
+  int frame_count = 0;
+
+  writer = vpx_video_writer_open(outfile_name, kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing", outfile_name);
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Encode frames.
+  while (vpx_img_read(raw, infile)) {
+    ++frame_count;
+    encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_BEST_QUALITY, writer)) {}
+
+  printf("\n");
+
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_writer_close(writer);
+
+  printf("Pass 1 complete. Processed %d frames.\n", frame_count);
 }
 
 int main(int argc, char **argv) {
   FILE *infile = NULL;
-  VpxVideoWriter *writer = NULL;
+  int w, h;
   vpx_codec_ctx_t codec;
   vpx_codec_enc_cfg_t cfg;
   vpx_image_t raw;
   vpx_codec_err_t res;
-  vpx_fixed_buf_t stats = {0};
-  VpxVideoInfo info = {0};
+  vpx_fixed_buf_t stats;
+
   const VpxInterface *encoder = NULL;
-  int pass;
   const int fps = 30;        // TODO(dkovalev) add command line argument
   const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
   const char *const codec_arg = argv[1];
@@ -146,85 +224,44 @@
   if (!encoder)
     die("Unsupported codec.");
 
-  info.codec_fourcc = encoder->fourcc;
-  info.time_base.numerator = 1;
-  info.time_base.denominator = fps;
-  info.frame_width = strtol(width_arg, NULL, 0);
-  info.frame_height = strtol(height_arg, NULL, 0);
+  w = strtol(width_arg, NULL, 0);
+  h = strtol(height_arg, NULL, 0);
 
-  if (info.frame_width <= 0 ||
-      info.frame_height <= 0 ||
-      (info.frame_width % 2) != 0 ||
-      (info.frame_height % 2) != 0) {
-    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
-  }
+  if (w  <= 0 || h <= 0 || (w % 2) != 0 || (h  % 2) != 0)
+    die("Invalid frame size: %dx%d", w, h);
 
-  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
-                                             info.frame_height, 1)) {
-    die("Failed to allocate image", info.frame_width, info.frame_height);
-  }
-
-  writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
-  if (!writer)
-    die("Failed to open %s for writing", outfile_arg);
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1))
+    die("Failed to allocate image", w, h);
 
   printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
 
+  // Configuration
   res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
   if (res)
     die_codec(&codec, "Failed to get default codec config.");
 
-  cfg.g_w = info.frame_width;
-  cfg.g_h = info.frame_height;
-  cfg.g_timebase.num = info.time_base.numerator;
-  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.g_w = w;
+  cfg.g_h = h;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = fps;
   cfg.rc_target_bitrate = bitrate;
 
-  for (pass = 0; pass < 2; ++pass) {
-    int frame_count = 0;
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading", infile_arg);
 
-    if (pass == 0) {
-      cfg.g_pass = VPX_RC_FIRST_PASS;
-    } else {
-      cfg.g_pass = VPX_RC_LAST_PASS;
-      cfg.rc_twopass_stats_in = stats;
-    }
+  // Pass 0
+  cfg.g_pass = VPX_RC_FIRST_PASS;
+  stats = pass0(&raw, infile, encoder, &cfg);
 
-    if (!(infile = fopen(infile_arg, "rb")))
-      die("Failed to open %s for reading", infile_arg);
-
-    if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-      die_codec(&codec, "Failed to initialize encoder");
-
-    while (vpx_img_read(&raw, infile)) {
-      ++frame_count;
-
-      if (pass == 0) {
-        get_frame_stats(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
-                        &stats);
-      } else {
-        encode_frame(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
-                     writer);
-      }
-    }
-
-    if (pass == 0) {
-      get_frame_stats(&codec, NULL, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
-                      &stats);
-    } else {
-      printf("\n");
-    }
-
-    fclose(infile);
-    printf("Pass %d complete. Processed %d frames.\n", pass + 1, frame_count);
-    if (vpx_codec_destroy(&codec))
-      die_codec(&codec, "Failed to destroy codec.");
-  }
-
-  vpx_img_free(&raw);
+  // Pass 1
+  rewind(infile);
+  cfg.g_pass = VPX_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in = stats;
+  pass1(&raw, infile, outfile_arg, encoder, &cfg);
   free(stats.buf);
 
-  vpx_video_writer_close(writer);
+  vpx_img_free(&raw);
+  fclose(infile);
 
   return EXIT_SUCCESS;
 }

diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index 4ec1848..5eac92c 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c

@@ -12,6 +12,7 @@
 //  encoding scheme based on temporal scalability for video applications
 //  that benefit from a scalable bitstream.
 
+#include <assert.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -438,7 +439,7 @@
 }
 
 int main(int argc, char **argv) {
-  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS];
+  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
   vpx_codec_ctx_t codec;
   vpx_codec_enc_cfg_t cfg;
   int frame_cnt = 0;
@@ -456,7 +457,6 @@
   int layering_mode = 0;
   int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
   int flag_periodicity = 1;
-  int max_intra_size_pct;
   vpx_svc_layer_id_t layer_id = {0, 0};
   const VpxInterface *encoder = NULL;
   FILE *infile = NULL;
@@ -570,6 +570,8 @@
     outfile[i] = vpx_video_writer_open(file_name, kContainerIVF, &info);
     if (!outfile[i])
       die("Failed to open %s for writing", file_name);
+
+    assert(outfile[i] != NULL);
   }
   // No spatial layers in this encoder.
   cfg.ss_number_layers = 1;
@@ -595,11 +597,11 @@
   // This controls the maximum target size of the key frame.
   // For generating smaller key frames, use a smaller max_intra_size_pct
   // value, like 100 or 200.
-  max_intra_size_pct = (int) (((double)cfg.rc_buf_optimal_sz * 0.5)
-      * ((double) cfg.g_timebase.den / cfg.g_timebase.num) / 10.0);
-  // For low-quality key frame.
-  max_intra_size_pct = 200;
-  vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct);
+  {
+    const int max_intra_size_pct = 200;
+    vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+                      max_intra_size_pct);
+  }
 
   frame_avail = 1;
   while (frame_avail || got_data) {

diff --git a/libs.mk b/libs.mk
index 25fbc2c..c7c2748 100644
--- a/libs.mk
+++ b/libs.mk

@@ -133,6 +133,8 @@
   CODEC_DOC_SECTIONS += vp9 vp9_decoder
 endif
 
+VP9_PREFIX=vp9/
+$(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
 
 ifeq ($(CONFIG_ENCODERS),yes)
   CODEC_DOC_SECTIONS += encoder

diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index a9bb540..0221995 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc

@@ -38,7 +38,7 @@
     if (video->frame() == 1) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
     } else if (video->frame() == 3) {
-      vpx_active_map_t map = {0};
+      vpx_active_map_t map = vpx_active_map_t();
       uint8_t active_map[9 * 13] = {
         1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
         1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
@@ -57,7 +57,7 @@
       map.active_map = active_map;
       encoder->Control(VP8E_SET_ACTIVEMAP, &map);
     } else if (video->frame() == 15) {
-      vpx_active_map_t map = {0};
+      vpx_active_map_t map = vpx_active_map_t();
       map.cols = (kWidth + 15) / 16;
       map.rows = (kHeight + 15) / 16;
       map.active_map = NULL;

diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 5b4a20e..1724db3 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc

@@ -646,26 +646,6 @@
 #endif
 
 #if HAVE_AVX2
-// TODO(jzern): these prototypes can be removed after the avx2 versions are
-// reenabled in vp9_rtcd_defs.pl.
-extern "C" {
-void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4,
-                             int w, int h);
-void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h);
-void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                        uint8_t *dst, ptrdiff_t dst_stride,
-                        const int16_t *filter_x, int x_step_q4,
-                        const int16_t *filter_y, int y_step_q4,
-                        int w, int h);
-}
-
 const ConvolveFunctions convolve8_avx2(
     vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
     vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
@@ -676,9 +656,7 @@
     make_tuple(8, 4, &convolve8_avx2),
     make_tuple(4, 8, &convolve8_avx2),
     make_tuple(8, 8, &convolve8_avx2),
-    make_tuple(8, 16, &convolve8_avx2)));
-
-INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, ConvolveTest, ::testing::Values(
+    make_tuple(8, 16, &convolve8_avx2),
     make_tuple(16, 8, &convolve8_avx2),
     make_tuple(16, 16, &convolve8_avx2),
     make_tuple(32, 16, &convolve8_avx2),

diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 8dcf26c..a8d8d42 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc

@@ -41,7 +41,7 @@
   }
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+                                  ::libvpx_test::Encoder* /*encoder*/) {
     const vpx_rational_t tb = video->timebase();
     timebase_ = static_cast<double>(tb.num) / tb.den;
     duration_ = 0;

diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index ee417ce..c38cc2e 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc

@@ -268,11 +268,13 @@
 typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct16x16Param;
 typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht16x16Param;
 
-void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct16x16_ref(const int16_t *in, int16_t *out, int stride,
+                   int /*tx_type*/) {
   vp9_fdct16x16_c(in, out, stride);
 }
 
-void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride,
+                   int /*tx_type*/) {
   vp9_idct16x16_256_add_c(in, dest, stride);
 }
 

diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 4f34a44..d2d437c 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc

@@ -37,7 +37,7 @@
 
 const int kNumCoeffs = 1024;
 const double kPi = 3.141592653589793238462643383279502884;
-void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
+void reference_32x32_dct_1d(const double in[32], double out[32]) {
   const double kInvSqrt2 = 0.707106781186547524400844362104;
   for (int k = 0; k < 32; k++) {
     out[k] = 0.0;
@@ -55,7 +55,7 @@
     double temp_in[32], temp_out[32];
     for (int j = 0; j < 32; ++j)
       temp_in[j] = input[j*32 + i];
-    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    reference_32x32_dct_1d(temp_in, temp_out);
     for (int j = 0; j < 32; ++j)
       output[j * 32 + i] = temp_out[j];
   }
@@ -64,7 +64,7 @@
     double temp_in[32], temp_out[32];
     for (int j = 0; j < 32; ++j)
       temp_in[j] = output[j + i*32];
-    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    reference_32x32_dct_1d(temp_in, temp_out);
     // Scale by some magic number
     for (int j = 0; j < 32; ++j)
       output[j + i * 32] = temp_out[j] / 4;

diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 11529b3..5a71140 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc

@@ -74,7 +74,7 @@
   libvpx_test::WebMVideoSource video(video_name);
   video.Init();
 
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   cfg.threads = threads;
   libvpx_test::VP9Decoder decoder(cfg, 0);
 

diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 99610eb..0ef4f7b 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc

@@ -106,7 +106,7 @@
 }
 
 void DecoderTest::RunLoop(CompressedVideoSource *video) {
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
   RunLoop(video, dec_cfg);
 }
 

diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 1f73c7d..a757b59 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h

@@ -125,20 +125,20 @@
                        const vpx_codec_dec_cfg_t &dec_cfg);
 
   // Hook to be called before decompressing every frame.
-  virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
-                                  Decoder *decoder) {}
+  virtual void PreDecodeFrameHook(const CompressedVideoSource& /*video*/,
+                                  Decoder* /*decoder*/) {}
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const CompressedVideoSource& /* video */,
+                                  const CompressedVideoSource& /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
     return VPX_CODEC_OK == res_dec;
   }
 
   // Hook to be called on every decompressed frame.
-  virtual void DecompressedFrameHook(const vpx_image_t& img,
-                                     const unsigned int frame_number) {}
+  virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+                                     const unsigned int /*frame_number*/) {}
 
   // Hook to be called on peek result
   virtual void HandlePeekResult(Decoder* const decoder,

diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 6d4281d..9702ddf 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc

@@ -133,13 +133,13 @@
   return match;
 }
 
-void EncoderTest::MismatchHook(const vpx_image_t *img1,
-                               const vpx_image_t *img2) {
+void EncoderTest::MismatchHook(const vpx_image_t* /*img1*/,
+                               const vpx_image_t* /*img2*/) {
   ASSERT_TRUE(0) << "Encode/Decode mismatch found";
 }
 
 void EncoderTest::RunLoop(VideoSource *video) {
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
 
   stats_.Reset();
 

diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 2270ce2..a77bd64 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h

@@ -189,20 +189,21 @@
   virtual void RunLoop(VideoSource *video);
 
   // Hook to be called at the beginning of a pass.
-  virtual void BeginPassHook(unsigned int pass) {}
+  virtual void BeginPassHook(unsigned int /*pass*/) {}
 
   // Hook to be called at the end of a pass.
   virtual void EndPassHook() {}
 
   // Hook to be called before encoding a frame.
-  virtual void PreEncodeFrameHook(VideoSource *video) {}
-  virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {}
+  virtual void PreEncodeFrameHook(VideoSource* /*video*/) {}
+  virtual void PreEncodeFrameHook(VideoSource* /*video*/,
+                                  Encoder* /*encoder*/) {}
 
   // Hook to be called on every compressed data packet.
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {}
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
 
   // Hook to be called on every PSNR packet.
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
 
   // Hook to determine whether the encode loop should continue.
   virtual bool Continue() const {
@@ -218,19 +219,19 @@
                             const vpx_image_t *img2);
 
   // Hook to be called on every decompressed frame.
-  virtual void DecompressedFrameHook(const vpx_image_t& img,
-                                     vpx_codec_pts_t pts) {}
+  virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+                                     vpx_codec_pts_t /*pts*/) {}
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const VideoSource& /* video */,
+                                  const VideoSource& /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
     return VPX_CODEC_OK == res_dec;
   }
 
   // Hook that can modify the encoder's output data
-  virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook(
+  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
       const vpx_codec_cx_pkt_t *pkt) {
     return pkt;
   }

diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index fb0449d..44eba33 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc

@@ -285,7 +285,7 @@
     video_->Init();
     video_->Begin();
 
-    vpx_codec_dec_cfg_t cfg = {0};
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
     ASSERT_TRUE(decoder_ != NULL);
   }

diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 7c48260..08a69ab 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc

@@ -40,7 +40,7 @@
 typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct4x4Param;
 typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht4x4Param;
 
-void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {
   vp9_fdct4x4_c(in, out, stride);
 }
 
@@ -48,7 +48,7 @@
   vp9_fht4x4_c(in, out, stride, tx_type);
 }
 
-void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {
   vp9_fwht4x4_c(in, out, stride);
 }
 

diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 567e5f6..a694f0c 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc

@@ -39,7 +39,7 @@
 typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct8x8Param;
 typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht8x8Param;
 
-void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {
   vp9_fdct8x8_c(in, out, stride);
 }
 

diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index db27975..1c9a522 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc

@@ -27,7 +27,7 @@
   }
 
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource &video,
+                                  const libvpx_test::VideoSource& /*video*/,
                                   libvpx_test::Decoder *decoder) {
     EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
     return !::testing::Test::HasFailure();

diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 0a1c17c..34d6236 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc

@@ -73,7 +73,7 @@
   void RunTest() {
     const DecodeParam input = GET_PARAM(1);
     libvpx_test::CompressedVideoSource *video = NULL;
-    vpx_codec_dec_cfg_t cfg = {0};
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     cfg.threads = input.threads;
     const std::string filename = input.filename;
 
@@ -126,9 +126,9 @@
 class InvalidFileInvalidPeekTest : public InvalidFileTest {
  protected:
   InvalidFileInvalidPeekTest() : InvalidFileTest() {}
-  virtual void HandlePeekResult(libvpx_test::Decoder *const decoder,
-                                libvpx_test::CompressedVideoSource *video,
-                                const vpx_codec_err_t res_peek) {}
+  virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
+                                libvpx_test::CompressedVideoSource* /*video*/,
+                                const vpx_codec_err_t /*res_peek*/) {}
 };
 
 TEST_P(InvalidFileInvalidPeekTest, ReturnCode) {

diff --git a/test/resize_test.cc b/test/resize_test.cc
index 8d08f1e..9d0c570 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc

@@ -211,8 +211,8 @@
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
 #if WRITE_COMPRESSED_STREAM
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -222,8 +222,8 @@
     // Write frame header and data.
     write_ivf_frame_header(pkt, outfile_);
     (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
-#endif
   }
+#endif
 
   double frame0_psnr_;
 #if WRITE_COMPRESSED_STREAM

diff --git a/test/svc_test.cc b/test/svc_test.cc
index 1cb01a4..e219488 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc

@@ -60,7 +60,7 @@
     codec_enc_.kf_min_dist = 100;
     codec_enc_.kf_max_dist = 100;
 
-    vpx_codec_dec_cfg_t dec_cfg = {0};
+    vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
     VP9CodecFactory codec_factory;
     decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
   }
@@ -112,7 +112,7 @@
       video.Next();
     }
 
-    // Flush encoder and test EOS packet
+    // Flush encoder and test EOS packet.
     res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
                          video.duration(), VPX_DL_GOOD_QUALITY);
     stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
@@ -135,7 +135,7 @@
         EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));
       }
 
-      outputs[*frame_received].buf = malloc(frame_size);
+      outputs[*frame_received].buf = malloc(frame_size + 16);
       ASSERT_TRUE(outputs[*frame_received].buf != NULL);
       memcpy(outputs[*frame_received].buf, vpx_svc_get_buffer(&svc_),
              frame_size);
@@ -176,13 +176,13 @@
       video.Next();
     }
 
-    // Flush Encoder
+    // Flush encoder.
     res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
                          video.duration(), VPX_DL_GOOD_QUALITY);
     EXPECT_EQ(VPX_CODEC_OK, res);
     StoreFrames(n, outputs, &frame_received);
 
-    EXPECT_EQ(frame_received, (size_t)n);
+    EXPECT_EQ(frame_received, static_cast<size_t>(n));
 
     ReleaseEncoder();
   }
@@ -204,7 +204,7 @@
       ++decoded_frames;
 
       DxDataIterator dec_iter = decoder_->GetDxData();
-      while (dec_iter.Next()) {
+      while (dec_iter.Next() != NULL) {
         ++received_frames;
       }
     }
@@ -214,7 +214,8 @@
 
   void DropEnhancementLayers(struct vpx_fixed_buf *const inputs,
                              const int num_super_frames,
-                             const int remained_layers) {
+                             const int remained_layers,
+                             const bool is_multiple_frame_context) {
     ASSERT_TRUE(inputs != NULL);
     ASSERT_GT(num_super_frames, 0);
     ASSERT_GT(remained_layers, 0);
@@ -236,7 +237,7 @@
       uint8_t *frame_data = static_cast<uint8_t *>(inputs[i].buf);
       uint8_t *frame_start = frame_data;
       for (frame = 0; frame < frame_count; ++frame) {
-        // Looking for a visible frame
+        // Looking for a visible frame.
         if (frame_data[0] & 0x02) {
           ++frames_found;
           if (frames_found == remained_layers)
@@ -244,11 +245,17 @@
         }
         frame_data += frame_sizes[frame];
       }
-      ASSERT_LT(frame, frame_count);
-      if (frame == frame_count - 1)
+      ASSERT_LT(frame, frame_count) << "Couldn't find a visible frame. "
+          << "remaining_layers: " << remained_layers
+          << "    super_frame: " << i
+          << "    is_multiple_frame_context: " << is_multiple_frame_context;
+      if (frame == frame_count - 1 && !is_multiple_frame_context)
         continue;
 
       frame_data += frame_sizes[frame];
+      // We need to add one more frame for multiple frame context.
+      if (is_multiple_frame_context)
+        ++frame;
       uint8_t marker =
           static_cast<const uint8_t *>(inputs[i].buf)[inputs[i].sz - 1];
       const uint32_t mag = ((marker >> 3) & 0x3) + 1;
@@ -256,11 +263,37 @@
       const size_t new_index_sz = 2 + mag * (frame + 1);
       marker &= 0x0f8;
       marker |= frame;
+
+      // Copy existing frame sizes.
+      memmove(frame_data + (is_multiple_frame_context ? 2 : 1),
+              frame_start + inputs[i].sz - index_sz + 1, new_index_sz - 2);
+      if (is_multiple_frame_context) {
+        // Add a one byte frame with flag show_existing frame.
+        *frame_data++ = 0x88 | (remained_layers - 1);
+      }
+      // New marker.
       frame_data[0] = marker;
-      memcpy(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1,
-             new_index_sz - 2);
-      frame_data[new_index_sz - 1] = marker;
-      inputs[i].sz = frame_data - frame_start + new_index_sz;
+      frame_data += (mag * (frame + 1) + 1);
+
+      if (is_multiple_frame_context) {
+        // Write the frame size for the one byte frame.
+        frame_data -= mag;
+        *frame_data++ = 1;
+        for (uint32_t j = 1; j < mag; ++j) {
+          *frame_data++ = 0;
+        }
+      }
+
+      *frame_data++ = marker;
+      inputs[i].sz = frame_data - frame_start;
+
+      if (is_multiple_frame_context) {
+        // Change the show frame flag to 0 for all frames.
+        for (int j = 0; j < frame; ++j) {
+          frame_start[0] &= ~2;
+          frame_start += frame_sizes[j];
+        }
+      }
     }
   }
 
@@ -507,7 +540,7 @@
   vpx_fixed_buf outputs[10];
   memset(&outputs[0], 0, sizeof(outputs));
   Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DropEnhancementLayers(&outputs[0], 10, 1);
+  DropEnhancementLayers(&outputs[0], 10, 1, false);
   DecodeNFrames(&outputs[0], 10);
   FreeBitstreamBuffers(&outputs[0], 10);
 }
@@ -525,13 +558,13 @@
   Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]);
 
   DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 4);
+  DropEnhancementLayers(&outputs[0], 10, 4, false);
   DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 3);
+  DropEnhancementLayers(&outputs[0], 10, 3, false);
   DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 2);
+  DropEnhancementLayers(&outputs[0], 10, 2, false);
   DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 1);
+  DropEnhancementLayers(&outputs[0], 10, 1, false);
   DecodeNFrames(&outputs[0], 10);
 
   FreeBitstreamBuffers(&outputs[0], 10);
@@ -568,12 +601,121 @@
   memset(&outputs[0], 0, sizeof(outputs));
   Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]);
   DecodeNFrames(&outputs[0], 20);
-  DropEnhancementLayers(&outputs[0], 20, 2);
+  DropEnhancementLayers(&outputs[0], 20, 2, false);
   DecodeNFrames(&outputs[0], 20);
-  DropEnhancementLayers(&outputs[0], 20, 1);
+  DropEnhancementLayers(&outputs[0], 20, 1, false);
   DecodeNFrames(&outputs[0], 20);
 
   FreeBitstreamBuffers(&outputs[0], 20);
 }
 
+TEST_F(SvcTest, SetMultipleFrameContextOption) {
+  svc_.spatial_layers = 5;
+  vpx_codec_err_t res =
+      vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  svc_.spatial_layers = 2;
+  res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
+  InitializeEncoder();
+}
+
+TEST_F(SvcTest, TwoPassEncode2LayersWithMultipleFrameContext) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropEnhancementLayers(&outputs[0], 10, 2, true);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2LayersWithMultipleFrameContextDecodeBaselayer) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropEnhancementLayers(&outputs[0], 10, 1, true);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContext) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropEnhancementLayers(&outputs[0], 10, 2, true);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode3SNRLayersWithMultipleFrameContextDecode321Layer) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+  Pass1EncodeNFrames(10, 3, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]);
+
+  vpx_fixed_buf outputs_new[10];
+  for (int i = 0; i < 10; ++i) {
+    outputs_new[i].buf = malloc(outputs[i].sz + 16);
+    ASSERT_TRUE(outputs_new[i].buf != NULL);
+    memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+    outputs_new[i].sz = outputs[i].sz;
+  }
+  DropEnhancementLayers(&outputs_new[0], 10, 3, true);
+  DecodeNFrames(&outputs_new[0], 10);
+
+  for (int i = 0; i < 10; ++i) {
+    memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+    outputs_new[i].sz = outputs[i].sz;
+  }
+  DropEnhancementLayers(&outputs_new[0], 10, 2, true);
+  DecodeNFrames(&outputs_new[0], 10);
+
+  for (int i = 0; i < 10; ++i) {
+    memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+    outputs_new[i].sz = outputs[i].sz;
+  }
+  DropEnhancementLayers(&outputs_new[0], 10, 1, true);
+  DecodeNFrames(&outputs_new[0], 10);
+
+  FreeBitstreamBuffers(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs_new[0], 10);
+}
+
 }  // namespace

diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index d714452..b9f879d 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc

@@ -29,7 +29,7 @@
         md5_inv_order_(),
         n_tiles_(GET_PARAM(1)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
-    vpx_codec_dec_cfg_t cfg;
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     cfg.w = 704;
     cfg.h = 144;
     cfg.threads = 1;

diff --git a/test/user_priv_test.cc b/test/user_priv_test.cc
index 22fce85..8512d88 100644
--- a/test/user_priv_test.cc
+++ b/test/user_priv_test.cc

@@ -47,7 +47,7 @@
   libvpx_test::WebMVideoSource video(filename);
   video.Init();
 
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   libvpx_test::VP9Decoder decoder(cfg, 0);
 
   libvpx_test::MD5 md5;

diff --git a/test/vp8_decrypt_test.cc b/test/vp8_decrypt_test.cc
index 470fdf1..972a1d9 100644
--- a/test/vp8_decrypt_test.cc
+++ b/test/vp8_decrypt_test.cc

@@ -47,7 +47,7 @@
   libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf");
   video.Init();
 
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
   VP8Decoder decoder(dec_cfg, 0);
 
   video.Begin();

diff --git a/test/vp9_decrypt_test.cc b/test/vp9_decrypt_test.cc
index 88a3c14..d988612 100644
--- a/test/vp9_decrypt_test.cc
+++ b/test/vp9_decrypt_test.cc

@@ -47,7 +47,7 @@
   libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf");
   video.Init();
 
-  vpx_codec_dec_cfg_t dec_cfg = {0};
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
   VP9Decoder decoder(dec_cfg, 0);
 
   video.Begin();

diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index d7fc4ee..cc35476 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc

@@ -163,7 +163,7 @@
   libvpx_test::WebMVideoSource video(filename);
   video.Init();
 
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   cfg.threads = num_threads;
   libvpx_test::VP9Decoder decoder(cfg, 0);
 

diff --git a/test/vpxenc.sh b/test/vpxenc.sh
index b6482c6..9674bdc 100755
--- a/test/vpxenc.sh
+++ b/test/vpxenc.sh

@@ -41,6 +41,40 @@
   fi
 }
 
+# Echo vpxenc command line parameters allowing use of
+# hantro_collage_w352h288.yuv as input.
+yuv_input_hantro_collage() {
+  echo ""${YUV_RAW_INPUT}"
+       --width="${YUV_RAW_INPUT_WIDTH}"
+       --height="${YUV_RAW_INPUT_HEIGHT}""
+}
+
+# Echo default vpxenc real time encoding params. $1 is the codec, which defaults
+# to vp8 if unspecified.
+vpxenc_rt_params() {
+  local readonly codec="${1:-vp8}"
+  echo "--codec=${codec}
+    --buf-initial-sz=500
+    --buf-optimal-sz=600
+    --buf-sz=1000
+    --cpu-used=-5
+    --end-usage=cbr
+    --error-resilient=1
+    --kf-max-dist=90000
+    --lag-in-frames=0
+    --max-intra-rate=300
+    --max-q=56
+    --min-q=2
+    --noise-sensitivity=0
+    --overshoot-pct=50
+    --passes=1
+    --profile=0
+    --resize-allowed=0
+    --rt
+    --static-thresh=0
+    --undershoot-pct=50"
+}
+
 # Wrapper function for running vpxenc with pipe input. Requires that
 # LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the
 # input file path and shifted away. All remaining parameters are passed through
@@ -59,9 +93,9 @@
 # shifted away. All remaining parameters are passed through to vpxenc.
 vpxenc() {
   local readonly encoder="$(vpx_tool_path vpxenc)"
-  local readonly input="${1}"
+  local readonly input="$1"
   shift
-  eval "${VPX_TEST_PREFIX}" "${encoder}" "$input" \
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \
     --test-decode=fatal \
     "$@" ${devnull}
 }
@@ -69,13 +103,11 @@
 vpxenc_vp8_ivf() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf"
-    vpxenc --codec=vp8 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
       --limit="${TEST_FRAMES}" \
       --ivf \
-      --output="${output}" \
-      "${YUV_RAW_INPUT}"
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -88,12 +120,10 @@
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
-    vpxenc --codec=vp8 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
       --limit="${TEST_FRAMES}" \
-      --output="${output}" \
-      "${YUV_RAW_INPUT}"
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -102,17 +132,29 @@
   fi
 }
 
+vpxenc_vp8_webm_rt() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      $(vpxenc_rt_params vp8) \
+      --output="${output}"
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
 vpxenc_vp8_webm_2pass() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
-    vpxenc --codec=vp8 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
       --limit="${TEST_FRAMES}" \
       --output="${output}" \
-      --passes=2 \
-      "${YUV_RAW_INPUT}"
+      --passes=2
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -127,15 +169,13 @@
     local readonly lag_total_frames=20
     local readonly lag_frames=10
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm"
-    vpxenc --codec=vp8 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
       --limit="${lag_total_frames}" \
       --lag-in-frames="${lag_frames}" \
       --output="${output}" \
       --auto-alt-ref=1 \
-      --passes=2 \
-      "${YUV_RAW_INPUT}"
+      --passes=2
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -147,14 +187,11 @@
 vpxenc_vp8_ivf_piped_input() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
-    cat "${YUV_RAW_INPUT}" \
-      | vpxenc --codec=vp8 \
-        --width="${YUV_RAW_INPUT_WIDTH}" \
-        --height="${YUV_RAW_INPUT_HEIGHT}" \
-        --limit="${TEST_FRAMES}" \
-        --ivf \
-        --output="${output}" \
-        -
+    vpxenc_pipe $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -166,13 +203,11 @@
 vpxenc_vp9_ivf() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
-      --output="${output}" \
-      "${YUV_RAW_INPUT}"
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -185,12 +220,25 @@
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
-      --output="${output}" \
-      "${YUV_RAW_INPUT}"
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm_rt() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      $(vpxenc_rt_params vp9) \
+      --output="${output}"
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -203,14 +251,11 @@
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
-      --test-decode=fatal \
       --output="${output}" \
-      --passes=2 \
-      "${YUV_RAW_INPUT}"
+      --passes=2
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -222,14 +267,12 @@
 vpxenc_vp9_ivf_lossless() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
       --output="${output}" \
-      --lossless=1 \
-      "${YUV_RAW_INPUT}"
+      --lossless=1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -241,15 +284,13 @@
 vpxenc_vp9_ivf_minq0_maxq0() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
       --output="${output}" \
       --min-q=0 \
-      --max-q=0 \
-      "${YUV_RAW_INPUT}"
+      --max-q=0
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -264,16 +305,13 @@
     local readonly lag_total_frames=20
     local readonly lag_frames=10
     local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm"
-    vpxenc --codec=vp9 \
-      --width="${YUV_RAW_INPUT_WIDTH}" \
-      --height="${YUV_RAW_INPUT_HEIGHT}" \
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
       --limit="${lag_total_frames}" \
       --lag-in-frames="${lag_frames}" \
       --output="${output}" \
-      --test-decode=fatal \
       --passes=2 \
-      --auto-alt-ref=1 \
-      "${YUV_RAW_INPUT}"
+      --auto-alt-ref=1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -284,11 +322,13 @@
 
 vpxenc_tests="vpxenc_vp8_ivf
               vpxenc_vp8_webm
+              vpxenc_vp8_webm_rt
               vpxenc_vp8_webm_2pass
               vpxenc_vp8_webm_lag10_frames20
               vpxenc_vp8_ivf_piped_input
               vpxenc_vp9_ivf
               vpxenc_vp9_webm
+              vpxenc_vp9_webm_rt
               vpxenc_vp9_webm_2pass
               vpxenc_vp9_ivf_lossless
               vpxenc_vp9_ivf_minq0_maxq0

diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx
index fa5b498..fb2b9d1 100644
--- a/third_party/libyuv/README.libvpx
+++ b/third_party/libyuv/README.libvpx

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1041
+Version: 1060
 License: BSD
 License File: LICENSE
 

diff --git a/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/third_party/libyuv/include/libyuv/mjpeg_decoder.h
index 82fd95d..8423121 100644
--- a/third_party/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/third_party/libyuv/include/libyuv/mjpeg_decoder.h

@@ -153,7 +153,6 @@
      int* subsample_x, int* subsample_y, int number_of_components);
 
  private:
-
   void AllocOutputBuffers(int num_outbufs);
   void DestroyOutputBuffers();
 

diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h
index fdfe1ae..4b3c870 100644
--- a/third_party/libyuv/include/libyuv/row.h
+++ b/third_party/libyuv/include/libyuv/row.h

@@ -252,6 +252,94 @@
 
 // The following are available on arm64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+// #define HAS_I444TOARGBROW_NEON
+// #define HAS_I422TOARGBROW_NEON
+// #define HAS_I411TOARGBROW_NEON
+// #define HAS_I422TOBGRAROW_NEON
+// #define HAS_I422TOABGRROW_NEON
+// #define HAS_I422TORGBAROW_NEON
+// #define HAS_I422TORGB24ROW_NEON
+// #define HAS_I422TORAWROW_NEON
+// #define HAS_I422TORGB565ROW_NEON
+// #define HAS_I422TOARGB1555ROW_NEON
+// #define HAS_I422TOARGB4444ROW_NEON
+// #define HAS_YTOARGBROW_NEON
+// #define HAS_I400TOARGBROW_NEON
+// #define HAS_NV12TOARGBROW_NEON
+// #define HAS_NV21TOARGBROW_NEON
+// #define HAS_NV12TORGB565ROW_NEON
+// #define HAS_NV21TORGB565ROW_NEON
+// #define HAS_YUY2TOARGBROW_NEON
+// #define HAS_UYVYTOARGBROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_ARGBSETROWS_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+// #define HAS_RGB565TOARGBROW_NEON
+// #define HAS_ARGB1555TOARGBROW_NEON
+// #define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_YUY2TOYROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_HALFROW_NEON
+#define HAS_ARGBTOBAYERROW_NEON
+#define HAS_ARGBTOBAYERGGROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+// #define HAS_ARGBTORGB565ROW_NEON
+// #define HAS_ARGBTOARGB1555ROW_NEON
+// #define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+// #define HAS_ARGBTOUV444ROW_NEON
+// #define HAS_ARGBTOUV422ROW_NEON
+// #define HAS_ARGBTOUV411ROW_NEON
+// #define HAS_ARGBTOUVROW_NEON
+// #define HAS_ARGBTOUVJROW_NEON
+// #define HAS_BGRATOUVROW_NEON
+// #define HAS_ABGRTOUVROW_NEON
+// #define HAS_RGBATOUVROW_NEON
+// #define HAS_RGB24TOUVROW_NEON
+// #define HAS_RAWTOUVROW_NEON
+// #define HAS_RGB565TOUVROW_NEON
+// #define HAS_ARGB1555TOUVROW_NEON
+// #define HAS_ARGB4444TOUVROW_NEON
+// #define HAS_RGB565TOYROW_NEON
+// #define HAS_ARGB1555TOYROW_NEON
+// #define HAS_ARGB4444TOYROW_NEON
+// #define HAS_BGRATOYROW_NEON
+// #define HAS_ABGRTOYROW_NEON
+// #define HAS_RGBATOYROW_NEON
+// #define HAS_RGB24TOYROW_NEON
+// #define HAS_RAWTOYROW_NEON
+// #define HAS_INTERPOLATEROW_NEON
+// #define HAS_ARGBBLENDROW_NEON
+// #define HAS_ARGBATTENUATEROW_NEON
+// #define HAS_ARGBQUANTIZEROW_NEON
+// #define HAS_ARGBSHADEROW_NEON
+// #define HAS_ARGBGRAYROW_NEON
+// #define HAS_ARGBSEPIAROW_NEON
+// #define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELYROW_NEON
 #endif
 
 // The following are available on Neon platforms:
@@ -465,7 +553,7 @@
     #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
 #endif  // defined(__native_client__) && defined(__x86_64__)
 
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
 #undef MEMACCESS
 #if defined(__native_client__)
 #define MEMACCESS(base) ".p2align   3\nbic %" #base ", #0xc0000000\n"

diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h
index 8dc0762..3c49542 100644
--- a/third_party/libyuv/include/libyuv/scale_row.h
+++ b/third_party/libyuv/include/libyuv/scale_row.h

@@ -51,6 +51,14 @@
 #define HAS_SCALEROWDOWN38_NEON
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
+#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__aarch64__) || defined(LIBYUV_NEON))
+/* #define HAS_SCALEROWDOWN2_NEON */
+/* #define HAS_SCALEROWDOWN4_NEON */
+/* #define HAS_SCALEROWDOWN34_NEON */
+/* #define HAS_SCALEROWDOWN38_NEON */
+/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */
+/* #define HAS_SCALEARGBROWDOWN2_NEON */
 #endif
 
 // The following are available on Mips platforms:

diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h
index 912c4c9..73a7f1b 100644
--- a/third_party/libyuv/include/libyuv/version.h
+++ b/third_party/libyuv/include/libyuv/version.h

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1041
+#define LIBYUV_VERSION 1059
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT

diff --git a/third_party/libyuv/source/compare.cc b/third_party/libyuv/source/compare.cc
index 9ea81b4..dc715e0 100644
--- a/third_party/libyuv/source/compare.cc
+++ b/third_party/libyuv/source/compare.cc

@@ -80,7 +80,7 @@
 
 uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
 #if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SUMSQUAREERROR_NEON
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
 #endif

diff --git a/third_party/libyuv/source/compare_neon.cc b/third_party/libyuv/source/compare_neon.cc
index 5e7b8e4..55052c0 100644
--- a/third_party/libyuv/source/compare_neon.cc
+++ b/third_party/libyuv/source/compare_neon.cc

@@ -56,6 +56,45 @@
   return sse;
 }
 
+#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "eor        v16.16b, v16.16b, v16.16b      \n"
+    "eor        v18.16b, v18.16b, v18.16b      \n"
+    "eor        v17.16b, v17.16b, v17.16b      \n"
+    "eor        v19.16b, v19.16b, v19.16b      \n"
+
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"
+    "subs       %2, %2, #16                    \n"
+    "usubl      v2.8h, v0.8b, v1.8b            \n"
+    "usubl2     v3.8h, v0.16b, v1.16b          \n"
+    "smlal      v16.4s, v2.4h, v2.4h           \n"
+    "smlal      v17.4s, v3.4h, v3.4h           \n"
+    "smlal2     v18.4s, v2.8h, v2.8h           \n"
+    "smlal2     v19.4s, v3.8h, v3.8h           \n"
+    "bgt        1b                             \n"
+
+    "add        v16.4s, v16.4s, v17.4s         \n"
+    "add        v18.4s, v18.4s, v19.4s         \n"
+    "add        v19.4s, v16.4s, v18.4s         \n"
+    "addv       s0, v19.4s                     \n"
+    "fmov       %w3, s0                        \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus

diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc
index 874a6cb..a8e294f 100644
--- a/third_party/libyuv/source/convert.cc
+++ b/third_party/libyuv/source/convert.cc

@@ -401,7 +401,7 @@
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
   int y;
-  int halfheight = (height + 1) >> 1;
+  int halfheight;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
   void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
       int pix) = YUY2ToUV422Row_C;
@@ -711,11 +711,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
@@ -963,9 +965,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1022,36 +1021,44 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_RGB24TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYROW_NEON)
-    RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
-    RGB24ToYRow(src_rgb24, dst_y, width);
-    RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
-#else
-    RGB24ToARGBRow(src_rgb24, row, width);
-    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_rgb24 += src_stride_rgb24 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-#if defined(HAS_RGB24TOYROW_NEON)
-    RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
-    RGB24ToYRow(src_rgb24, dst_y, width);
-#else
-    RGB24ToARGBRow(src_rgb24, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-#endif
-  }
+  {
 #if !defined(HAS_RGB24TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
 #endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW_NEON)
+      RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+      RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb24 += src_stride_rgb24 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_RGB24TOYROW_NEON)
+      RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RGB24TOYROW_NEON)
+    free_aligned_buffer_64(row);
+#endif
+  }
   return 0;
 }
 
@@ -1075,9 +1082,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_raw || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1134,36 +1138,42 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_RAWTOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
-    RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
-    RAWToYRow(src_raw, dst_y, width);
-    RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
-#else
-    RAWToARGBRow(src_raw, row, width);
-    RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_raw += src_stride_raw * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
+  {
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+
+    for (y = 0; y < height - 1; y += 2) {
+  #if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+      RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+  #else
+      RAWToARGBRow(src_raw, row, width);
+      RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+  #endif
+      src_raw += src_stride_raw * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+  #if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+  #else
+      RAWToARGBRow(src_raw, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+  #endif
+    }
+  #if !defined(HAS_RAWTOYROW_NEON)
+    free_aligned_buffer_64(row);
+  #endif
   }
-  if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
-    RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
-    RAWToYRow(src_raw, dst_y, width);
-#else
-    RAWToARGBRow(src_raw, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-#endif
-  }
-#if !defined(HAS_RAWTOYROW_NEON)
-  free_aligned_buffer_64(row);
-#endif
   return 0;
 }
 
@@ -1187,9 +1197,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1246,36 +1253,44 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_RGB565TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB565TOYROW_NEON)
-    RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
-    RGB565ToYRow(src_rgb565, dst_y, width);
-    RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
-#else
-    RGB565ToARGBRow(src_rgb565, row, width);
-    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_rgb565 += src_stride_rgb565 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-#if defined(HAS_RGB565TOYROW_NEON)
-    RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
-    RGB565ToYRow(src_rgb565, dst_y, width);
-#else
-    RGB565ToARGBRow(src_rgb565, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-#endif
-  }
+  {
 #if !defined(HAS_RGB565TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
 #endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB565TOYROW_NEON)
+      RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+      RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+      RGB565ToARGBRow(src_rgb565, row, width);
+      RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb565 += src_stride_rgb565 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_RGB565TOYROW_NEON)
+      RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+      RGB565ToARGBRow(src_rgb565, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RGB565TOYROW_NEON)
+    free_aligned_buffer_64(row);
+#endif
+  }
   return 0;
 }
 
@@ -1299,9 +1314,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1358,38 +1370,45 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_ARGB1555TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
-    ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
-    ARGB1555ToYRow(src_argb1555, dst_y, width);
-    ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
-                   width);
-#else
-    ARGB1555ToARGBRow(src_argb1555, row, width);
-    ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
-                      width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+  {
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
 #endif
-    src_argb1555 += src_stride_argb1555 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
+    for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_ARGB1555TOYROW_NEON)
-    ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
-    ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+                     width);
 #else
-    ARGB1555ToARGBRow(src_argb1555, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-  }
+      src_argb1555 += src_stride_argb1555 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+      ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
 #if !defined(HAS_ARGB1555TOYROW_NEON)
   free_aligned_buffer_64(row);
 #endif
+  }
   return 0;
 }
 
@@ -1413,9 +1432,6 @@
       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
-  // Allocate 2 rows of ARGB.
-  const int kRowSize = (width * 4 + 15) & ~15;
-  align_buffer_64(row, kRowSize * 2);
 #endif
   if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
       width <= 0 || height == 0) {
@@ -1472,38 +1488,46 @@
 #endif  // HAS_ARGBTOUVROW_SSSE3
 #endif  // HAS_ARGB4444TOYROW_NEON
 
-  for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
-    ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
-    ARGB4444ToYRow(src_argb4444, dst_y, width);
-    ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
-                   width);
-#else
-    ARGB4444ToARGBRow(src_argb4444, row, width);
-    ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
-                      width);
-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
-    src_argb4444 += src_stride_argb4444 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
-    ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
-    ARGB4444ToYRow(src_argb4444, dst_y, width);
-#else
-    ARGB4444ToARGBRow(src_argb4444, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
-    ARGBToYRow(row, dst_y, width);
-#endif
-  }
+  {
 #if !defined(HAS_ARGB4444TOYROW_NEON)
-  free_aligned_buffer_64(row);
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
 #endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+      ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+      ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+                     width);
+#else
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_argb4444 += src_stride_argb4444 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+      ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+    free_aligned_buffer_64(row);
+#endif
+  }
   return 0;
 }
 

diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc
index 121a416..de461dd 100644
--- a/third_party/libyuv/source/convert_from_argb.cc
+++ b/third_party/libyuv/source/convert_from_argb.cc

@@ -60,6 +60,13 @@
         }
       }
   }
+#elif defined(HAS_ARGBTOUV444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToUV444Row = ARGBToUV444Row_NEON;
+    }
+  }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -76,10 +83,8 @@
 #elif defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
-    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
-      ARGBToUV444Row = ARGBToUV444Row_NEON;
     }
   }
 #endif
@@ -134,6 +139,13 @@
       }
     }
   }
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
+    }
+  }
 #endif
 
 #if defined(HAS_ARGBTOYROW_SSSE3)
@@ -153,12 +165,6 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
@@ -228,11 +234,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 32) {
-      ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
-      if (IS_ALIGNED(width, 32)) {
-        ARGBToUV411Row = ARGBToUV411Row_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
+    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUV411Row = ARGBToUV411Row_NEON;
     }
   }
 #endif
@@ -261,9 +269,6 @@
       ARGBToYRow_C;
   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                       int width) = MergeUVRow_C;
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
   if (!src_argb ||
       !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
@@ -296,11 +301,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
@@ -331,22 +338,27 @@
     }
   }
 #endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+    uint8* row_v = row_u + ((halfwidth + 15) & ~15);
 
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_uv += dst_stride_uv;
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
   }
-  if (height & 1) {
-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-  }
-  free_aligned_buffer_64(row_u);
   return 0;
 }
 
@@ -364,9 +376,6 @@
       ARGBToYRow_C;
   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                       int width) = MergeUVRow_C;
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
   if (!src_argb ||
       !dst_y || !dst_uv ||
       width <= 0 || height == 0) {
@@ -399,11 +408,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif
@@ -434,22 +445,27 @@
     }
   }
 #endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+    uint8* row_v = row_u + ((halfwidth + 15) & ~15);
 
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_uv += dst_stride_uv;
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
   }
-  if (height & 1) {
-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
-    ARGBToYRow(src_argb, dst_y, width);
-  }
-  free_aligned_buffer_64(row_u);
   return 0;
 }
 
@@ -493,6 +509,13 @@
       }
     }
   }
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
+    }
+  }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -510,12 +533,6 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
@@ -594,6 +611,13 @@
       }
     }
   }
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_NEON;
+    }
+  }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -611,12 +635,6 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV422Row = ARGBToUV422Row_NEON;
-      }
-    }
   }
 #endif
 
@@ -1022,11 +1040,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYJRow = ARGBToYJRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVJRow = ARGBToUVJRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON;
     }
   }
 #endif

diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc
index 2e0d61d..deb4c44 100644
--- a/third_party/libyuv/source/cpu_id.cc
+++ b/third_party/libyuv/source/cpu_id.cc

@@ -14,7 +14,7 @@
 #include <intrin.h>  // For __cpuidex()
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
-    !defined(__native_client__) && defined(_M_X64) && \
+    !defined(__native_client__)  && \
     defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
 #include <immintrin.h>  // For _xgetbv()
 #endif
@@ -97,7 +97,7 @@
   uint32 xcr0 = 0u;
 #if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
   xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
-#elif defined(_M_IX86)
+#elif defined(_M_IX86) && defined(_MSC_VER)
   __asm {
     xor        ecx, ecx    // xcr 0
     _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
@@ -256,12 +256,17 @@
   if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
     cpu_info_ &= ~kCpuHasMIPS_DSPR2;
   }
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__
 // __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
 // For Linux, /proc/cpuinfo can be tested but without that assume Neon.
 #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
   cpu_info_ = kCpuHasNEON;
+// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+// flag in it.
+// So for aarch64, neon enabling is hard coded here.
+#elif defined(__aarch64__)
+  cpu_info_ = kCpuHasNEON;
 #else
   // Linux arm parse text file for neon detect.
   cpu_info_ = ArmCpuCaps("/proc/cpuinfo");

diff --git a/third_party/libyuv/source/format_conversion.cc b/third_party/libyuv/source/format_conversion.cc
index a3daf96..3c17371 100644
--- a/third_party/libyuv/source/format_conversion.cc
+++ b/third_party/libyuv/source/format_conversion.cc

@@ -332,11 +332,13 @@
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
     }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
     }
   }
 #endif

diff --git a/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/libyuv/source/mjpeg_decoder.cc
index 15b0ed8..36028c3 100644
--- a/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/third_party/libyuv/source/mjpeg_decoder.cc

@@ -13,8 +13,8 @@
 #ifdef HAVE_JPEG
 #include <assert.h>
 
-#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
-    !defined(TARGET_IPHONE_SIMULATOR)
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 // Must be included before jpeglib.
 #include <setjmp.h>
 #define HAVE_SETJMP
@@ -101,7 +101,7 @@
   }
 
   buf_.data = src;
-  buf_.len = (int)(src_len);
+  buf_.len = static_cast<int>(src_len);
   buf_vec_.pos = 0;
   decompress_struct_->client_data = &buf_vec_;
 #ifdef HAVE_SETJMP
@@ -411,7 +411,7 @@
 }
 
 boolean fill_input_buffer(j_decompress_ptr cinfo) {
-  BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
+  BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
   if (buf_vec->pos >= buf_vec->len) {
     assert(0 && "No more data");
     // ERROR: No more data
@@ -447,7 +447,7 @@
   // ERROR: Error in jpeglib: buf
 #endif
 
-  SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
+  SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
   // This rewinds the call stack to the point of the corresponding setjmp()
   // and causes it to return (for a second time) with value 1.
   longjmp(mgr->setjmp_buffer, 1);

diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc
index 97ef844..ce8b3da 100644
--- a/third_party/libyuv/source/row_any.cc
+++ b/third_party/libyuv/source/row_any.cc

@@ -79,9 +79,13 @@
 YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
      1, 2, 7)
 YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
-YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
-YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
 #endif  // HAS_I422TOARGBROW_NEON
+#ifdef HAS_I422TOYUY2ROW_NEON
+YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
+#endif  // HAS_I422TOYUY2ROW_NEON
+#ifdef HAS_I422TOUYVYROW_NEON
+YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
+#endif  // HAS_I422TOUYVYROW_NEON
 #undef YANY
 
 // Wrappers to handle odd width
@@ -250,12 +254,26 @@
 YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
 YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
 YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
 YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
 YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
 YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
 YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
 YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
 YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
 YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
 #endif
 #undef YANY
@@ -333,7 +351,11 @@
 UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
 UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
 UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
 UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
 UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
 #endif
 #undef UVANY

diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc
index 46e9ceb..21111cf 100644
--- a/third_party/libyuv/source/row_neon64.cc
+++ b/third_party/libyuv/source/row_neon64.cc

@@ -824,19 +824,19 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+    "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pairs of UV
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store U
+    "st1        {v0.16b}, [%1], #16            \n"  // store U
     MEMACCESS(2)
-    "vst1.8     {q1}, [%2]!                    \n"  // store V
+    "st1        {v1.16b}, [%2], #16            \n"  // store V
     "bgt        1b                             \n"
     : "+r"(src_uv),  // %0
       "+r"(dst_u),   // %1
       "+r"(dst_v),   // %2
       "+r"(width)    // %3  // Output registers
     :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
+    : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_SPLITUVROW_NEON
@@ -849,12 +849,12 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load U
+    "ld1        {v0.16b}, [%0], #16            \n"  // load U
     MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load V
+    "ld1        {v1.16b}, [%1], #16            \n"  // load V
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     MEMACCESS(2)
-    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+    "st2        {v0.16b, v1.16b}, [%2], #32    \n"  // store 16 pairs of UV
     "bgt        1b                             \n"
     :
       "+r"(src_u),   // %0
@@ -862,7 +862,7 @@
       "+r"(dst_uv),  // %2
       "+r"(width)    // %3  // Output registers
     :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
+    : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_MERGEUVROW_NEON
@@ -874,16 +874,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+    "ld1        {v0.8b-v3.8b}, [%0], #32       \n"  // load 32
     "subs       %2, %2, #32                    \n"  // 32 processed per loop
     MEMACCESS(1)
-    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
+    "st1        {v0.8b-v3.8b}, [%1], #32       \n"  // store 32
     "bgt        1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(count)  // %2  // Output registers
   :                     // Input registers
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_COPYROW_NEON
@@ -892,16 +892,16 @@
 #ifdef HAS_SETROW_NEON
 void SetRow_NEON(uint8* dst, uint32 v32, int count) {
   asm volatile (
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
+    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
     "1:                                        \n"
     "subs      %1, %1, #16                     \n"  // 16 bytes per loop
     MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "st1        {v0.16b}, [%0], #16            \n"  // store
     "bgt       1b                              \n"
   : "+r"(dst),   // %0
     "+r"(count)  // %1
   : "r"(v32)     // %2
-  : "cc", "memory", "q0"
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_SETROW_NEON
@@ -922,26 +922,25 @@
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
     // Start at end of source row.
-    "mov        r3, #-16                       \n"
     "add        %0, %0, %2                     \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #16                        \n"  // 16 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #16                    \n"  // 16 pixels per loop.
+    "rev64      v0.16b, v0.16b                 \n"
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
+    "st1        {v0.D}[0], [%1], #8            \n"
     "bgt        1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_MIRRORROW_NEON
@@ -951,27 +950,27 @@
                       int width) {
   asm volatile (
     // Start at end of source row.
-    "mov        r12, #-16                      \n"
     "add        %0, %0, %3, lsl #1             \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
-    "subs       %3, #8                         \n"  // 8 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
+    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
+    "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
+    "rev64      v0.8b, v0.8b                   \n"
+    "rev64      v1.8b, v1.8b                   \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+    "st1        {v0.8b}, [%1], #8               \n"  // dst += 8
     MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"
+    "st1        {v1.8b}, [%2], #8               \n"
     "bgt        1b                             \n"
   : "+r"(src_uv),  // %0
     "+r"(dst_u),   // %1
     "+r"(dst_v),   // %2
     "+r"(width)    // %3
-  :
-  : "cc", "memory", "r12", "q0"
+  : "r"((ptrdiff_t)-16)      // %4
+  : "cc", "memory", "v0", "v1"
   );
 }
 #endif  // HAS_MIRRORUVROW_NEON
@@ -980,26 +979,25 @@
 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
     // Start at end of source row.
-    "mov        r3, #-16                       \n"
     "add        %0, %0, %2, lsl #2             \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #4                         \n"  // 4 pixels per loop.
-    "vrev64.32  q0, q0                         \n"
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    "rev64      v0.4s, v0.4s                   \n"
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
+    "st1        {v0.D}[0], [%1], #8            \n"
     "bgt        1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_ARGBMIRRORROW_NEON
@@ -1007,20 +1005,20 @@
 #ifdef HAS_RGB24TOARGBROW_NEON
 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
+    "movi       v4.8b, #255                    \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "ld3        {v1.8b-v3.8b}, [%0], #24       \n"  // load 8 pixels of RGB24.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "st4        {v1.8b-v4.8b}, [%1], #32       \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
     "+r"(dst_argb),   // %1
     "+r"(pix)         // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
   );
 }
 #endif  // HAS_RGB24TOARGBROW_NEON
@@ -1028,21 +1026,22 @@
 #ifdef HAS_RAWTOARGBROW_NEON
 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
+    "movi       v5.8b, #255                    \n"  // Alpha
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+    "ld3        {v0.8b-v2.8b}, [%0], #24       \n"  // read r g b
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
+    "mov        v3.8b, v1.8b                   \n"  // move g
+    "mov        v4.8b, v0.8b                   \n"  // move r
     MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "st4        {v2.8b-v5.8b}, [%1], #32       \n"  // store b g r a
     "bgt        1b                             \n"
   : "+r"(src_raw),   // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
   );
 }
 #endif  // HAS_RAWTOARGBROW_NEON
@@ -1170,16 +1169,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "ld4        {v1.8b-v4.8b}, [%0], #32       \n"  // load 8 pixels of ARGB.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
+    "st3        {v1.8b-v3.8b}, [%1], #24       \n"  // store 8 pixels of RGB24.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_rgb24),  // %1
     "+r"(pix)         // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTORGB24ROW_NEON
@@ -1190,17 +1189,18 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "ld4        {v1.8b-v4.8b}, [%0], #32       \n"  // load b g r a
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
+    "mov        v4.8b, v2.8b                   \n"  // mov g
+    "mov        v5.8b, v1.8b                   \n"  // mov b
     MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
+    "st3        {v3.8b-v5.8b}, [%1], #24       \n"  // store r g b
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_raw),   // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTORAWROW_NEON
@@ -1211,16 +1211,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
+    "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pixels of YUY2.
     "subs       %2, %2, #16                    \n"  // 16 processed per loop.
     MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
+    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_YUY2TOYROW_NEON
@@ -1231,16 +1231,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
+    "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pixels of UYVY.
     "subs       %2, %2, #16                    \n"  // 16 processed per loop.
     MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
+    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_UYVYTOYROW_NEON
@@ -1252,19 +1252,19 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of YUY2.
     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
     MEMACCESS(2)
-    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
+    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
     "+r"(pix)        // %3
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_YUY2TOUV422ROW_NEON
@@ -1276,19 +1276,19 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of UYVY.
     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
     MEMACCESS(2)
-    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
+    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),  // %0
     "+r"(dst_u),     // %1
     "+r"(dst_v),     // %2
     "+r"(pix)        // %3
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_UYVYTOUV422ROW_NEON
@@ -1297,20 +1297,20 @@
 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_yuy2
+    "add        %x1, %x0, %w1, sxtw            \n"  // stride + src_yuy2
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of YUY2.
     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
-    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
-    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
+    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load next row YUY2.
+    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
+    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
     MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
     MEMACCESS(3)
-    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
+    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_yuy2),     // %0
     "+r"(stride_yuy2),  // %1
@@ -1318,7 +1318,7 @@
     "+r"(dst_v),        // %3
     "+r"(pix)           // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"  // Clobber List
   );
 }
 #endif  // HAS_YUY2TOUVROW_NEON
@@ -1327,20 +1327,20 @@
 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_uyvy
+    "add        %x1, %x0, %w1, sxtw            \n"  // stride + src_uyvy
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of UYVY.
     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
-    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
-    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
+    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load next row UYVY.
+    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
+    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
     MEMACCESS(3)
-    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
+    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
     "bgt        1b                             \n"
   : "+r"(src_uyvy),     // %0
     "+r"(stride_uyvy),  // %1
@@ -1348,7 +1348,7 @@
     "+r"(dst_v),        // %3
     "+r"(pix)           // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"  // Clobber List
   );
 }
 #endif  // HAS_UYVYTOUVROW_NEON
@@ -1358,23 +1358,23 @@
                   uint8* dst_uv, int pix) {
   asm volatile (
     // change the stride to row 2 pointer
-    "add        %1, %0                         \n"
+    "add        %x1, %x0, %w1, sxtw            \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load row 1 16 pixels.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load row 1 16 pixels.
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load row 2 16 pixels.
-    "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
+    "ld1        {v1.16b}, [%1], #16            \n"  // load row 2 16 pixels.
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // average row 1 and 2
     MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
+    "st1        {v0.16b}, [%2], #16            \n"
     "bgt        1b                             \n"
   : "+r"(src_uv),         // %0
     "+r"(src_uv_stride),  // %1
     "+r"(dst_uv),         // %2
     "+r"(pix)             // %3
   :
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_HALFROW_NEON
@@ -1384,22 +1384,22 @@
 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
   asm volatile (
-    "vmov.u32   d6[0], %3                      \n"  // selector
+    "mov        v2.s[0], %w3                   \n"  // selector
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 8 pixels.
+    "ld1        {v0.16b, v1.16b}, [%0], 32     \n"  // load row 8 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "vtbl.8     d4, {d0, d1}, d6               \n"  // look up 4 pixels
-    "vtbl.8     d5, {d2, d3}, d6               \n"  // look up 4 pixels
-    "vtrn.u32   d4, d5                         \n"  // combine 8 pixels
+    "tbl        v4.8b, {v0.16b}, v2.8b         \n"  // look up 4 pixels
+    "tbl        v5.8b, {v1.16b}, v2.8b         \n"  // look up 4 pixels
+    "trn1       v4.4s, v4.4s, v5.4s            \n"  // combine 8 pixels
     MEMACCESS(1)
-    "vst1.8     {d4}, [%1]!                    \n"  // store 8.
+    "st1        {v4.8b}, [%1], #8              \n"  // store 8.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_bayer),  // %1
     "+r"(pix)         // %2
   : "r"(selector)     // %3
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v4", "v5"   // Clobber List
   );
 }
 #endif  // HAS_ARGBTOBAYERROW_NEON
@@ -1411,16 +1411,16 @@
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load row 8 pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
+    "st1        {v1.8b}, [%1], #8              \n"  // store 8 G's.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_bayer),  // %1
     "+r"(pix)         // %2
   :
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTOBAYERGGROW_NEON
@@ -1431,21 +1431,20 @@
                          const uint8* shuffler, int pix) {
   asm volatile (
     MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // shuffler
+    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
     "subs       %2, %2, #4                     \n"  // 4 processed per loop
-    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
-    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
     MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
+    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   : "r"(shuffler)    // %3
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
   );
 }
 #endif  // HAS_ARGBSHUFFLEROW_NEON
@@ -1459,14 +1458,15 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
+    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
+    "mov        v2.8b, v1.8b                   \n"
     MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
     MEMACCESS(2)
-    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
+    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
     "subs       %4, %4, #16                    \n"  // 16 pixels
     MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
+    "st4        {v0.8b-v3.8b}, [%3], #32       \n"  // Store 8 YUY2/16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_y),     // %0
     "+r"(src_u),     // %1
@@ -1474,7 +1474,7 @@
     "+r"(dst_yuy2),  // %3
     "+r"(width)      // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_I422TOYUY2ROW_NEON
@@ -1488,14 +1488,15 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
+    "ld2        {v1.8b, v2.8b}, [%0], #16      \n"  // load 16 Ys
+    "mov        v3.8b, v2.8b                   \n"
     MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
     MEMACCESS(2)
-    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
+    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
     "subs       %4, %4, #16                    \n"  // 16 pixels
     MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
+    "st4        {v0.8b-v3.8b}, [%3], #32       \n"  // Store 8 UYVY/16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_y),     // %0
     "+r"(src_u),     // %1
@@ -1503,7 +1504,7 @@
     "+r"(dst_uyvy),  // %3
     "+r"(width)      // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_I422TOUYVYROW_NEON
@@ -1577,28 +1578,28 @@
 #ifdef HAS_ARGBTOYROW_NEON
 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBTOYROW_NEON
@@ -1606,26 +1607,26 @@
 #ifdef HAS_ARGBTOYJROW_NEON
 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
+    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
+    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
     "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_y),     // %1
     "+r"(pix)        // %2
   :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
   );
 }
 #endif  // HAS_ARGBTOYJROW_NEON
@@ -3048,20 +3049,20 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
     MEMACCESS(1)
-    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q0, d0, d1                     \n"  // multiply B
-    "vmull.u8   q1, d2, d3                     \n"  // multiply G
-    "vmull.u8   q2, d4, d5                     \n"  // multiply R
-    "vmull.u8   q3, d6, d7                     \n"  // multiply A
-    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
-    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
-    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
-    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
+    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
+    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
+    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
+    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
+    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
+    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
+    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
   : "+r"(src_argb0),  // %0
@@ -3069,7 +3070,7 @@
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBMULTIPLYROW_NEON
@@ -3083,14 +3084,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
-    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"
+    "uqadd      v3.8b, v3.8b, v7.8b            \n"
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
   : "+r"(src_argb0),  // %0
@@ -3098,7 +3101,7 @@
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBADDROW_NEON
@@ -3112,14 +3115,16 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels.
     MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
-    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+    "uqsub      v0.8b, v0.8b, v4.8b            \n"
+    "uqsub      v1.8b, v1.8b, v5.8b            \n"
+    "uqsub      v2.8b, v2.8b, v6.8b            \n"
+    "uqsub      v3.8b, v3.8b, v7.8b            \n"
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
 
   : "+r"(src_argb0),  // %0
@@ -3127,7 +3132,7 @@
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
   );
 }
 #endif  // HAS_ARGBSUBTRACTROW_NEON
@@ -3141,27 +3146,27 @@
 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) {
   asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
+    "movi       v3.8b, #255                    \n"  // alpha
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
     MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d0, d0, d1                     \n"  // add
-    "vmov.u8    d1, d0                         \n"
-    "vmov.u8    d2, d0                         \n"
+    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
+    "mov        v1.8b, v0.8b                   \n"
+    "mov        v2.8b, v0.8b                   \n"
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_argb),    // %2
     "+r"(width)        // %3
   :
-  : "cc", "memory", "q0", "q1"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_SOBELROW_NEON
@@ -3175,20 +3180,20 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
     MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
+    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
     "subs       %3, %3, #16                    \n"  // 16 processed per loop.
-    "vqadd.u8   q0, q0, q1                     \n"  // add
+    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
     MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
+    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_y),       // %2
     "+r"(width)        // %3
   :
-  : "cc", "memory", "q0", "q1"
+  : "cc", "memory", "v0", "v1"
   );
 }
 #endif  // HAS_SOBELTOPLANEROW_NEON
@@ -3202,25 +3207,25 @@
 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) {
   asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
+    "movi       v3.8b, #255                    \n"  // alpha
     // 8 pixel loop.
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
     MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
     "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d1, d0, d2                     \n"  // add
+    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
     MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
   : "+r"(src_sobelx),  // %0
     "+r"(src_sobely),  // %1
     "+r"(dst_argb),    // %2
     "+r"(width)        // %3
   :
-  : "cc", "memory", "q0", "q1"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif  // HAS_SOBELXYROW_NEON
@@ -3236,28 +3241,28 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%5                  \n"  // top
+    "ld1        {v0.8b}, [%0],%5               \n"  // top
     MEMACCESS(0)
-    "vld1.8     {d1}, [%0],%6                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
+    "ld1        {v1.8b}, [%0],%6               \n"
+    "usubl      v0.8h, v0.8b, v1.8b            \n"
     MEMACCESS(1)
-    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
     MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%6                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
+    "ld1        {v3.8b}, [%1],%6               \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
     MEMACCESS(2)
-    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
     MEMACCESS(2)
-    "vld1.8     {d3}, [%2],%6                  \n"
+    "ld1        {v3.8b}, [%2],%6               \n"
     "subs       %4, %4, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "abs        v0.8h, v0.8h                   \n"
+    "uqxtn      v0.8b, v0.8h                   \n"
     MEMACCESS(3)
-    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
+    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
     "bgt        1b                             \n"
   : "+r"(src_y0),      // %0
     "+r"(src_y1),      // %1
@@ -3266,7 +3271,7 @@
     "+r"(width)        // %4
   : "r"(2),            // %5
     "r"(6)             // %6
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_SOBELXROW_NEON
@@ -3282,28 +3287,28 @@
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%4                  \n"  // left
+    "ld1        {v0.8b}, [%0],%4               \n"  // left
     MEMACCESS(1)
-    "vld1.8     {d1}, [%1],%4                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
+    "ld1        {v1.8b}, [%1],%4               \n"
+    "usubl      v0.8h, v0.8b, v1.8b            \n"
     MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
     MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%4                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
+    "ld1        {v3.8b}, [%1],%4               \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
     MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%5                  \n"  // right
+    "ld1        {v2.8b}, [%0],%5               \n"  // right
     MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%5                  \n"
+    "ld1        {v3.8b}, [%1],%5               \n"
     "subs       %3, %3, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "abs        v0.8h, v0.8h                   \n"
+    "uqxtn      v0.8b, v0.8h                   \n"
     MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
     "bgt        1b                             \n"
   : "+r"(src_y0),      // %0
     "+r"(src_y1),      // %1
@@ -3311,7 +3316,7 @@
     "+r"(width)        // %3
   : "r"(1),            // %4
     "r"(6)             // %5
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_SOBELYROW_NEON

diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc
index 8eb8889..f58fc51 100644
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc

@@ -10,7 +10,7 @@
 
 #include "libyuv/row.h"
 
-#if defined (_M_X64)
+#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
 #include <emmintrin.h>
 #include <tmmintrin.h>  // For _mm_maddubs_epi16
 #endif
@@ -78,7 +78,6 @@
                          const uint8* v_buf,
                          uint8* dst_argb,
                          int width) {
-
   __m128i xmm0, xmm1, xmm2, xmm3;
   const __m128i xmm5 = _mm_set1_epi8(-1);
   const __m128i xmm4 = _mm_setzero_si128();
@@ -132,7 +131,6 @@
                                    const uint8* v_buf,
                                    uint8* dst_argb,
                                    int width) {
-
   __m128i xmm0, xmm1, xmm2, xmm3;
   const __m128i xmm5 = _mm_set1_epi8(-1);
   const __m128i xmm4 = _mm_setzero_si128();

diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc
new file mode 100644
index 0000000..64c7d10
--- /dev/null
+++ b/third_party/libyuv/source/scale_neon64.cc

@@ -0,0 +1,790 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#ifdef HAS_SCALEROWDOWN2_NEON
+// Read 32x1 throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1"              // Clobber List
+  );
+}
+#endif //HAS_SCALEROWDOWN2_NEON
+
+#ifdef HAS_SCALEROWDOWN2_NEON
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+    MEMACCESS(1)
+    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
+    "vpaddl.u8  q1, q1                         \n"
+    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
+    "vpadal.u8  q1, q3                         \n"
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "q0", "q1", "q2", "q3"     // Clobber List
+  );
+}
+#endif //HAS_SCALEROWDOWN2_NEON
+
+#ifdef HAS_SCALEROWDOWN4_NEON
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
+    "subs       %2, %2, #8                     \n" // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {d2}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1", "memory", "cc"
+  );
+}
+#endif //HAS_SCALEROWDOWN4_NEON
+
+#ifdef HAS_SCALEROWDOWN4_NEON
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
+    MEMACCESS(3)
+    "vld1.8     {q1}, [%3]!                    \n"
+    MEMACCESS(4)
+    "vld1.8     {q2}, [%4]!                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q3}, [%5]!                    \n"
+    "subs       %2, %2, #4                     \n"
+    "vpaddl.u8  q0, q0                         \n"
+    "vpadal.u8  q0, q1                         \n"
+    "vpadal.u8  q0, q2                         \n"
+    "vpadal.u8  q0, q3                         \n"
+    "vpaddl.u16 q0, q0                         \n"
+    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
+    "vmovn.u16  d0, q0                         \n"
+    MEMACCESS(1)
+    "vst1.32    {d0[0]}, [%1]!                 \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width), // %2
+    "+r"(src_ptr1),  // %3
+    "+r"(src_ptr2),  // %4
+    "+r"(src_ptr3)   // %5
+  :
+  : "q0", "q1", "q2", "q3", "memory", "cc"
+  );
+}
+#endif //HAS_SCALEROWDOWN4_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load up the every 4th pixel into a 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    "subs       %2, %2, #24                  \n"
+    "vmov       d2, d3                       \n" // order d0, d1, d2
+    MEMACCESS(1)
+    "vst3.8     {d0, d1, d2}, [%1]!          \n"
+    "bgt        1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "d0", "d1", "d2", "d3", "memory", "cc"
+  );
+}
+#endif //HAS_SCALEROWDOWN34_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vmov.u8    d24, #3                        \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n"
+
+    // filter src line 0 with src line 1
+    // expand chars to shorts to allow for room
+    // when adding lines together
+    "vmovl.u8     q8, d4                       \n"
+    "vmovl.u8     q9, d5                       \n"
+    "vmovl.u8     q10, d6                      \n"
+    "vmovl.u8     q11, d7                      \n"
+
+    // 3 * line_0 + line_1
+    "vmlal.u8     q8, d0, d24                  \n"
+    "vmlal.u8     q9, d1, d24                  \n"
+    "vmlal.u8     q10, d2, d24                 \n"
+    "vmlal.u8     q11, d3, d24                 \n"
+
+    // (3 * line_0 + line_1) >> 2
+    "vqrshrn.u16  d0, q8, #2                   \n"
+    "vqrshrn.u16  d1, q9, #2                   \n"
+    "vqrshrn.u16  d2, q10, #2                  \n"
+    "vqrshrn.u16  d3, q11, #2                  \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q8, d1                       \n"
+    "vmlal.u8     q8, d0, d24                  \n"
+    "vqrshrn.u16  d0, q8, #2                   \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2                   \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q8, d2                       \n"
+    "vmlal.u8     q8, d3, d24                  \n"
+    "vqrshrn.u16  d2, q8, #2                   \n"
+
+    MEMACCESS(1)
+    "vst3.8       {d0, d1, d2}, [%1]!          \n"
+
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+  );
+}
+#endif //ScaleRowDown34_0_Box_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vmov.u8    d24, #3                        \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n"
+    // average src line 0 with src line 1
+    "vrhadd.u8    q0, q0, q2                   \n"
+    "vrhadd.u8    q1, q1, q3                   \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q3, d1                       \n"
+    "vmlal.u8     q3, d0, d24                  \n"
+    "vqrshrn.u16  d0, q3, #2                   \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2                   \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q3, d2                       \n"
+    "vmlal.u8     q3, d3, d24                  \n"
+    "vqrshrn.u16  d2, q3, #2                   \n"
+
+    MEMACCESS(1)
+    "vst3.8       {d0, d1, d2}, [%1]!          \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+  );
+}
+#endif //HAS_SCALEROWDOWN34_NEON
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEROWDOWN38_NEON
+static uvec8 kShuf38 =
+  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =
+  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =
+  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =
+  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q3}, [%3]                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
+    "subs       %2, %2, #12                    \n"
+    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
+    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
+    MEMACCESS(1)
+    "vst1.8     {d4}, [%1]!                    \n"
+    MEMACCESS(1)
+    "vst1.32    {d5[0]}, [%1]!                 \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(&kShuf38)           // %3
+  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+  );
+}
+
+#endif //HAS_SCALEROWDOWN38_NEON
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.16    {q13}, [%5]                    \n"
+    MEMACCESS(6)
+    "vld1.8     {q14}, [%6]                    \n"
+    MEMACCESS(7)
+    "vld1.8     {q15}, [%7]                    \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    MEMACCESS(4)
+    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
+    "subs         %2, %2, #12                  \n"
+
+    // Shuffle the input data around to get align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8      d0, d1                       \n"
+    "vtrn.u8      d4, d5                       \n"
+    "vtrn.u8      d16, d17                     \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8      d2, d3                       \n"
+    "vtrn.u8      d6, d7                       \n"
+    "vtrn.u8      d18, d19                     \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d2 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8    q0, q0                       \n"
+    "vpaddl.u8    q2, q2                       \n"
+    "vpaddl.u8    q8, q8                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8    d3, d3                       \n"
+    "vpaddl.u8    d7, d7                       \n"
+    "vpaddl.u8    d19, d19                     \n"
+
+    // combine source lines
+    "vadd.u16     q0, q2                       \n"
+    "vadd.u16     q0, q8                       \n"
+    "vadd.u16     d4, d3, d7                   \n"
+    "vadd.u16     d4, d19                      \n"
+
+    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+    //             + s[6 + st * 1] + s[7 + st * 1]
+    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+    "vqrdmulh.s16 q2, q2, q13                  \n"
+    "vmovn.u16    d4, q2                       \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8     q1, d2                       \n"
+    "vmovl.u8     q3, d6                       \n"
+    "vmovl.u8     q9, d18                      \n"
+
+    // combine source lines
+    "vadd.u16     q1, q3                       \n"
+    "vadd.u16     q1, q9                       \n"
+
+    // d4 = xx 20 xx 30 xx 22 xx 32
+    // d5 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32     d2, d3                       \n"
+
+    // d4 = xx 20 xx 21 xx 22 xx 23
+    // d5 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16     d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16     q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "vqrdmulh.s16 q0, q0, q15                  \n"
+
+    // Align for table lookup, vtbl requires registers to
+    //  be adjacent
+    "vmov.u8      d2, d4                       \n"
+
+    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+
+    MEMACCESS(1)
+    "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
+    "vst1.32      {d4[0]}, [%1]!               \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_ptr1)          // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
+  );
+}
+#endif //HAS_SCALEROWDOWN38_NEON
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(4)
+    "vld1.16    {q13}, [%4]                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q14}, [%5]                    \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    "subs         %2, %2, #12                  \n"
+
+    // Shuffle the input data around to get align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8      d0, d1                       \n"
+    "vtrn.u8      d4, d5                       \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8      d2, d3                       \n"
+    "vtrn.u8      d6, d7                       \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d2 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8    q0, q0                       \n"
+    "vpaddl.u8    q2, q2                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8    d3, d3                       \n"
+    "vpaddl.u8    d7, d7                       \n"
+
+    // combine source lines
+    "vadd.u16     q0, q2                       \n"
+    "vadd.u16     d4, d3, d7                   \n"
+
+    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+    "vqrshrn.u16  d4, q2, #2                   \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8     q1, d2                       \n"
+    "vmovl.u8     q3, d6                       \n"
+
+    // combine source lines
+    "vadd.u16     q1, q3                       \n"
+
+    // d4 = xx 20 xx 30 xx 22 xx 32
+    // d5 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32     d2, d3                       \n"
+
+    // d4 = xx 20 xx 21 xx 22 xx 23
+    // d5 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16     d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16     q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "vqrdmulh.s16 q0, q0, q13                  \n"
+
+    // Align for table lookup, vtbl requires registers to
+    //  be adjacent
+    "vmov.u8      d2, d4                       \n"
+
+    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+
+    MEMACCESS(1)
+    "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
+    "vst1.32      {d4[0]}, [%1]!               \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),       // %0
+    "+r"(dst_ptr),       // %1
+    "+r"(dst_width),     // %2
+    "+r"(src_stride)     // %3
+  : "r"(&kMult38_Div6),  // %4
+    "r"(&kShuf38_2)      // %5
+  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+  );
+}
+#endif //HAS_SCALEROWDOWN38_NEON
+
+#if 0
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr, ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction) {
+  asm volatile (
+    "cmp          %4, #0                       \n"
+    "beq          100f                         \n"
+    "add          %2, %1                       \n"
+    "cmp          %4, #64                      \n"
+    "beq          75f                          \n"
+    "cmp          %4, #128                     \n"
+    "beq          50f                          \n"
+    "cmp          %4, #192                     \n"
+    "beq          25f                          \n"
+
+    "vdup.8       d5, %4                       \n"
+    "rsb          %4, #256                     \n"
+    "vdup.8       d4, %4                       \n"
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vmull.u8     q13, d0, d4                  \n"
+    "vmull.u8     q14, d1, d4                  \n"
+    "vmlal.u8     q13, d2, d5                  \n"
+    "vmlal.u8     q14, d3, d5                  \n"
+    "vrshrn.u16   d0, q13, #8                  \n"
+    "vrshrn.u16   d1, q14, #8                  \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          1b                           \n"
+    "b            99f                          \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          25b                          \n"
+    "b            99f                          \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q1}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q0}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          100b                         \n"
+
+  "99:                                         \n"
+    MEMACCESS(0)
+    "vst1.8       {d1[7]}, [%0]                \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction) // %4
+  :
+  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+  );
+}
+#endif //0
+
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    MEMACCESS(0)
+    "vld2.32    {q0, q1}, [%0]!                \n"
+    MEMACCESS(0)
+    "vld2.32    {q2, q3}, [%0]!                \n"
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    MEMACCESS(1)
+    "vst1.8     {q3}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+#endif //HAS_SCALEARGBROWDOWN2_NEON
+
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
+    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    "vrshrn.u16 d2, q2, #2                     \n"
+    "vrshrn.u16 d3, q3, #2                     \n"
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+  );
+}
+#endif //HAS_SCALEARGBROWDOWN2_NEON
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %3, lsl #2                \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.32    {d0[0]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d0[1]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d1[0]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d1[1]}, [%0], r12             \n"
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"(src_stepx)     // %3
+  : "memory", "cc", "r12", "q0"
+  );
+}
+#endif //HAS_SCALEARGBROWDOWNEVEN_NEON
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %4, lsl #2                \n"
+    "add        %1, %1, %0                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d4}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d5}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d6}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d7}, [%1], r12                \n"
+    "vaddl.u8   q0, d0, d1                     \n"
+    "vaddl.u8   q1, d2, d3                     \n"
+    "vaddl.u8   q2, d4, d5                     \n"
+    "vaddl.u8   q3, d6, d7                     \n"
+    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
+    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
+    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
+    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
+    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
+    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
+    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"(src_stepx)     // %4
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+  );
+}
+#endif  // HAS_SCALEARGBROWDOWNEVEN_NEON
+#endif  // __aarch64__
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif

diff --git a/tools_common.h b/tools_common.h
index 558413e..c1f466b 100644
--- a/tools_common.h
+++ b/tools_common.h

@@ -103,17 +103,25 @@
 extern "C" {
 #endif
 
+#if defined(__GNUC__)
+#define VPX_NO_RETURN __attribute__((noreturn))
+#else
+#define VPX_NO_RETURN
+#endif
+
 /* Sets a stdio stream into binary mode */
 FILE *set_binary_mode(FILE *stream);
 
-void die(const char *fmt, ...);
-void fatal(const char *fmt, ...);
+void die(const char *fmt, ...) VPX_NO_RETURN;
+void fatal(const char *fmt, ...) VPX_NO_RETURN;
 void warn(const char *fmt, ...);
 
-void die_codec(vpx_codec_ctx_t *ctx, const char *s);
+void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;
 
 /* The tool including this file must define usage_exit() */
-void usage_exit();
+void usage_exit() VPX_NO_RETURN;
+
+#undef VPX_NO_RETURN
 
 int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
 

diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index b05ad14..d48c4fe 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h

@@ -224,7 +224,7 @@
         int arnr_strength;
         int arnr_type;
 
-        struct vpx_fixed_buf        two_pass_stats_in;
+        vpx_fixed_buf_t        two_pass_stats_in;
         struct vpx_codec_pkt_list  *output_pkt_list;
 
         vp8e_tuning tuning;

diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 204cbf0..902a83a 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl

@@ -446,14 +446,12 @@
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon/;
 $vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
-$vp8_short_fdct4x4_neon_asm=vp8_short_fdct4x4_neon;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/;
 $vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
-$vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
 specialize qw/vp8_short_walsh4x4 sse2 media neon/;

diff --git a/vp8/encoder/arm/neon/shortfdct_neon.asm b/vp8/encoder/arm/neon/shortfdct_neon.asm
deleted file mode 100644
index 5ea8dd8..0000000
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ /dev/null

@@ -1,221 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_fdct4x4_neon|
-    EXPORT  |vp8_short_fdct8x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=4
-
-
-    ALIGN 16    ; enable use of @128 bit aligned loads
-coeff
-    DCW      5352,  5352,  5352, 5352
-    DCW      2217,  2217,  2217, 2217
-    DCD     14500, 14500, 14500, 14500
-    DCD      7500,  7500,  7500, 7500
-    DCD     12000, 12000, 12000, 12000
-    DCD     51000, 51000, 51000, 51000
-
-;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_neon| PROC
-
-    ; Part one
-    vld1.16         {d0}, [r0@64], r2
-    adr             r12, coeff
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {d2}, [r0@64], r2
-    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
-    vld1.16         {d3}, [r0@64], r2
-
-    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
-    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
-    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
-    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
-    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
-
-    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
-    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
-    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12
-
-
-    ; Part two
-
-    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vmov.s16        d26, #7
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
-    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
-    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
-    vadd.s16        d4, d4, d26     ; a1 + 7
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
-    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
-
-    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
-    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
-
-    vceq.s16        d4, d7, #0
-
-    vshr.s16        d0, d0, #4
-    vshr.s16        d2, d2, #4
-
-    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
-
-    vmvn            d4, d4
-    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
-    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct8x4_neon| PROC
-
-    ; Part one
-
-    vld1.16         {q0}, [r0@128], r2
-    adr             r12, coeff
-    vld1.16         {q1}, [r0@128], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {q2}, [r0@128], r2
-    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
-    vld1.16         {q3}, [r0@128], r2
-
-    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
-    vtrn.32         q0, q2          ; [A0|B0]
-    vtrn.32         q1, q3          ; [A1|B1]
-    vtrn.16         q0, q1          ; [A2|B2]
-    vtrn.16         q2, q3          ; [A3|B3]
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
-    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
-    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q11, q11, #3    ; a1 << 3
-    vshl.s16        q12, q12, #3    ; b1 << 3
-    vshl.s16        q13, q13, #3    ; c1 << 3
-    vshl.s16        q14, q14, #3    ; d1 << 3
-
-    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
-    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
-
-    vmov.s16        q11, q9         ; 14500
-    vmov.s16        q12, q10        ; 7500
-
-    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
-    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
-    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
-    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
-
-    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
-    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
-    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12
-
-
-    ; Part two
-    vld1.32         {q9,q10}, [r12@128]    ; q9=12000, q10=51000
-
-    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
-    vtrn.32         q0, q2          ; q0=[A0 | B0]
-    vtrn.32         q1, q3          ; q1=[A4 | B4]
-    vtrn.16         q0, q1          ; q2=[A8 | B8]
-    vtrn.16         q2, q3          ; q3=[A12|B12]
-
-    vmov.s16        q15, #7
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
-    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
-    vadd.s16        q11, q11, q15   ; a1 + 7
-    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
-
-    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
-    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
-
-    vmov.s16        q11, q9         ; 12000
-    vmov.s16        q12, q10        ; 51000
-
-    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d2, d2, #4      ; A[8] = (a1 + b1 + 7)>>4
-    vshr.s16        d6, d3, #4      ; B[8] = (a1 + b1 + 7)>>4
-
-
-    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
-    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
-    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
-    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
-
-    vceq.s16        q14, q14, #0
-
-    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
-    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
-
-    vmvn            q14, q14
-
-    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)
-
-    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)
-
-    vst1.16         {q0, q1}, [r1@128]! ; block A
-    vst1.16         {q2, q3}, [r1@128]! ; block B
-
-    bx              lr
-
-    ENDP
-
-    END
-

diff --git a/vp8/encoder/arm/neon/shortfdct_neon.c b/vp8/encoder/arm/neon/shortfdct_neon.c
new file mode 100644
index 0000000..391e5f9
--- /dev/null
+++ b/vp8/encoder/arm/neon/shortfdct_neon.c

@@ -0,0 +1,269 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_fdct4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, dEmptys16;
+    uint16x4_t d4u16;
+    int16x8_t q0s16, q1s16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+    q11s32 = vdupq_n_s32(12000);
+    q12s32 = vdupq_n_s32(51000);
+
+    // Part one
+    pitch >>= 1;
+    d0s16 = vld1_s16(input);
+    input += pitch;
+    d1s16 = vld1_s16(input);
+    input += pitch;
+    d2s16 = vld1_s16(input);
+    input += pitch;
+    d3s16 = vld1_s16(input);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d4s16 = vshl_n_s16(d4s16, 3);
+    d5s16 = vshl_n_s16(d5s16, 3);
+    d6s16 = vshl_n_s16(d6s16, 3);
+    d7s16 = vshl_n_s16(d7s16, 3);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
+    q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 12);
+    d3s16 = vshrn_n_s32(q10s32, 12);
+
+    // Part two
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d26s16 = vdup_n_s16(7);
+    d4s16 = vadd_s16(d4s16, d26s16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
+
+    dEmptys16 = vdup_n_s16(0);
+    d4u16 = vceq_s16(d7s16, dEmptys16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+
+    q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
+
+    d4u16 = vmvn_u16(d4u16);
+    d1s16 = vshrn_n_s32(q11s32, 16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
+    d3s16 = vshrn_n_s32(q12s32, 16);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
+
+void vp8_short_fdct8x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
+    uint16x4_t d28u16, d29u16;
+    uint16x8_t q14u16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16;
+    int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x8x2_t v2tmp0, v2tmp1;
+    int32x4x2_t v2tmp2, v2tmp3;
+
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+
+    // Part one
+    pitch >>= 1;
+    q0s16 = vld1q_s16(input);
+    input += pitch;
+    q1s16 = vld1q_s16(input);
+    input += pitch;
+    q2s16 = vld1q_s16(input);
+    input += pitch;
+    q3s16 = vld1q_s16(input);
+
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    q11s16 = vshlq_n_s16(q11s16, 3);
+    q12s16 = vshlq_n_s16(q12s16, 3);
+    q13s16 = vshlq_n_s16(q13s16, 3);
+    q14s16 = vshlq_n_s16(q14s16, 3);
+
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q2s16 = vsubq_s16(q11s16, q12s16);
+
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    d2s16 = vshrn_n_s32(q9s32, 12);
+    d6s16 = vshrn_n_s32(q10s32, 12);
+    d3s16 = vshrn_n_s32(q11s32, 12);
+    d7s16 = vshrn_n_s32(q12s32, 12);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    // Part two
+    q9s32 = vdupq_n_s32(12000);
+    q10s32 = vdupq_n_s32(51000);
+
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    q15s16 = vdupq_n_s16(7);
+    q11s16 = vaddq_s16(q11s16, q15s16);
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q1s16 = vsubq_s16(q11s16, q12s16);
+
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d0s16 = vget_low_s16(q0s16);
+    d1s16 = vget_high_s16(q0s16);
+    d2s16 = vget_low_s16(q1s16);
+    d3s16 = vget_high_s16(q1s16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d4s16 = vshr_n_s16(d1s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+    d6s16 = vshr_n_s16(d3s16, 4);
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 16);
+    d3s16 = vshrn_n_s32(q10s32, 16);
+    d5s16 = vshrn_n_s32(q11s32, 16);
+    d7s16 = vshrn_n_s32(q12s32, 16);
+
+    qEmptys16 = vdupq_n_s16(0);
+    q14u16 = vceqq_s16(q14s16, qEmptys16);
+    q14u16 = vmvnq_u16(q14u16);
+
+    d28u16 = vget_low_u16(q14u16);
+    d29u16 = vget_high_u16(q14u16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
+    d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    vst1q_s16(output + 16, q2s16);
+    vst1q_s16(output + 24, q3s16);
+    return;
+}

diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 0c522bd..1654cd8 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c

@@ -39,40 +39,28 @@
 
 };
 
-struct extraconfig_map
-{
-    int                 usage;
-    struct vp8_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] =
-{
-    {
-        0,
-        {
-            NULL,
+static struct vp8_extracfg default_extracfg = {
+  NULL,
 #if !(CONFIG_REALTIME_ONLY)
-            0,                          /* cpu_used      */
+  0,                          /* cpu_used      */
 #else
-            4,                          /* cpu_used      */
+  4,                          /* cpu_used      */
 #endif
-            0,                          /* enable_auto_alt_ref */
-            0,                          /* noise_sensitivity */
-            0,                          /* Sharpness */
-            0,                          /* static_thresh */
+  0,                          /* enable_auto_alt_ref */
+  0,                          /* noise_sensitivity */
+  0,                          /* Sharpness */
+  0,                          /* static_thresh */
 #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
-            VP8_EIGHT_TOKENPARTITION,
+  VP8_EIGHT_TOKENPARTITION,
 #else
-            VP8_ONE_TOKENPARTITION,     /* token_partitions */
+  VP8_ONE_TOKENPARTITION,     /* token_partitions */
 #endif
-            0,                          /* arnr_max_frames */
-            3,                          /* arnr_strength */
-            3,                          /* arnr_type*/
-            0,                          /* tuning*/
-            10,                         /* cq_level */
-            0,                          /* rc_max_intra_bitrate_pct */
-        }
-    }
+  0,                          /* arnr_max_frames */
+  3,                          /* arnr_strength */
+  3,                          /* arnr_type*/
+  0,                          /* tuning*/
+  10,                         /* cq_level */
+  0,                          /* rc_max_intra_bitrate_pct */
 };
 
 struct vpx_codec_alg_priv
@@ -632,9 +620,6 @@
 {
     vpx_codec_err_t        res = VPX_CODEC_OK;
     struct vpx_codec_alg_priv *priv;
-    vpx_codec_enc_cfg_t       *cfg;
-    unsigned int               i;
-
     struct VP8_COMP *optr;
 
     vp8_rtcd();
@@ -650,7 +635,6 @@
 
         ctx->priv = &priv->base;
         ctx->priv->sz = sizeof(*ctx->priv);
-        ctx->priv->iface = ctx->iface;
         ctx->priv->alg_priv = priv;
         ctx->priv->init_flags = ctx->init_flags;
 
@@ -663,17 +647,8 @@
             ctx->config.enc = &ctx->priv->alg_priv->cfg;
         }
 
-        cfg =  &ctx->priv->alg_priv->cfg;
 
-        /* Select the extra vp8 configuration table based on the current
-         * usage value. If the current usage value isn't found, use the
-         * values for usage case 0.
-         */
-        for (i = 0;
-             extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-             i++);
-
-        priv->vp8_cfg = extracfg_map[i].cfg;
+        priv->vp8_cfg = default_extracfg;
         priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
 
         priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
@@ -1347,10 +1322,10 @@
     vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
     vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
     {
-        NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
-        NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
-        NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
-        NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
+        NULL,    /* vpx_codec_peek_si_fn_t    peek_si; */
+        NULL,    /* vpx_codec_get_si_fn_t     get_si; */
+        NULL,    /* vpx_codec_decode_fn_t     decode; */
+        NULL,    /* vpx_codec_frame_get_fn_t  frame_get; */
     },
     {
         1,                  /* 1 cfg map */
@@ -1358,7 +1333,7 @@
         vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
         vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
         vp8e_set_config,
-        NOT_IMPLEMENTED,
+        NULL,
         vp8e_get_preview,
         vp8e_mr_alloc_mem,
     } /* encoder functions */

diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index c76ac14..05f32ba 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c

@@ -84,7 +84,6 @@
         (vpx_codec_priv_t *)vpx_memalign(8, sizeof(vpx_codec_alg_priv_t));
     vpx_memset(ctx->priv, 0, sizeof(vpx_codec_alg_priv_t));
     ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
     ctx->priv->alg_priv = (vpx_codec_alg_priv_t *)ctx->priv;
     ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
     ctx->priv->alg_priv->decrypt_cb = NULL;
@@ -815,15 +814,15 @@
         vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
         vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
         vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
-        NOT_IMPLEMENTED,
+        NULL,
     },
     { /* encoder functions */
         0,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED,
-        NOT_IMPLEMENTED
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL
     }
 };

diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 0b3eed0..eb94202 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk

@@ -37,10 +37,10 @@
 # encoder
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/shortfdct_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
 
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/subtract_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon.c

diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h
index 3eb7f9d..5d89da8 100644
--- a/vp9/common/vp9_mv.h
+++ b/vp9/common/vp9_mv.h

@@ -34,6 +34,14 @@
   int32_t col;
 } MV32;
 
+static INLINE int is_zero_mv(const MV *mv) {
+  return *((const uint32_t *)mv) == 0;
+}
+
+static INLINE int is_equal_mv(const MV *a, const MV *b) {
+  return  *((const uint32_t *)a) == *((const uint32_t *)b);
+}
+
 static INLINE void clamp_mv(MV *mv, int min_col, int max_col,
                             int min_row, int max_row) {
   mv->col = clamp(mv->col, min_col, max_col);

diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index abda4e6..e4e6ce7 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c

@@ -366,6 +366,9 @@
                            unsigned int width, unsigned int height, int pitch) {
   unsigned int i, j;
 
+  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
+  // fix..
+  (void) bothclamp;
   for (i = 0; i < height; i++) {
     uint8_t *pos = start + i * pitch;
     char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 708f41b..92f9318 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -305,15 +305,15 @@
 $vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
 
 add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/;
+specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2 avx2/;
 $vp9_convolve8_neon_asm=vp9_convolve8_neon;
 
 add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2 avx2/;
 $vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
 
 add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2 avx2/;
 $vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
 
 add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
index d109e13..3bc7d39 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c

@@ -307,7 +307,7 @@
   __m256i addFilterReg64;
   __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
   __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
-  __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
+  __m256i srcReg32b11, srcReg32b12, filtersReg32;
   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
   unsigned int i;
   unsigned int src_stride, dst_stride;
@@ -409,35 +409,35 @@
      // multiply 2 adjacent elements with the filter and add the result
      srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
      srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
-     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
 
      // add and saturate the results together
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);
-
 
      // multiply 2 adjacent elements with the filter and add the result
      srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
-     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
-
-     // multiply 2 adjacent elements with the filter and add the result
      srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-     srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
 
      // add and saturate the results together
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
-     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                  _mm256_min_epi16(srcReg32b6, srcReg32b13));
-
-     // add and saturate the results together
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                    _mm256_max_epi16(srcReg32b8, srcReg32b12));
-     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                  _mm256_max_epi16(srcReg32b6, srcReg32b13));
 
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+     // add and saturate the results together
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_max_epi16(srcReg32b8, srcReg32b12));
 
      srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
      srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index a0fff45..572ab0e 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c

@@ -627,11 +627,12 @@
                        "Width and height beyond allowed size.");
 #endif
   if (cm->width != width || cm->height != height) {
-    const int new_rows = ALIGN_POWER_OF_TWO(height,
-                                            MI_SIZE_LOG2) >> MI_SIZE_LOG2;
-    const int new_cols = ALIGN_POWER_OF_TWO(width,
-                                            MI_SIZE_LOG2) >> MI_SIZE_LOG2;
-    if (calc_mi_size(new_rows) * calc_mi_size(new_cols) > cm->mi_alloc_size) {
+    const int new_mi_rows =
+        calc_mi_size(ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2);
+    const int new_mi_cols =
+        calc_mi_size(ALIGN_POWER_OF_TWO(width,  MI_SIZE_LOG2) >> MI_SIZE_LOG2);
+    if (new_mi_cols > cm->mi_stride ||
+        (new_mi_rows * new_mi_cols > cm->mi_alloc_size)) {
       if (vp9_alloc_context_buffers(cm, width, height))
         vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate context buffers");

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index bdb1338..8ca05c1 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -1083,7 +1083,16 @@
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
   } else {
-    if (!cm->show_frame)
+    // In spatial svc if it's not error_resilient_mode then we need to code all
+    // visible frames as invisible. But we need to keep the show_frame flag so
+    // that the publisher could know whether it is supposed to be visible.
+    // So we will code the show_frame flag as it is. Then code the intra_only
+    // bit here. This will make the bitstream incompatible. In the player we
+    // will change to show_frame flag to 0, then add an one byte frame with
+    // show_existing_frame flag which tells the decoder which frame we want to
+    // show.
+    if (!cm->show_frame ||
+        (is_spatial_svc(cpi) && cm->error_resilient_mode == 0))
       vp9_wb_write_bit(wb, cm->intra_only);
 
     if (!cm->error_resilient_mode)

diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index bd3b0fd..46f463a 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h

@@ -76,16 +76,12 @@
   int pred_mv_sad[MAX_REF_FRAMES];
 
   int nmvjointcost[MV_JOINTS];
-  int nmvcosts[2][MV_VALS];
   int *nmvcost[2];
-  int nmvcosts_hp[2][MV_VALS];
   int *nmvcost_hp[2];
   int **mvcost;
 
   int nmvjointsadcost[MV_JOINTS];
-  int nmvsadcosts[2][MV_VALS];
   int *nmvsadcost[2];
-  int nmvsadcosts_hp[2][MV_VALS];
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 

diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 90ea9cc..793a9da 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c

@@ -370,8 +370,8 @@
   ctx->newmv_sse = UINT_MAX;
 }
 
-void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
-                                     unsigned int sse, PREDICTION_MODE mode,
+void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
+                                     PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx) {
   // TODO(tkopp): Use both MVs if possible
   if (mbmi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {

diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index d93846f..8a91492 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h

@@ -42,7 +42,7 @@
 
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
 
-void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
+void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi,
                                      unsigned int sse, PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx);
 

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 711354b..950a6c8 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -1232,30 +1232,23 @@
   }
 }
 
-static int is_background(VP9_COMP *cpi, const TileInfo *const tile,
+static int is_background(const VP9_COMP *cpi, const TileInfo *const tile,
                          int mi_row, int mi_col) {
-  MACROBLOCK *x = &cpi->mb;
-  uint8_t *src, *pre;
-  int src_stride, pre_stride;
-
+  // This assumes the input source frames are of the same dimension.
   const int row8x8_remaining = tile->mi_row_end - mi_row;
   const int col8x8_remaining = tile->mi_col_end - mi_col;
-
+  const int x = mi_col * MI_SIZE;
+  const int y = mi_row * MI_SIZE;
+  const int src_stride = cpi->Source->y_stride;
+  const uint8_t *const src = &cpi->Source->y_buffer[y * src_stride + x];
+  const int pre_stride = cpi->Last_Source->y_stride;
+  const uint8_t *const pre = &cpi->Last_Source->y_buffer[y * pre_stride + x];
   int this_sad = 0;
   int threshold = 0;
 
-  // This assumes the input source frames are of the same dimension.
-  src_stride = cpi->Source->y_stride;
-  src = cpi->Source->y_buffer + (mi_row * MI_SIZE) * src_stride +
-            (mi_col * MI_SIZE);
-  pre_stride = cpi->Last_Source->y_stride;
-  pre = cpi->Last_Source->y_buffer + (mi_row * MI_SIZE) * pre_stride +
-          (mi_col * MI_SIZE);
-
   if (row8x8_remaining >= MI_BLOCK_SIZE &&
       col8x8_remaining >= MI_BLOCK_SIZE) {
-    this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride,
-                                            pre, pre_stride);
+    this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, pre, pre_stride);
     threshold = (1 << 12);
   } else {
     int r, c;
@@ -1266,8 +1259,7 @@
     threshold = (row8x8_remaining * col8x8_remaining) << 6;
   }
 
-  x->in_static_area = (this_sad < 2 * threshold);
-  return x->in_static_area;
+  return this_sad < 2 * threshold;
 }
 
 static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8,
@@ -3119,7 +3111,7 @@
         break;
       case REFERENCE_PARTITION:
         if (sf->partition_check ||
-            !is_background(cpi, tile, mi_row, mi_col)) {
+            !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) {
           set_modeinfo_offsets(cm, xd, mi_row, mi_col);
           auto_partition_range(cpi, tile, mi_row, mi_col,
                                &sf->min_partition_size,
@@ -3297,7 +3289,6 @@
 
   vp9_zero(cm->counts);
   vp9_zero(cpi->coef_counts);
-  vp9_zero(cpi->tx_stepdown_count);
   vp9_zero(rd_opt->comp_pred_diff);
   vp9_zero(rd_opt->filter_diff);
   vp9_zero(rd_opt->tx_select_diff);

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index ee0b99c..3210ab5 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c

@@ -167,6 +167,26 @@
   vpx_free(cpi->complexity_map);
   cpi->complexity_map = NULL;
 
+  vpx_free(cpi->nmvcosts[0]);
+  vpx_free(cpi->nmvcosts[1]);
+  cpi->nmvcosts[0] = NULL;
+  cpi->nmvcosts[1] = NULL;
+
+  vpx_free(cpi->nmvcosts_hp[0]);
+  vpx_free(cpi->nmvcosts_hp[1]);
+  cpi->nmvcosts_hp[0] = NULL;
+  cpi->nmvcosts_hp[1] = NULL;
+
+  vpx_free(cpi->nmvsadcosts[0]);
+  vpx_free(cpi->nmvsadcosts[1]);
+  cpi->nmvsadcosts[0] = NULL;
+  cpi->nmvsadcosts[1] = NULL;
+
+  vpx_free(cpi->nmvsadcosts_hp[0]);
+  vpx_free(cpi->nmvsadcosts_hp[1]);
+  cpi->nmvsadcosts_hp[0] = NULL;
+  cpi->nmvsadcosts_hp[1] = NULL;
+
   vp9_cyclic_refresh_free(cpi->cyclic_refresh);
   cpi->cyclic_refresh = NULL;
 
@@ -212,8 +232,15 @@
   // intended for use in a re-code loop in vp9_compress_frame where the
   // quantizer value is adjusted between loop iterations.
   vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
-  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
-  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
+
+  vpx_memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
+             MV_VALS * sizeof(*cpi->nmvcosts[0]));
+  vpx_memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
+             MV_VALS * sizeof(*cpi->nmvcosts[1]));
+  vpx_memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
+             MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
+  vpx_memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
+             MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
 
   vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
 
@@ -233,8 +260,15 @@
   // Restore key state variables to the snapshot state stored in the
   // previous call to vp9_save_coding_context.
   vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
-  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
-  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
+
+  vpx_memcpy(cpi->nmvcosts[0], cc->nmvcosts[0],
+             MV_VALS * sizeof(*cc->nmvcosts[0]));
+  vpx_memcpy(cpi->nmvcosts[1], cc->nmvcosts[1],
+             MV_VALS * sizeof(*cc->nmvcosts[1]));
+  vpx_memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
+             MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
+  vpx_memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
+             MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
 
   vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
 
@@ -715,11 +749,6 @@
   vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
 
   cm->current_video_frame = 0;
-
-  cpi->gold_is_last = 0;
-  cpi->alt_is_last = 0;
-  cpi->gold_is_alt = 0;
-
   cpi->skippable_frame = 0;
 
   // Create the encoder segmentation map and set all entries to 0
@@ -739,6 +768,23 @@
   CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
                    sizeof(cpi->mbgraph_stats[0])); i++) {
     CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
@@ -819,16 +865,16 @@
   cpi->first_time_stamp_ever = INT64_MAX;
 
   cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
-  cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
-  cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
-  cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
-  cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
+  cpi->mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+  cpi->mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+  cpi->mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+  cpi->mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
   cal_nmvsadcosts(cpi->mb.nmvsadcost);
 
-  cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
-  cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
-  cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
-  cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
+  cpi->mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+  cpi->mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+  cpi->mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+  cpi->mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
   cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
@@ -845,8 +891,6 @@
   kf_list = fopen("kf_list.stt", "w");
 #endif
 
-  cpi->output_pkt_list = oxcf->output_pkt_list;
-
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
 
   if (oxcf->pass == 1) {
@@ -1900,36 +1944,27 @@
   } while (loop);
 }
 
-static void get_ref_frame_flags(VP9_COMP *cpi) {
-  if (cpi->refresh_last_frame & cpi->refresh_golden_frame)
-    cpi->gold_is_last = 1;
-  else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame)
-    cpi->gold_is_last = 0;
+static int get_ref_frame_flags(const VP9_COMP *cpi) {
+  const int *const map = cpi->common.ref_frame_map;
+  const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+  const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
-  if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame)
-    cpi->alt_is_last = 1;
-  else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame)
-    cpi->alt_is_last = 0;
-
-  if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame)
-    cpi->gold_is_alt = 1;
-  else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame)
-    cpi->gold_is_alt = 0;
-
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  if (cpi->gold_is_last)
-    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+  if (gold_is_last)
+    flags &= ~VP9_GOLD_FLAG;
 
   if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
       !is_spatial_svc(cpi))
-    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+    flags &= ~VP9_GOLD_FLAG;
 
-  if (cpi->alt_is_last)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+  if (alt_is_last)
+    flags &= ~VP9_ALT_FLAG;
 
-  if (cpi->gold_is_alt)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+  if (gold_is_alt)
+    flags &= ~VP9_ALT_FLAG;
+
+  return flags;
 }
 
 static void set_ext_overrides(VP9_COMP *cpi) {
@@ -2027,6 +2062,7 @@
                                       uint8_t *dest,
                                       unsigned int *frame_flags) {
   VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
   TX_SIZE t;
   int q;
@@ -2074,9 +2110,7 @@
     // The alternate reference frame cannot be active for a key frame.
     cpi->rc.source_alt_ref_active = 0;
 
-    cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
-    cm->frame_parallel_decoding_mode =
-      (cpi->oxcf.frame_parallel_decoding_mode != 0);
+    cm->error_resilient_mode = oxcf->error_resilient_mode;
 
     // By default, encoder assumes decoder can use prev_mi.
     if (cm->error_resilient_mode) {
@@ -2084,29 +2118,43 @@
       cm->reset_frame_context = 0;
       cm->refresh_frame_context = 0;
     } else if (cm->intra_only) {
+      cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
       // Only reset the current context.
       cm->reset_frame_context = 2;
     }
   }
+  if (is_spatial_svc(cpi) && cm->error_resilient_mode == 0) {
+    cm->frame_context_idx = cpi->svc.spatial_layer_id;
+
+    // The probs will be updated based on the frame type of its previous
+    // frame if frame_parallel_decoding_mode is 0. The type may vary for
+    // the frame after a key frame in base layer since we may drop enhancement
+    // layers. So set frame_parallel_decoding_mode to 1 in this case.
+    if (cpi->svc.spatial_layer_id == 0 &&
+        cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
+      cm->frame_parallel_decoding_mode = 1;
+    else
+      cm->frame_parallel_decoding_mode = 0;
+  }
 
   // Configure experimental use of segmentation for enhanced coding of
   // static regions if indicated.
   // Only allowed in second pass of two pass (as requires lagged coding)
   // and if the relevant speed feature flag is set.
-  if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation)
+  if (oxcf->pass == 2 && cpi->sf.static_segmentation)
     configure_static_seg_features(cpi);
 
   // Check if the current frame is skippable for the partition search in the
   // second pass according to the first pass stats
-  if (cpi->oxcf.pass == 2 &&
+  if (oxcf->pass == 2 &&
       (!cpi->use_svc || is_spatial_svc(cpi))) {
     configure_skippable_frame(cpi);
   }
 
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
-  if (cpi->oxcf.pass == 0 &&
-      cpi->oxcf.rc_mode == VPX_CBR &&
+  if (oxcf->pass == 0 &&
+      oxcf->rc_mode == VPX_CBR &&
       cm->frame_type != KEY_FRAME) {
     if (vp9_rc_drop_frame(cpi)) {
       vp9_rc_postencode_update_drop_frame(cpi);
@@ -2118,9 +2166,9 @@
   vp9_clear_system_state();
 
 #if CONFIG_VP9_POSTPROC
-  if (cpi->oxcf.noise_sensitivity > 0) {
+  if (oxcf->noise_sensitivity > 0) {
     int l = 0;
-    switch (cpi->oxcf.noise_sensitivity) {
+    switch (oxcf->noise_sensitivity) {
       case 1:
         l = 20;
         break;
@@ -2161,7 +2209,7 @@
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
-  if (cpi->oxcf.noise_sensitivity > 0) {
+  if (oxcf->noise_sensitivity > 0) {
     vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
                             yuv_denoised_file);
   }
@@ -2236,7 +2284,7 @@
   else
     cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
 
-  get_ref_frame_flags(cpi);
+  cpi->ref_frame_flags = get_ref_frame_flags(cpi);
 
   cm->last_frame_type = cm->frame_type;
   vp9_rc_postencode_update(cpi, *size);
@@ -2263,8 +2311,12 @@
   cm->last_height = cm->height;
 
   // reset to normal state now that we are done.
-  if (!cm->show_existing_frame)
-    cm->last_show_frame = cm->show_frame;
+  if (!cm->show_existing_frame) {
+    if (is_spatial_svc(cpi) && cm->error_resilient_mode == 0)
+      cm->last_show_frame = 0;
+    else
+      cm->last_show_frame = cm->show_frame;
+  }
 
   if (cm->show_frame) {
     vp9_swap_mi_and_prev_mi(cm);
@@ -2275,6 +2327,10 @@
     if (cpi->use_svc)
       vp9_inc_frame_in_layer(&cpi->svc);
   }
+
+  if (is_spatial_svc(cpi))
+    cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type =
+        cm->frame_type;
 }
 
 static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
@@ -2389,8 +2445,8 @@
          cm->seg.update_data;
 }
 
-void adjust_frame_rate(VP9_COMP *cpi) {
-  const struct lookahead_entry *const source = cpi->source;
+void adjust_frame_rate(VP9_COMP *cpi,
+                       const struct lookahead_entry *source) {
   int64_t this_duration;
   int step = 0;
 
@@ -2446,7 +2502,8 @@
   return arf_src_index;
 }
 
-static void check_src_altref(VP9_COMP *cpi) {
+static void check_src_altref(VP9_COMP *cpi,
+                             const struct lookahead_entry *source) {
   RATE_CONTROL *const rc = &cpi->rc;
 
   if (cpi->oxcf.pass == 2) {
@@ -2455,7 +2512,7 @@
       (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
   } else {
     rc->is_src_frame_alt_ref = cpi->alt_ref_source &&
-                               (cpi->source == cpi->alt_ref_source);
+                               (source == cpi->alt_ref_source);
   }
 
   if (rc->is_src_frame_alt_ref) {
@@ -2477,6 +2534,8 @@
   RATE_CONTROL *const rc = &cpi->rc;
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+  struct lookahead_entry *last_source = NULL;
+  struct lookahead_entry *source = NULL;
   MV_REFERENCE_FRAME ref_frame;
   int arf_src_index;
 
@@ -2489,9 +2548,6 @@
 
   vpx_usec_timer_start(&cmptimer);
 
-  cpi->source = NULL;
-  cpi->last_source = NULL;
-
   vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
 
   // Normal defaults
@@ -2508,13 +2564,12 @@
 
 #if CONFIG_SPATIAL_SVC
     if (is_spatial_svc(cpi))
-      cpi->source = vp9_svc_lookahead_peek(cpi, cpi->lookahead,
-                                           arf_src_index, 0);
+      source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, arf_src_index, 0);
     else
 #endif
-      cpi->source = vp9_lookahead_peek(cpi->lookahead, arf_src_index);
-    if (cpi->source != NULL) {
-      cpi->alt_ref_source = cpi->source;
+      source = vp9_lookahead_peek(cpi->lookahead, arf_src_index);
+    if (source != NULL) {
+      cpi->alt_ref_source = source;
 
 #if CONFIG_SPATIAL_SVC
       if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
@@ -2548,46 +2603,44 @@
     }
   }
 
-  if (!cpi->source) {
+  if (!source) {
     // Get last frame source.
     if (cm->current_video_frame > 0) {
 #if CONFIG_SPATIAL_SVC
       if (is_spatial_svc(cpi))
-        cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0);
+        last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0);
       else
 #endif
-        cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1);
-      if (cpi->last_source == NULL)
+        last_source = vp9_lookahead_peek(cpi->lookahead, -1);
+      if (last_source == NULL)
         return -1;
     }
 
     // Read in the source frame.
 #if CONFIG_SPATIAL_SVC
     if (is_spatial_svc(cpi))
-      cpi->source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
+      source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
     else
 #endif
-      cpi->source = vp9_lookahead_pop(cpi->lookahead, flush);
-    if (cpi->source != NULL) {
+      source = vp9_lookahead_pop(cpi->lookahead, flush);
+    if (source != NULL) {
       cm->show_frame = 1;
       cm->intra_only = 0;
 
       // Check to see if the frame should be encoded as an arf overlay.
-      check_src_altref(cpi);
+      check_src_altref(cpi, source);
     }
   }
 
-  if (cpi->source) {
+  if (source) {
     cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
-                                                           : &cpi->source->img;
+                                                           : &source->img;
 
-    cpi->unscaled_last_source = cpi->last_source != NULL ?
-                                    &cpi->last_source->img : NULL;
+    cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
 
-    *time_stamp = cpi->source->ts_start;
-    *time_end = cpi->source->ts_end;
-    *frame_flags =
-        (cpi->source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+    *time_stamp = source->ts_start;
+    *time_end = source->ts_end;
+    *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
   } else {
     *size = 0;
@@ -2598,9 +2651,9 @@
     return -1;
   }
 
-  if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
-    cpi->first_time_stamp_ever = cpi->source->ts_start;
-    cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+  if (source->ts_start < cpi->first_time_stamp_ever) {
+    cpi->first_time_stamp_ever = source->ts_start;
+    cpi->last_end_time_stamp_seen = source->ts_start;
   }
 
   // Clear down mmx registers
@@ -2608,7 +2661,7 @@
 
   // adjust frame rates based on timestamps given
   if (cm->show_frame) {
-    adjust_frame_rate(cpi);
+    adjust_frame_rate(cpi, source);
   }
 
   if (cpi->svc.number_temporal_layers > 1 &&
@@ -2680,7 +2733,7 @@
     const int lossless = is_lossless_requested(oxcf);
     cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
     cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
-    vp9_first_pass(cpi);
+    vp9_first_pass(cpi, source);
   } else if (oxcf->pass == 2 &&
       (!cpi->use_svc || is_spatial_svc(cpi))) {
     Pass2Encode(cpi, size, dest, frame_flags);
@@ -2758,12 +2811,12 @@
           cpi->totalp_sq_error += psnr2.sse[0];
           cpi->totalp_samples += psnr2.samples[0];
 
-          frame_ssim2 = vp9_calc_ssim(orig, recon, 1, &weight);
+          frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
 
           cpi->summed_quality += frame_ssim2 * weight;
           cpi->summed_weights += weight;
 
-          frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, 1, &weight);
+          frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
 
           cpi->summedp_quality += frame_ssim2 * weight;
           cpi->summedp_weights += weight;

diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 7226a78..b1b12a1 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h

@@ -208,11 +208,11 @@
   int tile_columns;
   int tile_rows;
 
-  struct vpx_fixed_buf         two_pass_stats_in;
-  struct vpx_codec_pkt_list  *output_pkt_list;
+  vpx_fixed_buf_t two_pass_stats_in;
+  struct vpx_codec_pkt_list *output_pkt_list;
 
 #if CONFIG_FP_MB_STATS
-  struct vpx_fixed_buf         firstpass_mb_stats_in;
+  vpx_fixed_buf_t firstpass_mb_stats_in;
 #endif
 
   vp8e_tuning tuning;
@@ -223,19 +223,13 @@
   return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
 }
 
-static INLINE int is_best_mode(MODE mode) {
-  return mode == BEST;
-}
-
 typedef struct VP9_COMP {
   QUANTS quants;
   MACROBLOCK mb;
   VP9_COMMON common;
   VP9EncoderConfig oxcf;
   struct lookahead_ctx    *lookahead;
-  struct lookahead_entry  *source;
   struct lookahead_entry  *alt_ref_source;
-  struct lookahead_entry  *last_source;
 
   YV12_BUFFER_CONFIG *Source;
   YV12_BUFFER_CONFIG *Last_Source;  // NULL for first frame and alt_ref frames
@@ -244,10 +238,6 @@
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
-  int gold_is_last;  // gold same as last frame ( short circuit gold searches)
-  int alt_is_last;  // Alt same as last ( short circuit altref search)
-  int gold_is_alt;  // don't do both alt and gold search ( just do gold).
-
   int skippable_frame;
 
   int scaled_ref_idx[3];
@@ -279,6 +269,11 @@
 
   CODING_CONTEXT coding_context;
 
+  int *nmvcosts[2];
+  int *nmvcosts_hp[2];
+  int *nmvsadcosts[2];
+  int *nmvsadcosts_hp[2];
+
   int zbin_mode_boost;
   int zbin_mode_boost_enabled;
 
@@ -377,8 +372,6 @@
 
   int dummy_packing;    /* flag to indicate if packing is dummy */
 
-  unsigned int tx_stepdown_count[TX_SIZES];
-
   int initial_width;
   int initial_height;
 

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 94bbe9c..7867dc2 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -396,7 +396,7 @@
   cpi->rc.frames_to_key = INT_MAX;
 }
 
-void vp9_first_pass(VP9_COMP *cpi) {
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
@@ -428,7 +428,7 @@
   int neutral_count = 0;
   int new_mv_count = 0;
   int sum_in_vectors = 0;
-  uint32_t lastmv_as_int = 0;
+  MV lastmv = {0, 0};
   TWO_PASS *twopass = &cpi->twopass;
   const MV zero_mv = {0, 0};
   const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
@@ -512,9 +512,7 @@
   vp9_tile_init(&tile, cm, 0, 0);
 
   for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
-    int_mv best_ref_mv;
-
-    best_ref_mv.as_int = 0;
+    MV best_ref_mv = {0, 0};
 
     // Reset above block coeffs.
     xd->up_available = (mb_row != 0);
@@ -594,14 +592,13 @@
       // Other than for the first frame do a motion search.
       if (cm->current_video_frame > 0) {
         int tmp_err, motion_error, raw_motion_error;
-        int_mv mv, tmp_mv;
+        // Assume 0,0 motion with no mv overhead.
+        MV mv = {0, 0} , tmp_mv = {0, 0};
         struct buf_2d unscaled_last_source_buf_2d;
 
         xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
         motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                             &xd->plane[0].pre[0]);
-        // Assume 0,0 motion with no mv overhead.
-        mv.as_int = tmp_mv.as_int = 0;
 
         // Compute the motion error of the 0,0 motion using the last source
         // frame as the reference. Skip the further motion search on
@@ -617,8 +614,7 @@
         if (raw_motion_error > 25 || lc != NULL) {
           // Test last reference frame using the previous best mv as the
           // starting point (best reference) for the search.
-          first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
-                                   &motion_error);
+          first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
             vp9_clear_system_state();
             motion_error = (int)(motion_error * error_weight);
@@ -626,9 +622,9 @@
 
           // If the current best reference mv is not centered on 0,0 then do a
           // 0,0 based search as well.
-          if (best_ref_mv.as_int) {
+          if (!is_zero_mv(&best_ref_mv)) {
             tmp_err = INT_MAX;
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err);
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
             if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
               vp9_clear_system_state();
               tmp_err = (int)(tmp_err * error_weight);
@@ -636,7 +632,7 @@
 
             if (tmp_err < motion_error) {
               motion_error = tmp_err;
-              mv.as_int = tmp_mv.as_int;
+              mv = tmp_mv;
             }
           }
 
@@ -649,7 +645,7 @@
             gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                                    &xd->plane[0].pre[0]);
 
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
                                      &gf_motion_error);
             if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
               vp9_clear_system_state();
@@ -680,7 +676,8 @@
         }
 
         // Start by assuming that intra mode is best.
-        best_ref_mv.as_int = 0;
+        best_ref_mv.row = 0;
+        best_ref_mv.col = 0;
 
 #if CONFIG_FP_MB_STATS
         if (cpi->use_fp_mb_stats) {
@@ -704,25 +701,25 @@
               this_error < 2 * intrapenalty)
             ++neutral_count;
 
-          mv.as_mv.row *= 8;
-          mv.as_mv.col *= 8;
+          mv.row *= 8;
+          mv.col *= 8;
           this_error = motion_error;
           xd->mi[0]->mbmi.mode = NEWMV;
-          xd->mi[0]->mbmi.mv[0] = mv;
+          xd->mi[0]->mbmi.mv[0].as_mv = mv;
           xd->mi[0]->mbmi.tx_size = TX_4X4;
           xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
           xd->mi[0]->mbmi.ref_frame[1] = NONE;
           vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
           vp9_encode_sby_pass1(x, bsize);
-          sum_mvr += mv.as_mv.row;
-          sum_mvr_abs += abs(mv.as_mv.row);
-          sum_mvc += mv.as_mv.col;
-          sum_mvc_abs += abs(mv.as_mv.col);
-          sum_mvrs += mv.as_mv.row * mv.as_mv.row;
-          sum_mvcs += mv.as_mv.col * mv.as_mv.col;
+          sum_mvr += mv.row;
+          sum_mvr_abs += abs(mv.row);
+          sum_mvc += mv.col;
+          sum_mvc_abs += abs(mv.col);
+          sum_mvrs += mv.row * mv.row;
+          sum_mvcs += mv.col * mv.col;
           ++intercount;
 
-          best_ref_mv.as_int = mv.as_int;
+          best_ref_mv = mv;
 
 #if CONFIG_FP_MB_STATS
           if (cpi->use_fp_mb_stats) {
@@ -740,7 +737,7 @@
           }
 #endif
 
-          if (mv.as_int) {
+          if (!is_zero_mv(&mv)) {
             ++mvcount;
 
 #if CONFIG_FP_MB_STATS
@@ -771,33 +768,33 @@
 #endif
 
             // Non-zero vector, was it different from the last non zero vector?
-            if (mv.as_int != lastmv_as_int)
+            if (!is_equal_mv(&mv, &lastmv))
               ++new_mv_count;
-            lastmv_as_int = mv.as_int;
+            lastmv = mv;
 
             // Does the row vector point inwards or outwards?
             if (mb_row < cm->mb_rows / 2) {
-              if (mv.as_mv.row > 0)
+              if (mv.row > 0)
                 --sum_in_vectors;
-              else if (mv.as_mv.row < 0)
+              else if (mv.row < 0)
                 ++sum_in_vectors;
             } else if (mb_row > cm->mb_rows / 2) {
-              if (mv.as_mv.row > 0)
+              if (mv.row > 0)
                 ++sum_in_vectors;
-              else if (mv.as_mv.row < 0)
+              else if (mv.row < 0)
                 --sum_in_vectors;
             }
 
             // Does the col vector point inwards or outwards?
             if (mb_col < cm->mb_cols / 2) {
-              if (mv.as_mv.col > 0)
+              if (mv.col > 0)
                 --sum_in_vectors;
-              else if (mv.as_mv.col < 0)
+              else if (mv.col < 0)
                 ++sum_in_vectors;
             } else if (mb_col > cm->mb_cols / 2) {
-              if (mv.as_mv.col > 0)
+              if (mv.col > 0)
                 ++sum_in_vectors;
-              else if (mv.as_mv.col < 0)
+              else if (mv.col < 0)
                 --sum_in_vectors;
             }
           }
@@ -865,7 +862,7 @@
     // TODO(paulwilkins):  Handle the case when duration is set to 0, or
     // something less than the full time between subsequent values of
     // cpi->source_time_stamp.
-    fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
+    fps.duration = (double)(source->ts_end - source->ts_start);
 
     // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;

diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index bf8c9fd..aaa6b03 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h

@@ -121,7 +121,7 @@
 
 void vp9_init_first_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
-void vp9_first_pass(struct VP9_COMP *cpi);
+void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
 void vp9_end_first_pass(struct VP9_COMP *cpi);
 
 void vp9_init_second_pass(struct VP9_COMP *cpi);

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index e5469c8..7a7bb28 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -254,7 +254,8 @@
 }
 
 static void free_pred_buffer(PRED_BUFFER *p) {
-  p->in_use = 0;
+  if (p != NULL)
+    p->in_use = 0;
 }
 
 static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
@@ -655,8 +656,7 @@
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
       if (cpi->oxcf.noise_sensitivity > 0) {
-        vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y,
-                                        this_mode, ctx);
+        vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
       }
 #endif
 
@@ -671,8 +671,7 @@
         skip_txfm = x->skip_txfm[0];
 
         if (cpi->sf.reuse_inter_pred_sby) {
-          if (best_pred != NULL)
-            free_pred_buffer(best_pred);
+          free_pred_buffer(best_pred);
 
           best_pred = this_mode_pred;
         }
@@ -692,7 +691,8 @@
 
   // If best prediction is not in dst buf, then copy the prediction block from
   // temp buf to dst buf.
-  if (cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) {
+  if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby &&
+      best_pred->data != orig_dst.buf) {
     uint8_t *copy_from, *copy_to;
 
     pd->dst = orig_dst;

diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 4fc3e9e..2841efa 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c

@@ -462,7 +462,7 @@
 
   // Set baseline threshold values.
   for (i = 0; i < MAX_MODES; ++i)
-    rd->thresh_mult[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0;
+    rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;
 
   rd->thresh_mult[THR_NEARESTMV] = 0;
   rd->thresh_mult[THR_NEARESTG] = 0;
@@ -548,7 +548,7 @@
   int i;
 
   for (i = 0; i < MAX_REFS; ++i)
-    rd->thresh_mult_sub8x8[i] = is_best_mode(cpi->oxcf.mode)  ? -500 : 0;
+    rd->thresh_mult_sub8x8[i] = cpi->oxcf.mode == BEST ? -500 : 0;
 
   rd->thresh_mult_sub8x8[THR_LAST] += 2500;
   rd->thresh_mult_sub8x8[THR_GOLD] += 2500;

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index cfda964..e52620f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -468,7 +468,6 @@
   txfm_rd_in_plane(x, rate, distortion, skip,
                    sse, ref_best_rd, 0, bs,
                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
-  cpi->tx_stepdown_count[0]++;
 }
 
 static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
@@ -551,16 +550,12 @@
 
   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
     tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
-    cpi->tx_stepdown_count[0]++;
   } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
     tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
-    cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
     tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
-    cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
   } else {
     tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
-    cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
   }
 }
 
@@ -1371,7 +1366,7 @@
           if (best_rd < label_mv_thresh)
             break;
 
-          if (!is_best_mode(cpi->oxcf.mode)) {
+          if (cpi->oxcf.mode != BEST) {
             // use previous block's result as next block's MV predictor.
             if (i > 0) {
               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
@@ -1413,7 +1408,7 @@
                                           INT_MAX, 1);
 
           // Should we do a full search (best quality only)
-          if (is_best_mode(cpi->oxcf.mode)) {
+          if (cpi->oxcf.mode == BEST) {
             int_mv *const best_mv = &mi->bmi[i].as_mv[0];
             /* Check if mvp_full is within the range. */
             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,

diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 57835ec..04d80ba 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c

@@ -180,8 +180,8 @@
 static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
                                  int speed, vp9e_tune_content content) {
   VP9_COMMON *const cm = &cpi->common;
-  const int frames_since_key =
-      cm->frame_type == KEY_FRAME ? 0 : cpi->rc.frames_since_key;
+  const int is_keyframe = cm->frame_type == KEY_FRAME;
+  const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
   sf->static_segmentation = 0;
   sf->adaptive_rd_thresh = 1;
   sf->use_fast_coef_costing = 1;
@@ -277,17 +277,16 @@
   }
 
   if (speed >= 5) {
-    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
-    sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?
-        RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
+    sf->use_quant_fp = !is_keyframe;
+    sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX
+                                                  : STRICT_NEIGHBORING_MIN_MAX;
     sf->max_partition_size = BLOCK_32X32;
     sf->min_partition_size = BLOCK_8X8;
     sf->partition_check =
         (frames_since_key % sf->last_partitioning_redo_frequency == 1);
-    sf->force_frame_boost = cm->frame_type == KEY_FRAME ||
-        (frames_since_key %
-            (sf->last_partitioning_redo_frequency << 1) == 1);
-    sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15;
+    sf->force_frame_boost = is_keyframe ||
+        (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
+    sf->max_delta_qindex = is_keyframe ? 20 : 15;
     sf->partition_search_type = REFERENCE_PARTITION;
     sf->use_nonrd_pick_mode = 1;
     sf->allow_skip_recode = 0;
@@ -305,8 +304,7 @@
     sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
     sf->search_type_check_frequency = 50;
 
-    sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ?
-        USE_LARGESTALL : USE_TX_8X8;
+    sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
 
     // This feature is only enabled when partition search is disabled.
     sf->reuse_inter_pred_sby = 1;
@@ -316,6 +314,7 @@
 
     sf->mv.reduce_first_step_size = 1;
   }
+
   if (speed >= 7) {
     sf->mv.search_method = FAST_DIAMOND;
     sf->mv.fullpel_search_step_param = 10;
@@ -324,10 +323,12 @@
         800 : 300;
     sf->elevate_newmv_thresh = 2500;
   }
+
   if (speed >= 12) {
     sf->elevate_newmv_thresh = 4000;
     sf->mv.subpel_force_stop = 2;
   }
+
   if (speed >= 13) {
     int i;
     sf->max_intra_bsize = BLOCK_32X32;
@@ -408,16 +409,14 @@
   sf->default_interp_filter = SWITCHABLE;
   sf->tx_size_search_breakout = 0;
 
-  if (oxcf->mode == REALTIME) {
+  if (oxcf->mode == REALTIME)
     set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
-  } else {
-    if (!is_best_mode(oxcf->mode))
-      set_good_speed_feature(cpi, cm, sf, oxcf->speed);
-  }
+  else if (oxcf->mode == GOOD)
+    set_good_speed_feature(cpi, cm, sf, oxcf->speed);
 
   cpi->full_search_sad = vp9_full_search_sad;
-  cpi->diamond_search_sad = is_best_mode(oxcf->mode) ? vp9_full_range_search
-                                                     : vp9_diamond_search_sad;
+  cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search
+                                               : vp9_diamond_search_sad;
   cpi->refining_search_sad = vp9_refining_search_sad;
 
 

diff --git a/vp9/encoder/vp9_ssim.c b/vp9/encoder/vp9_ssim.c
index 026e6a8..8435640 100644
--- a/vp9/encoder/vp9_ssim.c
+++ b/vp9/encoder/vp9_ssim.c

@@ -95,7 +95,7 @@
   return ssim_total;
 }
 double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                     int lumamask, double *weight) {
+                     double *weight) {
   double a, b, c;
   double ssimv;
 

diff --git a/vp9/encoder/vp9_ssim.h b/vp9/encoder/vp9_ssim.h
index a581c2c..d1dd1b7 100644
--- a/vp9/encoder/vp9_ssim.h
+++ b/vp9/encoder/vp9_ssim.h

@@ -18,7 +18,7 @@
 #include "vpx_scale/yv12config.h"
 
 double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                     int lumamask, double *weight);
+                     double *weight);
 
 double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                       double *ssim_y, double *ssim_u, double *ssim_v);

diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 1d9bdd8..9bd9792 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c

@@ -36,6 +36,7 @@
     int i;
     lc->current_video_frame_in_layer = 0;
     lc->layer_size = 0;
+    lc->last_frame_type = FRAME_TYPES;
     lrc->ni_av_qi = oxcf->worst_allowed_q;
     lrc->total_actual_bits = 0;
     lrc->total_target_vs_actual = 0;
@@ -106,9 +107,12 @@
     }
     bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
     // Update buffer-related quantities.
-    lrc->starting_buffer_level = rc->starting_buffer_level * bitrate_alloc;
-    lrc->optimal_buffer_level = rc->optimal_buffer_level * bitrate_alloc;
-    lrc->maximum_buffer_size = rc->maximum_buffer_size * bitrate_alloc;
+    lrc->starting_buffer_level =
+        (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+    lrc->optimal_buffer_level =
+        (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+    lrc->maximum_buffer_size =
+        (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
     lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
     lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
     // Update framerate-related quantities.

diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 801449b..d475d5f 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h

@@ -25,9 +25,10 @@
   double framerate;
   int avg_frame_size;
   TWO_PASS twopass;
-  struct vpx_fixed_buf rc_twopass_stats_in;
+  vpx_fixed_buf_t rc_twopass_stats_in;
   unsigned int current_video_frame_in_layer;
   int is_key_frame;
+  FRAME_TYPE last_frame_type;
   vpx_svc_parameters_t svc_params_received;
   struct lookahead_entry  *alt_ref_source;
   int alt_ref_idx;

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 08f6849..5ff02d8 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c

@@ -21,7 +21,6 @@
 #include "vp9/vp9_iface_common.h"
 
 struct vp9_extracfg {
-  struct vpx_codec_pkt_list *pkt_list;
   int                         cpu_used;  // available cpu percentage in 1/16
   unsigned int                enable_auto_alt_ref;
   unsigned int                noise_sensitivity;
@@ -43,37 +42,26 @@
   vp9e_tune_content           content;
 };
 
-struct extraconfig_map {
-  unsigned int usage;
-  struct vp9_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] = {
-  {
-    0,
-    { // NOLINT
-      NULL,
-      0,                          // cpu_used
-      1,                          // enable_auto_alt_ref
-      0,                          // noise_sensitivity
-      0,                          // sharpness
-      0,                          // static_thresh
-      0,                          // tile_columns
-      0,                          // tile_rows
-      7,                          // arnr_max_frames
-      5,                          // arnr_strength
-      3,                          // arnr_type
-      VP8_TUNE_PSNR,              // tuning
-      10,                         // cq_level
-      0,                          // rc_max_intra_bitrate_pct
-      0,                          // lossless
-      0,                          // frame_parallel_decoding_mode
-      NO_AQ,                      // aq_mode
-      0,                          // frame_periodic_delta_q
-      BITS_8,                     // Bit depth
-      VP9E_CONTENT_DEFAULT        // content
-    }
-  }
+static struct vp9_extracfg default_extra_cfg = {
+  0,                          // cpu_used
+  1,                          // enable_auto_alt_ref
+  0,                          // noise_sensitivity
+  0,                          // sharpness
+  0,                          // static_thresh
+  0,                          // tile_columns
+  0,                          // tile_rows
+  7,                          // arnr_max_frames
+  5,                          // arnr_strength
+  3,                          // arnr_type
+  VP8_TUNE_PSNR,              // tuning
+  10,                         // cq_level
+  0,                          // rc_max_intra_bitrate_pct
+  0,                          // lossless
+  0,                          // frame_parallel_decoding_mode
+  NO_AQ,                      // aq_mode
+  0,                          // frame_periodic_delta_q
+  BITS_8,                     // Bit depth
+  VP9E_CONTENT_DEFAULT        // content
 };
 
 struct vpx_codec_alg_priv {
@@ -188,6 +176,8 @@
     if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers)
       ERROR("Not enough ref buffers for svc alt ref frames");
   }
+  if (cfg->ss_number_layers > 3 && cfg->g_error_resilient == 0)
+    ERROR("Multiple frame contexts are not supported for more than 3 layers");
 #endif
 
   RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
@@ -335,7 +325,7 @@
   if (oxcf->init_framerate > 180)
     oxcf->init_framerate = 30;
 
-  oxcf->mode = BEST;
+  oxcf->mode = GOOD;
 
   switch (cfg->g_pass) {
     case VPX_RC_ONE_PASS:
@@ -393,7 +383,6 @@
   oxcf->sharpness              =  extra_cfg->sharpness;
 
   oxcf->two_pass_stats_in      =  cfg->rc_twopass_stats_in;
-  oxcf->output_pkt_list        =  extra_cfg->pkt_list;
 
 #if CONFIG_FP_MB_STATS
   oxcf->firstpass_mb_stats_in  = cfg->rc_firstpass_mb_stats_in;
@@ -659,8 +648,6 @@
   (void)data;
 
   if (ctx->priv == NULL) {
-    int i;
-    vpx_codec_enc_cfg_t *cfg;
     struct vpx_codec_alg_priv *priv = calloc(1, sizeof(*priv));
 
     if (priv == NULL)
@@ -668,7 +655,6 @@
 
     ctx->priv = &priv->base;
     ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
     ctx->priv->alg_priv = priv;
     ctx->priv->init_flags = ctx->init_flags;
     ctx->priv->enc.total_encoders = 1;
@@ -679,17 +665,7 @@
       ctx->config.enc = &ctx->priv->alg_priv->cfg;
     }
 
-    cfg = &ctx->priv->alg_priv->cfg;
-
-    // Select the extra vp6 configuration table based on the current
-    // usage value. If the current usage value isn't found, use the
-    // values for usage case 0.
-    for (i = 0;
-         extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-         ++i) {}
-
-    priv->extra_cfg = extracfg_map[i].cfg;
-    priv->extra_cfg.pkt_list = &priv->pkt_list.head;
+    priv->extra_cfg = default_extra_cfg;
 
     vp9_initialize_enc();
 
@@ -701,10 +677,12 @@
                          &ctx->priv->alg_priv->cfg,
                          &ctx->priv->alg_priv->extra_cfg);
       cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
-      if (cpi == NULL)
+      if (cpi == NULL) {
         res = VPX_CODEC_MEM_ERROR;
-      else
+      } else {
+        cpi->output_pkt_list = &priv->pkt_list.head;
         ctx->priv->alg_priv->cpi = cpi;
+      }
     }
   }
 
@@ -934,7 +912,7 @@
 #endif
 
         // Pack invisible frames with the next visible frame
-        if (cpi->common.show_frame == 0
+        if (!cpi->common.show_frame
 #if CONFIG_SPATIAL_SVC
             || (is_spatial_svc(cpi) &&
                 cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
@@ -966,18 +944,6 @@
             )
           pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
 
-        if (cpi->common.show_frame == 0) {
-          pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
-
-          // This timestamp should be as close as possible to the
-          // prior PTS so that if a decoder uses pts to schedule when
-          // to do this, we start right after last frame was decoded.
-          // Invisible frames have no duration.
-          pkt.data.frame.pts =
-              ticks_to_timebase_units(timebase, cpi->last_time_stamp_seen) + 1;
-          pkt.data.frame.duration = 0;
-        }
-
         if (cpi->droppable)
           pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
 
@@ -1002,8 +968,9 @@
         cx_data_sz -= size;
 #if CONFIG_SPATIAL_SVC
         if (is_spatial_svc(cpi)) {
-          vpx_codec_cx_pkt_t pkt = {0};
+          vpx_codec_cx_pkt_t pkt;
           int i;
+          vp9_zero(pkt);
           pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
           for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
             pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size;
@@ -1355,11 +1322,11 @@
   encoder_destroy,    // vpx_codec_destroy_fn_t
   encoder_ctrl_maps,  // vpx_codec_ctrl_fn_map_t
   {  // NOLINT
-    NOT_IMPLEMENTED,  // vpx_codec_peek_si_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_si_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_decode_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_frame_get_fn_t
-    NOT_IMPLEMENTED   // vpx_codec_set_fb_fn_t
+    NULL,  // vpx_codec_peek_si_fn_t
+    NULL,  // vpx_codec_get_si_fn_t
+    NULL,  // vpx_codec_decode_fn_t
+    NULL,  // vpx_codec_frame_get_fn_t
+    NULL   // vpx_codec_set_fb_fn_t
   },
   {  // NOLINT
     1,                      // 1 cfg map
@@ -1367,8 +1334,8 @@
     encoder_encode,         // vpx_codec_encode_fn_t
     encoder_get_cxdata,     // vpx_codec_get_cx_data_fn_t
     encoder_set_config,     // vpx_codec_enc_config_set_fn_t
-    NOT_IMPLEMENTED,        // vpx_codec_get_global_headers_fn_t
+    NULL,        // vpx_codec_get_global_headers_fn_t
     encoder_get_preview,    // vpx_codec_get_preview_frame_fn_t
-    NOT_IMPLEMENTED         // vpx_codec_enc_mr_get_mem_loc_fn_t
+    NULL         // vpx_codec_enc_mr_get_mem_loc_fn_t
   }
 };

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index ee82905..05d61c8 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c

@@ -66,7 +66,6 @@
 
     ctx->priv = (vpx_codec_priv_t *)alg_priv;
     ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
     ctx->priv->alg_priv = alg_priv;
     ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
     ctx->priv->init_flags = ctx->init_flags;
@@ -648,12 +647,12 @@
   },
   { // NOLINT
     0,
-    NOT_IMPLEMENTED,  // vpx_codec_enc_cfg_map_t
-    NOT_IMPLEMENTED,  // vpx_codec_encode_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_cx_data_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_enc_config_set_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_global_headers_fn_t
-    NOT_IMPLEMENTED,  // vpx_codec_get_preview_frame_fn_t
-    NOT_IMPLEMENTED   // vpx_codec_enc_mr_get_mem_loc_fn_t
+    NULL,  // vpx_codec_enc_cfg_map_t
+    NULL,  // vpx_codec_encode_fn_t
+    NULL,  // vpx_codec_get_cx_data_fn_t
+    NULL,  // vpx_codec_enc_config_set_fn_t
+    NULL,  // vpx_codec_get_global_headers_fn_t
+    NULL,  // vpx_codec_get_preview_frame_fn_t
+    NULL   // vpx_codec_enc_mr_get_mem_loc_fn_t
   }
 };

diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 0f5ce3d..d7c5800 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h

@@ -286,8 +286,6 @@
   vpx_codec_enc_cfg_t cfg;
 } vpx_codec_enc_cfg_map_t;
 
-#define NOT_IMPLEMENTED 0
-
 /*!\brief Decoder algorithm interface interface
  *
  * All decoders \ref MUST expose a variable of this type.
@@ -338,7 +336,6 @@
  */
 struct vpx_codec_priv {
   unsigned int                    sz;
-  vpx_codec_iface_t              *iface;
   struct vpx_codec_alg_priv      *alg_priv;
   const char                     *err_detail;
   vpx_codec_flags_t               init_flags;
@@ -347,7 +344,7 @@
     vpx_codec_priv_cb_pair_t    put_slice_cb;
   } dec;
   struct {
-    struct vpx_fixed_buf        cx_data_dst_buf;
+    vpx_fixed_buf_t             cx_data_dst_buf;
     unsigned int                cx_data_pad_before;
     unsigned int                cx_data_pad_after;
     vpx_codec_cx_pkt_t          cx_data_pkt;

diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 7828615..45b0dca 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c

@@ -86,6 +86,7 @@
   int layers;
   int layer;
   int is_keyframe;
+  int use_multiple_frame_contexts;
 
   FrameData *frame_list;
   FrameData *frame_temp;
@@ -366,6 +367,7 @@
   char *option_name;
   char *option_value;
   char *input_ptr;
+  SvcInternal *const si = get_svc_internal(svc_ctx);
   vpx_codec_err_t res = VPX_CODEC_OK;
 
   if (options == NULL) return VPX_CODEC_OK;
@@ -393,6 +395,8 @@
     } else if (strcmp("auto-alt-refs", option_name) == 0) {
       res = parse_auto_alt_ref(svc_ctx, option_value);
       if (res != VPX_CODEC_OK) break;
+    } else if (strcmp("multi-frame-contexts", option_name) == 0) {
+      si->use_multiple_frame_contexts = atoi(option_value);
     } else {
       svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
       res = VPX_CODEC_INVALID_PARAM;
@@ -401,6 +405,10 @@
     option_name = strtok_r(NULL, "=", &input_ptr);
   }
   free(input_string);
+
+  if (si->use_multiple_frame_contexts && svc_ctx->spatial_layers > 3)
+    res = VPX_CODEC_INVALID_PARAM;
+
   return res;
 }
 
@@ -534,7 +542,8 @@
   enc_cfg->rc_buf_initial_sz = 500;
   enc_cfg->rc_buf_optimal_sz = 600;
   enc_cfg->rc_buf_sz = 1000;
-  enc_cfg->g_error_resilient = 1;
+  if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0)
+    enc_cfg->g_error_resilient = 1;
 
   // Initialize codec
   res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR);

diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c
index 4d22a08..fdcc9c1 100644
--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c

@@ -20,7 +20,7 @@
 
 vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t      *ctx,
                                        vpx_codec_iface_t    *iface,
-                                       vpx_codec_dec_cfg_t  *cfg,
+                                       const vpx_codec_dec_cfg_t *cfg,
                                        vpx_codec_flags_t     flags,
                                        int                   ver) {
   vpx_codec_err_t res;
@@ -54,9 +54,6 @@
       ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
       vpx_codec_destroy(ctx);
     }
-
-    if (ctx->priv)
-      ctx->priv->iface = ctx->iface;
   }
 
   return SAVE_STATUS(ctx, res);

diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index 6e18bd1..736a8da 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c

@@ -22,7 +22,7 @@
 
 vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t      *ctx,
                                        vpx_codec_iface_t    *iface,
-                                       vpx_codec_enc_cfg_t  *cfg,
+                                       const vpx_codec_enc_cfg_t *cfg,
                                        vpx_codec_flags_t     flags,
                                        int                   ver) {
   vpx_codec_err_t res;
@@ -53,9 +53,6 @@
       ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
       vpx_codec_destroy(ctx);
     }
-
-    if (ctx->priv)
-      ctx->priv->iface = ctx->iface;
   }
 
   return SAVE_STATUS(ctx, res);
@@ -135,9 +132,6 @@
           }
         }
 
-        if (ctx->priv)
-          ctx->priv->iface = ctx->iface;
-
         if (res)
           break;
 

diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index 07df72a..91fc532 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h

@@ -203,9 +203,11 @@
     const char              *err_detail;  /**< Detailed info, if available */
     vpx_codec_flags_t        init_flags;  /**< Flags passed at init time */
     union {
-      struct vpx_codec_dec_cfg  *dec;   /**< Decoder Configuration Pointer */
-      struct vpx_codec_enc_cfg  *enc;   /**< Encoder Configuration Pointer */
-      void                      *raw;
+      /**< Decoder Configuration Pointer */
+      const struct vpx_codec_dec_cfg *dec;
+      /**< Encoder Configuration Pointer */
+      const struct vpx_codec_enc_cfg *enc;
+      const void                     *raw;
     }                        config;      /**< Configuration pointer aliasing union */
     vpx_codec_priv_t        *priv;        /**< Algorithm private storage */
   } vpx_codec_ctx_t;

diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index 10b89fa..62fd919 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h

@@ -135,7 +135,7 @@
    */
   vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t      *ctx,
                                          vpx_codec_iface_t    *iface,
-                                         vpx_codec_dec_cfg_t  *cfg,
+                                         const vpx_codec_dec_cfg_t *cfg,
                                          vpx_codec_flags_t     flags,
                                          int                   ver);
 

diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 7dbbf2f..75d3a47 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h

@@ -188,14 +188,14 @@
                                               has id 0.*/
 
       } frame;  /**< data for compressed frame packet */
-      struct vpx_fixed_buf twopass_stats;  /**< data for two-pass packet */
-      struct vpx_fixed_buf firstpass_mb_stats; /**< first pass mb packet */
+      vpx_fixed_buf_t twopass_stats;  /**< data for two-pass packet */
+      vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
       struct vpx_psnr_pkt {
         unsigned int samples[4];  /**< Number of samples, total/y/u/v */
         uint64_t     sse[4];      /**< sum squared error, total/y/u/v */
         double       psnr[4];     /**< PSNR, total/y/u/v */
       } psnr;                       /**< data for PSNR packet */
-      struct vpx_fixed_buf raw;     /**< data for arbitrary packets */
+      vpx_fixed_buf_t raw;     /**< data for arbitrary packets */
 #if CONFIG_SPATIAL_SVC
       size_t layer_sizes[VPX_SS_MAX_LAYERS];
 #endif
@@ -452,14 +452,14 @@
      * A buffer containing all of the stats packets produced in the first
      * pass, concatenated.
      */
-    struct vpx_fixed_buf   rc_twopass_stats_in;
+    vpx_fixed_buf_t   rc_twopass_stats_in;
 
     /*!\brief first pass mb stats buffer.
      *
      * A buffer containing all of the first pass mb stats packets produced
      * in the first pass, concatenated.
      */
-    struct vpx_fixed_buf   rc_firstpass_mb_stats_in;
+    vpx_fixed_buf_t   rc_firstpass_mb_stats_in;
 
     /*!\brief Target data rate
      *
@@ -715,7 +715,7 @@
    */
   vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t      *ctx,
                                          vpx_codec_iface_t    *iface,
-                                         vpx_codec_enc_cfg_t  *cfg,
+                                         const vpx_codec_enc_cfg_t *cfg,
                                          vpx_codec_flags_t     flags,
                                          int                   ver);
 

diff --git a/vpx_ports/vpx_timer.h b/vpx_ports/vpx_timer.h
index 870338b..dd98e29 100644
--- a/vpx_ports/vpx_timer.h
+++ b/vpx_ports/vpx_timer.h

@@ -11,6 +11,9 @@
 
 #ifndef VPX_PORTS_VPX_TIMER_H_
 #define VPX_PORTS_VPX_TIMER_H_
+
+#include "./vpx_config.h"
+
 #include "vpx/vpx_integer.h"
 
 #if CONFIG_OS_SUPPORT

diff --git a/vpxdec.c b/vpxdec.c
index fceb046..4125044 100644
--- a/vpxdec.c
+++ b/vpxdec.c

@@ -599,7 +599,8 @@
       do_scale = 1;
     else if (arg_match(&arg, &fb_arg, argi))
       num_external_frame_buffers = arg_parse_uint(&arg);
-
+    else if (arg_match(&arg, &continuearg, argi))
+      keep_going = 1;
 #if CONFIG_VP8_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
       postproc = 1;
@@ -649,11 +650,8 @@
       }
     } else if (arg_match(&arg, &error_concealment, argi)) {
       ec_enabled = 1;
-    } else if (arg_match(&arg, &continuearg, argi)) {
-      keep_going = 1;
     }
-
-#endif
+#endif  // CONFIG_VP8_DECODER
     else
       argj++;
   }

diff --git a/vpxenc.c b/vpxenc.c
index b99e61a..d8fc553 100644
--- a/vpxenc.c
+++ b/vpxenc.c

@@ -740,8 +740,9 @@
 #if CONFIG_VP9_ENCODER
     // Make default VP9 passes = 2 until there is a better quality 1-pass
     // encoder
-    global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
-                      global->deadline != VPX_DL_REALTIME) ? 2 : 1;
+    if (global->codec != NULL && global->codec->name != NULL)
+      global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
+                        global->deadline != VPX_DL_REALTIME) ? 2 : 1;
 #else
     global->passes = 1;
 #endif
@@ -809,8 +810,10 @@
   struct stream_state *stream;
 
   stream = calloc(1, sizeof(*stream));
-  if (!stream)
+  if (stream == NULL) {
     fatal("Failed to allocate new stream.");
+  }
+
   if (prev) {
     memcpy(stream, prev, sizeof(*stream));
     stream->index++;
@@ -996,12 +999,13 @@
           * instance of this control.
           */
           for (j = 0; j < config->arg_ctrl_cnt; j++)
-            if (config->arg_ctrls[j][0] == ctrl_args_map[i])
+            if (ctrl_args_map != NULL &&
+                config->arg_ctrls[j][0] == ctrl_args_map[i])
               break;
 
           /* Update/insert */
           assert(j < (int)ARG_CTRL_CNT_MAX);
-          if (j < (int)ARG_CTRL_CNT_MAX) {
+          if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) {
             config->arg_ctrls[j][0] = ctrl_args_map[i];
             config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg);
             if (j == config->arg_ctrl_cnt)
@@ -1788,7 +1792,8 @@
         got_data = 0;
         FOREACH_STREAM(get_cx_data(stream, &global, &got_data));
 
-        if (!got_data && input.length && !streams->frames_out) {
+        if (!got_data && input.length && streams != NULL &&
+            !streams->frames_out) {
           lagged_count = global.limit ? seen_frames : ftello(input.file);
         } else if (input.length) {
           int64_t remaining;

diff --git a/y4minput.c b/y4minput.c
index 520c332..bcc742a 100644
--- a/y4minput.c
+++ b/y4minput.c

@@ -700,7 +700,7 @@
 
 int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
                    int only_420) {
-  char buffer[80];
+  char buffer[80] = {0};
   int  ret;
   int  i;
   /*Read until newline, or 80 cols, whichever happens first.*/
@@ -978,7 +978,9 @@
     _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
   else
     _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
-  _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
+
+  if (_y4m->aux_buf_sz > 0)
+    _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
   return 0;
 }