Merge "Added three test vectors with droppable frames"

diff --git a/docs.mk b/docs.mk
index 9426f76..797b466 100644
--- a/docs.mk
+++ b/docs.mk

@@ -30,7 +30,9 @@
 
 
 EXAMPLE_PATH += $(SRC_PATH_BARE) #for CHANGELOG, README, etc
+EXAMPLE_PATH += $(SRC_PATH_BARE)/examples
 
+doxyfile: $(if $(findstring examples, $(ALL_TARGETS)),examples.doxy)
 doxyfile: libs.doxy_template libs.doxy
 	@echo "    [CREATE] $@"
 	@cat $^ > $@

diff --git a/examples.mk b/examples.mk
index 5f12b6b..40756e1 100644
--- a/examples.mk
+++ b/examples.mk

@@ -137,6 +137,10 @@
 error_resilient.DESCRIPTION      = Error Resiliency Feature
 
 EXAMPLES-$(CONFIG_VP8_ENCODER)     += vp8_set_maps.c
+vp8_set_maps.SRCS                  += ivfenc.h ivfenc.c
+vp8_set_maps.SRCS                  += tools_common.h tools_common.c
+vp8_set_maps.SRCS                  += video_common.h
+vp8_set_maps.SRCS                  += video_writer.h video_writer.c
 vp8_set_maps.GUID                   = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
 vp8_set_maps.DESCRIPTION            = VP8 set active and ROI maps
 EXAMPLES-$(CONFIG_VP8_ENCODER)     += vp8cx_set_ref.c
@@ -281,3 +285,36 @@
                                $(addprefix bin/$(p)/,$(ALL_EXAMPLES_BASENAME:.c=.exe)))
 $(foreach proj,$(call enabled,PROJECTS),\
     $(eval $(call vcproj_template,$(proj))))
+
+#
+# Documentation Rules
+#
+%.dox: %.c
+	@echo "    [DOXY] $@"
+	@echo "/*!\page example_$(@F:.dox=) $(@F:.dox=)" > $@
+	@echo "   \includelineno $(<F)" >> $@
+	@echo "*/" >> $@
+
+samples.dox: examples.mk
+	@echo "    [DOXY] $@"
+	@echo "/*!\page samples Sample Code" > $@
+	@echo "    This SDK includes a number of sample applications."\
+	      "Each sample documents a feature of the SDK in both prose"\
+	      "and the associated C code."\
+	      "The following samples are included: ">>$@
+	@$(foreach ex,$(sort $(notdir $(EXAMPLES:.c=))),\
+	   echo "     - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+	@echo >> $@
+	@echo "    In addition, the SDK contains a number of utilities."\
+              "Since these utilities are built upon the concepts described"\
+              "in the sample code listed above, they are not documented in"\
+              "pieces like the samples are. Their source is included here"\
+              "for reference. The following utilities are included:" >> $@
+	@$(foreach ex,$(sort $(UTILS:.c=)),\
+	   echo "     - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+	@echo "*/" >> $@
+
+CLEAN-OBJS += examples.doxy samples.dox $(ALL_EXAMPLES:.c=.dox)
+DOCS-yes += examples.doxy samples.dox
+examples.doxy: samples.dox $(ALL_EXAMPLES:.c=.dox)
+	@echo "INPUT += $^" > $@

diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c
index aabac60..28d1ad5 100644
--- a/examples/decode_to_md5.c
+++ b/examples/decode_to_md5.c

@@ -115,7 +115,7 @@
     size_t frame_size = 0;
     const unsigned char *frame = vpx_video_reader_get_frame(reader,
                                                             &frame_size);
-    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 0))
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
       die_codec(&codec, "Failed to decode frame");
 
     while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {

diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c
index c6f7d43..af1aa63 100644
--- a/examples/decode_with_drops.c
+++ b/examples/decode_with_drops.c

@@ -120,7 +120,7 @@
     int skip;
     const unsigned char *frame = vpx_video_reader_get_frame(reader,
                                                             &frame_size);
-    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 0))
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
       die_codec(&codec, "Failed to decode frame.");
 
     ++frame_cnt;

diff --git a/examples/error_resilient.c b/examples/error_resilient.c
index ef0a6c38..19235c8 100644
--- a/examples/error_resilient.c
+++ b/examples/error_resilient.c

@@ -118,7 +118,7 @@
         return;
 
     pts = pkt->data.frame.pts;
-    mem_put_le32(header, pkt->data.frame.sz);
+    mem_put_le32(header, (unsigned int)pkt->data.frame.sz);
     mem_put_le32(header+4, pts&0xFFFFFFFF);
     mem_put_le32(header+8, pts >> 32);
 

diff --git a/examples/force_keyframe.c b/examples/force_keyframe.c
index f03b3d0..6531e47 100644
--- a/examples/force_keyframe.c
+++ b/examples/force_keyframe.c

@@ -119,7 +119,7 @@
         return;
 
     pts = pkt->data.frame.pts;
-    mem_put_le32(header, pkt->data.frame.sz);
+    mem_put_le32(header, (unsigned int)pkt->data.frame.sz);
     mem_put_le32(header+4, pts&0xFFFFFFFF);
     mem_put_le32(header+8, pts >> 32);
 

diff --git a/examples/postproc.c b/examples/postproc.c
index 2912fe6..be08e92 100644
--- a/examples/postproc.c
+++ b/examples/postproc.c

@@ -118,7 +118,7 @@
     };
 
     // Decode the frame with 15ms deadline
-    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 15000))
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 15000))
       die_codec(&codec, "Failed to decode frame");
 
     while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {

diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c
index b0ca77d..8c15051 100644
--- a/examples/simple_decoder.c
+++ b/examples/simple_decoder.c

@@ -134,7 +134,7 @@
     size_t frame_size = 0;
     const unsigned char *frame = vpx_video_reader_get_frame(reader,
                                                             &frame_size);
-    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 0))
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
       die_codec(&codec, "Failed to decode frame.");
 
     while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {

diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index f16db66..8bca18c 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c

@@ -69,9 +69,9 @@
 static void get_frame_stats(vpx_codec_ctx_t *ctx,
                             const vpx_image_t *img,
                             vpx_codec_pts_t pts,
-                            uint64_t duration,
+                            unsigned int duration,
                             vpx_enc_frame_flags_t flags,
-                            uint64_t deadline,
+                            unsigned int deadline,
                             vpx_fixed_buf_t *stats) {
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;
@@ -94,9 +94,9 @@
 static void encode_frame(vpx_codec_ctx_t *ctx,
                          const vpx_image_t *img,
                          vpx_codec_pts_t pts,
-                         uint64_t duration,
+                         unsigned int duration,
                          vpx_enc_frame_flags_t flags,
-                         uint64_t deadline,
+                         unsigned int deadline,
                          VpxVideoWriter *writer) {
   vpx_codec_iter_t iter = NULL;
   const vpx_codec_cx_pkt_t *pkt = NULL;

diff --git a/examples/vp8_set_maps.c b/examples/vp8_set_maps.c
index 4c0e8a0..f3cc9a7 100644
--- a/examples/vp8_set_maps.c
+++ b/examples/vp8_set_maps.c

@@ -44,253 +44,197 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdarg.h>
 #include <string.h>
+
 #define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
-#define interface (vpx_codec_vp8_cx())
-#define fourcc    0x30385056
+#include "vpx/vpx_encoder.h"
 
-#define IVF_FILE_HDR_SZ  (32)
-#define IVF_FRAME_HDR_SZ (12)
+#include "./tools_common.h"
+#include "./video_writer.h"
 
-static void mem_put_le16(char *mem, unsigned int val) {
-    mem[0] = val;
-    mem[1] = val>>8;
+static const char *exec_name;
+
+void usage_exit() {
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
 }
 
-static void mem_put_le32(char *mem, unsigned int val) {
-    mem[0] = val;
-    mem[1] = val>>8;
-    mem[2] = val>>16;
-    mem[3] = val>>24;
+static void set_roi_map(const vpx_codec_enc_cfg_t *cfg,
+                        vpx_codec_ctx_t *codec) {
+  unsigned int i;
+  vpx_roi_map_t roi = {0};
+
+  roi.rows = cfg->g_h / 16;
+  roi.cols = cfg->g_w / 16;
+
+  roi.delta_q[0] = 0;
+  roi.delta_q[1] = -2;
+  roi.delta_q[2] = -4;
+  roi.delta_q[3] = -6;
+
+  roi.delta_lf[0] = 0;
+  roi.delta_lf[1] = 1;
+  roi.delta_lf[2] = 2;
+  roi.delta_lf[3] = 3;
+
+  roi.static_threshold[0] = 1500;
+  roi.static_threshold[1] = 1000;
+  roi.static_threshold[2] = 500;
+  roi.static_threshold[3] = 0;
+
+  roi.roi_map = (uint8_t *)malloc(roi.rows * roi.cols);
+  for (i = 0; i < roi.rows * roi.cols; ++i)
+    roi.roi_map[i] = i % 4;
+
+  if (vpx_codec_control(codec, VP8E_SET_ROI_MAP, &roi))
+    die_codec(codec, "Failed to set ROI map");
+
+  free(roi.roi_map);
 }
 
-static void die(const char *fmt, ...) {
-    va_list ap;
+static void set_active_map(const vpx_codec_enc_cfg_t *cfg,
+                           vpx_codec_ctx_t *codec) {
+  unsigned int i;
+  vpx_active_map_t map = {0};
 
-    va_start(ap, fmt);
-    vprintf(fmt, ap);
-    if(fmt[strlen(fmt)-1] != '\n')
-        printf("\n");
-    exit(EXIT_FAILURE);
+  map.rows = cfg->g_h / 16;
+  map.cols = cfg->g_w / 16;
+
+  map.active_map = (uint8_t *)malloc(map.rows * map.cols);
+  for (i = 0; i < map.rows * map.cols; ++i)
+    map.active_map[i] = i % 2;
+
+  if (vpx_codec_control(codec, VP8E_SET_ACTIVEMAP, &map))
+    die_codec(codec, "Failed to set active map");
+
+  free(map.active_map);
 }
 
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
-    const char *detail = vpx_codec_error_detail(ctx);
+static void unset_active_map(const vpx_codec_enc_cfg_t *cfg,
+                             vpx_codec_ctx_t *codec) {
+  vpx_active_map_t map = {0};
 
-    printf("%s: %s\n", s, vpx_codec_error(ctx));
-    if(detail)
-        printf("    %s\n",detail);
-    exit(EXIT_FAILURE);
+  map.rows = cfg->g_h / 16;
+  map.cols = cfg->g_w / 16;
+  map.active_map = NULL;
+
+  if (vpx_codec_control(codec, VP8E_SET_ACTIVEMAP, &map))
+    die_codec(codec, "Failed to set active map");
 }
 
-static int read_frame(FILE *f, vpx_image_t *img) {
-    size_t nbytes, to_read;
-    int    res = 1;
+static void encode_frame(vpx_codec_ctx_t *codec,
+                         vpx_image_t *img,
+                         int frame_index,
+                         VpxVideoWriter *writer) {
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0,
+                                               VPX_DL_GOOD_QUALITY);
+  if (res != VPX_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
 
-    to_read = img->w*img->h*3/2;
-    nbytes = fread(img->planes[0], 1, to_read, f);
-    if(nbytes != to_read) {
-        res = 0;
-        if(nbytes > 0)
-            printf("Warning: Read partial frame. Check your width & height!\n");
+  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+      if (!vpx_video_writer_write_frame(writer,
+                                        pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(codec, "Failed to write compressed frame");
+      }
+
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
     }
-    return res;
-}
-
-static void write_ivf_file_header(FILE *outfile,
-                                  const vpx_codec_enc_cfg_t *cfg,
-                                  int frame_cnt) {
-    char header[32];
-
-    if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
-        return;
-    header[0] = 'D';
-    header[1] = 'K';
-    header[2] = 'I';
-    header[3] = 'F';
-    mem_put_le16(header+4,  0);                   /* version */
-    mem_put_le16(header+6,  32);                  /* headersize */
-    mem_put_le32(header+8,  fourcc);              /* headersize */
-    mem_put_le16(header+12, cfg->g_w);            /* width */
-    mem_put_le16(header+14, cfg->g_h);            /* height */
-    mem_put_le32(header+16, cfg->g_timebase.den); /* rate */
-    mem_put_le32(header+20, cfg->g_timebase.num); /* scale */
-    mem_put_le32(header+24, frame_cnt);           /* length */
-    mem_put_le32(header+28, 0);                   /* unused */
-
-    (void) fwrite(header, 1, 32, outfile);
-}
-
-
-static void write_ivf_frame_header(FILE *outfile,
-                                   const vpx_codec_cx_pkt_t *pkt)
-{
-    char             header[12];
-    vpx_codec_pts_t  pts;
-
-    if(pkt->kind != VPX_CODEC_CX_FRAME_PKT)
-        return;
-
-    pts = pkt->data.frame.pts;
-    mem_put_le32(header, pkt->data.frame.sz);
-    mem_put_le32(header+4, pts&0xFFFFFFFF);
-    mem_put_le32(header+8, pts >> 32);
-
-    (void) fwrite(header, 1, 12, outfile);
+  }
 }
 
 int main(int argc, char **argv) {
-    FILE                *infile, *outfile;
-    vpx_codec_ctx_t      codec;
-    vpx_codec_enc_cfg_t  cfg;
-    int                  frame_cnt = 0;
-    vpx_image_t          raw;
-    vpx_codec_err_t      res;
-    long                 width;
-    long                 height;
-    int                  frame_avail;
-    int                  got_data;
-    int                  flags = 0;
+  FILE *infile = NULL;
+  vpx_codec_ctx_t codec = {0};
+  vpx_codec_enc_cfg_t cfg = {0};
+  int frame_count = 0;
+  vpx_image_t raw = {0};
+  vpx_codec_err_t res;
+  VpxVideoInfo info = {0};
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+  const int fps = 2;        // TODO(dkovalev) add command line argument
+  const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
 
-    /* Open files */
-    if(argc!=5)
-        die("Usage: %s <width> <height> <infile> <outfile>\n", argv[0]);
-    width = strtol(argv[1], NULL, 0);
-    height = strtol(argv[2], NULL, 0);
-    if(width < 16 || width%2 || height <16 || height%2)
-        die("Invalid resolution: %ldx%ld", width, height);
-    if(!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 1))
-        die("Faile to allocate image", width, height);
-    if(!(outfile = fopen(argv[4], "wb")))
-        die("Failed to open %s for writing", argv[4]);
+  exec_name = argv[0];
 
-    printf("Using %s\n",vpx_codec_iface_name(interface));
+  if (argc != 5)
+    die("Invalid number of arguments");
 
-    /* Populate encoder configuration */
-    res = vpx_codec_enc_config_default(interface, &cfg, 0);
-    if(res) {
-        printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
-        return EXIT_FAILURE;
+  encoder = get_vpx_encoder_by_name("vp8");  // only vp8 for now
+  if (!encoder)
+    die("Unsupported codec.");
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(argv[1], NULL, 0);
+  info.frame_height = strtol(argv[2], NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                                             info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->interface()));
+
+  res = vpx_codec_enc_config_default(encoder->interface(), &cfg, 0);
+  if (res)
+    die_codec(&codec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+
+  writer = vpx_video_writer_open(argv[4], kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", argv[4]);
+
+  if (!(infile = fopen(argv[3], "rb")))
+    die("Failed to open %s for reading.", argv[3]);
+
+  if (vpx_codec_enc_init(&codec, encoder->interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  while (vpx_img_read(&raw, infile)) {
+    ++frame_count;
+
+    if (frame_count == 22) {
+      set_roi_map(&cfg, &codec);
+    } else if (frame_count == 33) {
+      set_active_map(&cfg, &codec);
+    } else if (frame_count == 44) {
+      unset_active_map(&cfg, &codec);
     }
 
-    /* Update the default configuration with our settings */
-    cfg.rc_target_bitrate = width * height * cfg.rc_target_bitrate
-                            / cfg.g_w / cfg.g_h;
-    cfg.g_w = width;
-    cfg.g_h = height;
+    encode_frame(&codec, &raw, frame_count, writer);
+  }
+  encode_frame(&codec, NULL, -1, writer);
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
 
-    write_ivf_file_header(outfile, &cfg, 0);
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
 
+  vpx_video_writer_close(writer);
 
-        /* Open input file for this encoding pass */
-        if(!(infile = fopen(argv[3], "rb")))
-            die("Failed to open %s for reading", argv[3]);
-
-        /* Initialize codec */
-        if(vpx_codec_enc_init(&codec, interface, &cfg, 0))
-            die_codec(&codec, "Failed to initialize encoder");
-
-        frame_avail = 1;
-        got_data = 0;
-        while(frame_avail || got_data) {
-            vpx_codec_iter_t iter = NULL;
-            const vpx_codec_cx_pkt_t *pkt;
-
-            if(frame_cnt + 1 == 22) {
-                vpx_roi_map_t  roi;
-                unsigned int   i;
-
-                roi.rows = cfg.g_h/16;
-                roi.cols = cfg.g_w/16;
-
-                roi.delta_q[0] = 0;
-                roi.delta_q[1] = -2;
-                roi.delta_q[2] = -4;
-                roi.delta_q[3] = -6;
-
-                roi.delta_lf[0] = 0;
-                roi.delta_lf[1] = 1;
-                roi.delta_lf[2] = 2;
-                roi.delta_lf[3] = 3;
-
-                roi.static_threshold[0] = 1500;
-                roi.static_threshold[1] = 1000;
-                roi.static_threshold[2] =  500;
-                roi.static_threshold[3] =    0;
-
-                /* generate an ROI map for example */
-                roi.roi_map = malloc(roi.rows * roi.cols);
-                for(i=0;i<roi.rows*roi.cols;i++)
-                    roi.roi_map[i] = i & 3;
-
-                if(vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
-                    die_codec(&codec, "Failed to set ROI map");
-
-                free(roi.roi_map);
-            } else if(frame_cnt + 1 == 33) {
-                vpx_active_map_t  active;
-                unsigned int      i;
-
-                active.rows = cfg.g_h/16;
-                active.cols = cfg.g_w/16;
-
-                /* generate active map for example */
-                active.active_map = malloc(active.rows * active.cols);
-                for(i=0;i<active.rows*active.cols;i++)
-                    active.active_map[i] = i & 1;
-
-                if(vpx_codec_control(&codec, VP8E_SET_ACTIVEMAP, &active))
-                    die_codec(&codec, "Failed to set active map");
-
-                free(active.active_map);
-            } else if(frame_cnt + 1 == 44) {
-                vpx_active_map_t  active;
-
-                active.rows = cfg.g_h/16;
-                active.cols = cfg.g_w/16;
-
-                /* pass in null map to disable active_map*/
-                active.active_map = NULL;
-
-                if(vpx_codec_control(&codec, VP8E_SET_ACTIVEMAP, &active))
-                    die_codec(&codec, "Failed to set active map");
-            }
-            frame_avail = read_frame(infile, &raw);
-            if(vpx_codec_encode(&codec, frame_avail? &raw : NULL, frame_cnt,
-                                1, flags, VPX_DL_REALTIME))
-                die_codec(&codec, "Failed to encode frame");
-            got_data = 0;
-            while( (pkt = vpx_codec_get_cx_data(&codec, &iter)) ) {
-                got_data = 1;
-                switch(pkt->kind) {
-                case VPX_CODEC_CX_FRAME_PKT:
-                    write_ivf_frame_header(outfile, pkt);
-                    (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
-                                  outfile);
-                    break;
-                default:
-                    break;
-                }
-                printf(pkt->kind == VPX_CODEC_CX_FRAME_PKT
-                       && (pkt->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
-                fflush(stdout);
-            }
-            frame_cnt++;
-        }
-        printf("\n");
-        fclose(infile);
-
-    printf("Processed %d frames.\n",frame_cnt-1);
-    vpx_img_free(&raw);
-    if(vpx_codec_destroy(&codec))
-        die_codec(&codec, "Failed to destroy codec");
-
-    /* Try to rewrite the file header with the actual frame count */
-    if(!fseek(outfile, 0, SEEK_SET))
-        write_ivf_file_header(outfile, &cfg, frame_cnt-1);
-    fclose(outfile);
-    return EXIT_SUCCESS;
+  return EXIT_SUCCESS;
 }

diff --git a/examples/vp8cx_set_ref.c b/examples/vp8cx_set_ref.c
index 5a67578..f87dd35 100644
--- a/examples/vp8cx_set_ref.c
+++ b/examples/vp8cx_set_ref.c

@@ -139,7 +139,7 @@
         return;
 
     pts = pkt->data.frame.pts;
-    mem_put_le32(header, pkt->data.frame.sz);
+    mem_put_le32(header, (unsigned int)pkt->data.frame.sz);
     mem_put_le32(header+4, pts&0xFFFFFFFF);
     mem_put_le32(header+8, pts >> 32);
 

diff --git a/examples/vpx_temporal_scalable_patterns.c b/examples/vpx_temporal_scalable_patterns.c
index c40450e..32e88e3 100644
--- a/examples/vpx_temporal_scalable_patterns.c
+++ b/examples/vpx_temporal_scalable_patterns.c

@@ -12,6 +12,7 @@
 //  encoding scheme based on temporal scalability for video applications
 //  that benefit from a scalable bitstream.
 
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -31,6 +32,86 @@
 
 static int mode_to_num_layers[12] = {1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3};
 
+// For rate control encoding stats.
+struct RateControlMetrics {
+  // Number of input frames per layer.
+  int layer_input_frames[VPX_TS_MAX_LAYERS];
+  // Total (cumulative) number of encoded frames per layer.
+  int layer_tot_enc_frames[VPX_TS_MAX_LAYERS];
+  // Number of encoded non-key frames per layer.
+  int layer_enc_frames[VPX_TS_MAX_LAYERS];
+  // Framerate per layer layer (cumulative).
+  double layer_framerate[VPX_TS_MAX_LAYERS];
+  // Target average frame size per layer (per-frame-bandwidth per layer).
+  double layer_pfb[VPX_TS_MAX_LAYERS];
+  // Actual average frame size per layer.
+  double layer_avg_frame_size[VPX_TS_MAX_LAYERS];
+  // Average rate mismatch per layer (|target - actual| / target).
+  double layer_avg_rate_mismatch[VPX_TS_MAX_LAYERS];
+  // Actual encoding bitrate per layer (cumulative).
+  double layer_encoding_bitrate[VPX_TS_MAX_LAYERS];
+};
+
+static void set_rate_control_metrics(struct RateControlMetrics *rc,
+                                     vpx_codec_enc_cfg_t *cfg) {
+  unsigned int i = 0;
+  // Set the layer (cumulative) framerate and the target layer (non-cumulative)
+  // per-frame-bandwidth, for the rate control encoding stats below.
+  const double framerate = cfg->g_timebase.den / cfg->g_timebase.num;
+  rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0];
+  rc->layer_pfb[0] = 1000.0 * cfg->ts_target_bitrate[0] /
+      rc->layer_framerate[0];
+  for (i = 0; i < cfg->ts_number_layers; ++i) {
+    if (i > 0) {
+      rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i];
+      rc->layer_pfb[i] = 1000.0 *
+          (cfg->ts_target_bitrate[i] - cfg->ts_target_bitrate[i - 1]) /
+          (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
+    }
+    rc->layer_input_frames[i] = 0;
+    rc->layer_enc_frames[i] = 0;
+    rc->layer_tot_enc_frames[i] = 0;
+    rc->layer_encoding_bitrate[i] = 0.0;
+    rc->layer_avg_frame_size[i] = 0.0;
+    rc->layer_avg_rate_mismatch[i] = 0.0;
+  }
+}
+
+static void printout_rate_control_summary(struct RateControlMetrics *rc,
+                                          vpx_codec_enc_cfg_t *cfg,
+                                          int frame_cnt) {
+  unsigned int i = 0;
+  int check_num_frames = 0;
+  printf("Total number of processed frames: %d\n\n", frame_cnt -1);
+  printf("Rate control layer stats for %d layer(s):\n\n",
+      cfg->ts_number_layers);
+  for (i = 0; i < cfg->ts_number_layers; ++i) {
+    const int num_dropped = (i > 0) ?
+        (rc->layer_input_frames[i] - rc->layer_enc_frames[i]) :
+        (rc->layer_input_frames[i] - rc->layer_enc_frames[i] - 1);
+    rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[i] *
+        rc->layer_encoding_bitrate[i] / rc->layer_tot_enc_frames[i];
+    rc->layer_avg_frame_size[i] = rc->layer_avg_frame_size[i] /
+        rc->layer_enc_frames[i];
+    rc->layer_avg_rate_mismatch[i] = 100.0 * rc->layer_avg_rate_mismatch[i] /
+        rc->layer_enc_frames[i];
+    printf("For layer#: %d \n", i);
+    printf("Bitrate (target vs actual): %d %f \n", cfg->ts_target_bitrate[i],
+           rc->layer_encoding_bitrate[i]);
+    printf("Average frame size (target vs actual): %f %f \n", rc->layer_pfb[i],
+           rc->layer_avg_frame_size[i]);
+    printf("Average rate_mismatch: %f \n", rc->layer_avg_rate_mismatch[i]);
+    printf("Number of input frames, encoded (non-key) frames, "
+        "and perc dropped frames: %d %d %f \n", rc->layer_input_frames[i],
+        rc->layer_enc_frames[i],
+        100.0 * num_dropped / rc->layer_input_frames[i]);
+    check_num_frames += rc->layer_input_frames[i];
+    printf("\n");
+  }
+  if ((frame_cnt - 1) != check_num_frames)
+    die("Error: Number of input frames not equal to output! \n");
+}
+
 // Temporal scaling parameters:
 // NOTE: The 3 prediction frames cannot be used interchangeably due to
 // differences in the way they are handled throughout the code. The
@@ -351,24 +432,24 @@
   int frame_avail;
   int got_data;
   int flags = 0;
-  int i;
+  unsigned int i;
   int pts = 0;  // PTS starts at 0.
   int frame_duration = 1;  // 1 timebase tick per frame.
   int layering_mode = 0;
-  int frames_in_layer[VPX_TS_MAX_LAYERS] = {0};
   int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
   int flag_periodicity = 1;
   int max_intra_size_pct;
   vpx_svc_layer_id_t layer_id = {0, 0};
   const VpxInterface *encoder = NULL;
-  struct VpxInputContext input_ctx = {0};
+  FILE *infile = NULL;
+  struct RateControlMetrics rc;
 
   exec_name = argv[0];
   // Check usage and arguments.
-  if (argc < 10) {
+  if (argc < 11) {
     die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
-        "<rate_num> <rate_den> <mode> <Rate_0> ... <Rate_nlayers-1> \n",
-        argv[0]);
+        "<rate_num> <rate_den>  <frame_drop_threshold> <mode> "
+        "<Rate_0> ... <Rate_nlayers-1> \n", argv[0]);
   }
 
   encoder = get_vpx_encoder_by_name(argv[3]);
@@ -383,12 +464,12 @@
     die("Invalid resolution: %d x %d", width, height);
   }
 
-  layering_mode = strtol(argv[8], NULL, 0);
-  if (layering_mode < 0 || layering_mode > 11) {
-    die("Invalid mode (0..11) %s", argv[8]);
+  layering_mode = strtol(argv[9], NULL, 0);
+  if (layering_mode < 0 || layering_mode > 12) {
+    die("Invalid mode (0..12) %s", argv[9]);
   }
 
-  if (argc != 9 + mode_to_num_layers[layering_mode]) {
+  if (argc != 10 + mode_to_num_layers[layering_mode]) {
     die("Invalid number of arguments");
   }
 
@@ -411,18 +492,18 @@
   cfg.g_timebase.num = strtol(argv[6], NULL, 0);
   cfg.g_timebase.den = strtol(argv[7], NULL, 0);
 
-  for (i = 9; i < 9 + mode_to_num_layers[layering_mode]; ++i) {
-    cfg.ts_target_bitrate[i - 9] = strtol(argv[i], NULL, 0);
+  for (i = 10; (int)i < 10 + mode_to_num_layers[layering_mode]; ++i) {
+    cfg.ts_target_bitrate[i - 10] = strtol(argv[i], NULL, 0);
   }
 
   // Real time parameters.
-  cfg.rc_dropframe_thresh = 0;
+  cfg.rc_dropframe_thresh = strtol(argv[8], NULL, 0);
   cfg.rc_end_usage = VPX_CBR;
   cfg.rc_resize_allowed = 0;
   cfg.rc_min_quantizer = 2;
   cfg.rc_max_quantizer = 56;
-  cfg.rc_undershoot_pct = 100;
-  cfg.rc_overshoot_pct = 15;
+  cfg.rc_undershoot_pct = 50;
+  cfg.rc_overshoot_pct = 50;
   cfg.rc_buf_initial_sz = 500;
   cfg.rc_buf_optimal_sz = 600;
   cfg.rc_buf_sz = 1000;
@@ -435,17 +516,19 @@
   // Disable automatic keyframe placement.
   cfg.kf_min_dist = cfg.kf_max_dist = 3000;
 
-  // Default setting for bitrate: used in special case of 1 layer (case 0).
-  cfg.rc_target_bitrate = cfg.ts_target_bitrate[0];
-
   set_temporal_layer_pattern(layering_mode,
                              &cfg,
                              layer_flags,
                              &flag_periodicity);
 
+  set_rate_control_metrics(&rc, &cfg);
+
+  // Target bandwidth for the whole stream.
+  // Set to ts_target_bitrate for highest layer (total bitrate).
+  cfg.rc_target_bitrate = cfg.ts_target_bitrate[cfg.ts_number_layers - 1];
+
   // Open input file.
-  input_ctx.filename = argv[1];
-  if (!(input_ctx.file = fopen(input_ctx.filename, "rb"))) {
+  if (!(infile = fopen(argv[1], "rb"))) {
     die("Failed to open %s for reading", argv[1]);
   }
 
@@ -494,9 +577,13 @@
     layer_id.spatial_layer_id = 0;
     layer_id.temporal_layer_id =
         cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
-    vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
+    if (strncmp(encoder->name, "vp9", 3) == 0) {
+      vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
+    }
     flags = layer_flags[frame_cnt % flag_periodicity];
-    frame_avail = !read_yuv_frame(&input_ctx, &raw);
+    frame_avail = vpx_img_read(&raw, infile);
+    if (frame_avail)
+      ++rc.layer_input_frames[layer_id.temporal_layer_id];
     if (vpx_codec_encode(&codec, frame_avail? &raw : NULL, pts, 1, flags,
         VPX_DL_REALTIME)) {
       die_codec(&codec, "Failed to encode frame");
@@ -514,7 +601,17 @@
               i < cfg.ts_number_layers; ++i) {
             vpx_video_writer_write_frame(outfile[i], pkt->data.frame.buf,
                                          pkt->data.frame.sz, pts);
-            ++frames_in_layer[i];
+            ++rc.layer_tot_enc_frames[i];
+            rc.layer_encoding_bitrate[i] += 8.0 * pkt->data.frame.sz;
+            // Keep count of rate control stats per layer (for non-key frames).
+            if (i == cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity] &&
+                !(pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
+              rc.layer_avg_frame_size[i] += 8.0 * pkt->data.frame.sz;
+              rc.layer_avg_rate_mismatch[i] +=
+                  fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[i]) /
+                  rc.layer_pfb[i];
+              ++rc.layer_enc_frames[i];
+            }
           }
           break;
           default:
@@ -524,8 +621,9 @@
     ++frame_cnt;
     pts += frame_duration;
   }
-  fclose(input_ctx.file);
-  printf("Processed %d frames: \n", frame_cnt - 1);
+  fclose(infile);
+  printout_rate_control_summary(&rc, &cfg, frame_cnt);
+
   if (vpx_codec_destroy(&codec))
     die_codec(&codec, "Failed to destroy codec");
 

diff --git a/test/codec_factory.h b/test/codec_factory.h
index 80e87c8..7f9398c 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h

@@ -24,6 +24,8 @@
 #include "test/encode_test_driver.h"
 namespace libvpx_test {
 
+const int kCodecFactoryParam = 0;
+
 class CodecFactory {
  public:
   CodecFactory() {}

diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 5b0a548..5ae76d7 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc

@@ -198,6 +198,7 @@
     last_pts_ = 0;
     bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
     frame_number_ = 0;
+    tot_frame_number_ = 0;
     first_drop_ = 0;
     num_drops_ = 0;
     // For testing up to 3 layers.
@@ -294,11 +295,22 @@
 
 
   virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    int layer = SetLayerId(frame_number_, cfg_.ts_number_layers);
-
     // Time since last timestamp = duration.
     vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
 
+    if (duration > 1) {
+      // If first drop not set and we have a drop set it to this time.
+      if (!first_drop_)
+        first_drop_ = last_pts_ + 1;
+      // Update the number of frame drops.
+      num_drops_ += static_cast<int>(duration - 1);
+      // Update counter for total number of frames (#frames input to encoder).
+      // Needed for setting the proper layer_id below.
+      tot_frame_number_ += static_cast<int>(duration - 1);
+    }
+
+    int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
+
     // Add to the buffer the bits we'd expect from a constant bitrate server.
     bits_in_buffer_model_ += static_cast<int64_t>(
         duration * timebase_ * cfg_.rc_target_bitrate * 1000);
@@ -315,18 +327,10 @@
       bits_total_[i] += frame_size_in_bits;
     }
 
-    // If first drop not set and we have a drop set it to this time.
-    if (!first_drop_ && duration > 1)
-      first_drop_ = last_pts_ + 1;
-
-    // Update the number of frame drops.
-    if (duration > 1) {
-      num_drops_ += static_cast<int>(duration - 1);
-    }
-
     // Update the most recent pts.
     last_pts_ = pkt->data.frame.pts;
     ++frame_number_;
+    ++tot_frame_number_;
   }
 
   virtual void EndPassHook(void) {
@@ -342,7 +346,8 @@
 
   vpx_codec_pts_t last_pts_;
   double timebase_;
-  int frame_number_;
+  int frame_number_;      // Counter for number of non-dropped/encoded frames.
+  int tot_frame_number_;  // Counter for total number of input frames.
   int64_t bits_total_[3];
   double duration_;
   double effective_datarate_[3];
@@ -493,10 +498,7 @@
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
-  // TODO(marpan): For now keep frame dropper off. Need to investigate an
-  // issue (rate-mismatch) that occcurs at speed 3 and low bitrate (200k) when
-  // frame dropper is on.
-  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_dropframe_thresh = 1;
   cfg_.rc_min_quantizer = 0;
   cfg_.rc_max_quantizer = 63;
   cfg_.rc_end_usage = VPX_CBR;
@@ -529,8 +531,53 @@
     }
   }
 }
+
+// Check basic rate targeting for 3 temporal layers, with frame dropping.
+// Only for one (low) bitrate with lower max_quantizer, and somewhat higher
+// frame drop threshold, to force frame dropping.
+TEST_P(DatarateTestVP9, BasicRateTargeting3TemporalLayersFrameDropping) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  // Set frame drop threshold and rc_max_quantizer to force some frame drops.
+  cfg_.rc_dropframe_thresh = 20;
+  cfg_.rc_max_quantizer = 45;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  // 40-20-40 bitrate allocation for 3 temporal layers.
+  cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.85)
+        << " The datarate for the file is lower than target by too much, "
+            "for layer: " << j;
+    ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.15)
+        << " The datarate for the file is greater than target by too much, "
+            "for layer: " << j;
+    // Expect some frame drops in this test: for this 200 frames test,
+    // expect at least 10% and not more than 50% drops.
+    ASSERT_GE(num_drops_, 20);
+    ASSERT_LE(num_drops_, 100);
+  }
+}
+
 VP8_INSTANTIATE_TEST_CASE(DatarateTest, ALL_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9,
                           ::testing::Values(::libvpx_test::kOnePassGood),
-                          ::testing::Range(1, 5));
+                          ::testing::Range(2, 5));
 }  // namespace

diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 44a0fc0..2734a45 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h

@@ -76,6 +76,15 @@
     return detail ? detail : vpx_codec_error(&decoder_);
   }
 
+  // Passes the external frame buffer information to libvpx.
+  vpx_codec_err_t SetFrameBufferFunctions(
+      vpx_get_frame_buffer_cb_fn_t cb_get,
+      vpx_release_frame_buffer_cb_fn_t cb_release, void *user_priv) {
+    InitOnce();
+    return vpx_codec_set_frame_buffer_functions(
+        &decoder_, cb_get, cb_release, user_priv);
+  }
+
  protected:
   virtual vpx_codec_iface_t* CodecInterface() const = 0;
 

diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
new file mode 100644
index 0000000..2e7adc1
--- /dev/null
+++ b/test/external_frame_buffer_test.cc

@@ -0,0 +1,466 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+
+namespace {
+
+const int kVideoNameParam = 1;
+const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
+
+struct ExternalFrameBuffer {
+  uint8_t *data;
+  size_t size;
+  int in_use;
+};
+
+// Class to manipulate a list of external frame buffers.
+class ExternalFrameBufferList {
+ public:
+  ExternalFrameBufferList()
+      : num_buffers_(0),
+        ext_fb_list_(NULL) {}
+
+  virtual ~ExternalFrameBufferList() {
+    for (int i = 0; i < num_buffers_; ++i) {
+      delete [] ext_fb_list_[i].data;
+    }
+    delete [] ext_fb_list_;
+  }
+
+  // Creates the list to hold the external buffers. Returns true on success.
+  bool CreateBufferList(int num_buffers) {
+    if (num_buffers < 0)
+      return false;
+
+    num_buffers_ = num_buffers;
+    ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
+    EXPECT_TRUE(ext_fb_list_ != NULL);
+    memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
+    return true;
+  }
+
+  // Searches the frame buffer list for a free frame buffer. Makes sure
+  // that the frame buffer is at least |min_size| in bytes. Marks that the
+  // frame buffer is in use by libvpx. Finally sets |fb| to point to the
+  // external frame buffer. Returns < 0 on an error.
+  int GetFreeFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    const int idx = FindFreeBufferIndex();
+    if (idx == num_buffers_)
+      return -1;
+
+    if (ext_fb_list_[idx].size < min_size) {
+      delete [] ext_fb_list_[idx].data;
+      ext_fb_list_[idx].data = new uint8_t[min_size];
+      ext_fb_list_[idx].size = min_size;
+    }
+
+    SetFrameBuffer(idx, fb);
+    return 0;
+  }
+
+  // Test function that will not allocate any data for the frame buffer.
+  // Returns < 0 on an error.
+  int GetZeroFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    const int idx = FindFreeBufferIndex();
+    if (idx == num_buffers_)
+      return -1;
+
+    if (ext_fb_list_[idx].size < min_size) {
+      delete [] ext_fb_list_[idx].data;
+      ext_fb_list_[idx].data = NULL;
+      ext_fb_list_[idx].size = min_size;
+    }
+
+    SetFrameBuffer(idx, fb);
+    return 0;
+  }
+
+  // Marks the external frame buffer that |fb| is pointing too as free.
+  // Returns < 0 on an error.
+  int ReturnFrameBuffer(vpx_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer*>(fb->priv);
+    EXPECT_TRUE(ext_fb != NULL);
+    EXPECT_EQ(1, ext_fb->in_use);
+    ext_fb->in_use = 0;
+    return 0;
+  }
+
+  // Checks that the ximage data is contained within the external frame buffer
+  // private data passed back in the ximage.
+  void CheckXImageFrameBuffer(const vpx_image_t *img) {
+    if (img->fb_priv != NULL) {
+      const struct ExternalFrameBuffer *const ext_fb =
+          reinterpret_cast<ExternalFrameBuffer*>(img->fb_priv);
+
+      ASSERT_TRUE(img->planes[0] >= ext_fb->data &&
+                  img->planes[0] < (ext_fb->data + ext_fb->size));
+    }
+  }
+
+ private:
+  // Returns the index of the first free frame buffer. Returns |num_buffers_|
+  // if there are no free frame buffers.
+  int FindFreeBufferIndex() {
+    int i;
+    // Find a free frame buffer.
+    for (i = 0; i < num_buffers_; ++i) {
+      if (!ext_fb_list_[i].in_use)
+        break;
+    }
+    return i;
+  }
+
+  // Sets |fb| to an external frame buffer. idx is the index into the frame
+  // buffer list.
+  void SetFrameBuffer(int idx, vpx_codec_frame_buffer_t *fb) {
+    ASSERT_TRUE(fb != NULL);
+    fb->data = ext_fb_list_[idx].data;
+    fb->size = ext_fb_list_[idx].size;
+    ASSERT_EQ(0, ext_fb_list_[idx].in_use);
+    ext_fb_list_[idx].in_use = 1;
+    fb->priv = &ext_fb_list_[idx];
+  }
+
+  int num_buffers_;
+  ExternalFrameBuffer *ext_fb_list_;
+};
+
+// Callback used by libvpx to request the application to return a frame
+// buffer of at least |min_size| in bytes.
+int get_vp9_frame_buffer(void *user_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->GetFreeFrameBuffer(min_size, fb);
+}
+
+// Callback used by libvpx to tell the application that |fb| is not needed
+// anymore.
+int release_vp9_frame_buffer(void *user_priv,
+                             vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->ReturnFrameBuffer(fb);
+}
+
+// Callback will not allocate data for frame buffer.
+int get_vp9_zero_frame_buffer(void *user_priv, size_t min_size,
+                              vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->GetZeroFrameBuffer(min_size, fb);
+}
+
+// Callback will allocate one less byte than |min_size|.
+int get_vp9_one_less_byte_frame_buffer(void *user_priv, size_t min_size,
+                                       vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->GetFreeFrameBuffer(min_size - 1, fb);
+}
+
+// Callback will not release the external frame buffer.
+int do_not_release_vp9_frame_buffer(void *user_priv,
+                                    vpx_codec_frame_buffer_t *fb) {
+  (void)user_priv;
+  (void)fb;
+  return 0;
+}
+
+// Class for testing passing in external frame buffers to libvpx.
+class ExternalFrameBufferMD5Test
+    : public ::libvpx_test::DecoderTest,
+      public ::libvpx_test::CodecTestWithParam<const char*> {
+ protected:
+  ExternalFrameBufferMD5Test()
+      : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)),
+        md5_file_(NULL),
+        num_buffers_(0) {}
+
+  virtual ~ExternalFrameBufferMD5Test() {
+    if (md5_file_ != NULL)
+      fclose(md5_file_);
+  }
+
+  virtual void PreDecodeFrameHook(
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    if (num_buffers_ > 0 && video.frame_number() == 0) {
+      // Have libvpx use frame buffers we create.
+      ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+      ASSERT_EQ(VPX_CODEC_OK,
+                decoder->SetFrameBufferFunctions(
+                    GetVp9FrameBuffer, ReleaseVP9FrameBuffer, this));
+    }
+  }
+
+  void OpenMD5File(const std::string &md5_file_name_) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
+        << md5_file_name_;
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     const unsigned int frame_number) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+
+    // Read correct md5 checksums.
+    const int res = fscanf(md5_file_, "%s  %s", expected_md5, junk);
+    ASSERT_NE(EOF, res) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    const char *const actual_md5 = md5_res.Get();
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+  }
+
+  // Callback to get a free external frame buffer. Return value < 0 is an
+  // error.
+  static int GetVp9FrameBuffer(void *user_priv, size_t min_size,
+                               vpx_codec_frame_buffer_t *fb) {
+    ExternalFrameBufferMD5Test *const md5Test =
+        reinterpret_cast<ExternalFrameBufferMD5Test*>(user_priv);
+    return md5Test->fb_list_.GetFreeFrameBuffer(min_size, fb);
+  }
+
+  // Callback to release an external frame buffer. Return value < 0 is an
+  // error.
+  static int ReleaseVP9FrameBuffer(void *user_priv,
+                                   vpx_codec_frame_buffer_t *fb) {
+    ExternalFrameBufferMD5Test *const md5Test =
+        reinterpret_cast<ExternalFrameBufferMD5Test*>(user_priv);
+    return md5Test->fb_list_.ReturnFrameBuffer(fb);
+  }
+
+  void set_num_buffers(int num_buffers) { num_buffers_ = num_buffers; }
+  int num_buffers() const { return num_buffers_; }
+
+ private:
+  FILE *md5_file_;
+  int num_buffers_;
+  ExternalFrameBufferList fb_list_;
+};
+
+// Class for testing passing in external frame buffers to libvpx.
+class ExternalFrameBufferTest : public ::testing::Test {
+ protected:
+  ExternalFrameBufferTest()
+      : video_(NULL),
+        decoder_(NULL),
+        num_buffers_(0) {}
+
+  virtual void SetUp() {
+    video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    vpx_codec_dec_cfg_t cfg = {0};
+    decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+  }
+
+  virtual void TearDown() {
+    delete decoder_;
+    delete video_;
+  }
+
+  // Passes the external frame buffer information to libvpx.
+  vpx_codec_err_t SetFrameBufferFunctions(
+      int num_buffers,
+      vpx_get_frame_buffer_cb_fn_t cb_get,
+      vpx_release_frame_buffer_cb_fn_t cb_release) {
+    if (num_buffers > 0) {
+      num_buffers_ = num_buffers;
+      EXPECT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+    }
+
+    return decoder_->SetFrameBufferFunctions(cb_get, cb_release, &fb_list_);
+  }
+
+  vpx_codec_err_t DecodeOneFrame() {
+    const vpx_codec_err_t res =
+        decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+    CheckDecodedFrames();
+    if (res == VPX_CODEC_OK)
+      video_->Next();
+    return res;
+  }
+
+  vpx_codec_err_t DecodeRemainingFrames() {
+    for (; video_->cxdata() != NULL; video_->Next()) {
+      const vpx_codec_err_t res =
+          decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+      if (res != VPX_CODEC_OK)
+        return res;
+      CheckDecodedFrames();
+    }
+    return VPX_CODEC_OK;
+  }
+
+ private:
+  void CheckDecodedFrames() {
+    libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
+    const vpx_image_t *img = NULL;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next()) != NULL) {
+      fb_list_.CheckXImageFrameBuffer(img);
+    }
+  }
+
+  libvpx_test::WebMVideoSource *video_;
+  libvpx_test::VP9Decoder *decoder_;
+  int num_buffers_;
+  ExternalFrameBufferList fb_list_;
+};
+
+// This test runs through the set of test vectors, and decodes them.
+// Libvpx will call into the application to allocate a frame buffer when
+// needed. The md5 checksums are computed for each frame in the video file.
+// If md5 checksums match the correct md5 data, then the test is passed.
+// Otherwise, the test failed.
+TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
+  const std::string filename = GET_PARAM(kVideoNameParam);
+  libvpx_test::CompressedVideoSource *video = NULL;
+
+  // Number of buffers equals #VP9_MAXIMUM_REF_BUFFERS +
+  // #VPX_MAXIMUM_WORK_BUFFERS + four jitter buffers.
+  const int jitter_buffers = 4;
+  const int num_buffers =
+      VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+  set_num_buffers(num_buffers);
+
+#if CONFIG_VP8_DECODER
+  // Tell compiler we are not using kVP8TestVectors.
+  (void)libvpx_test::kVP8TestVectors;
+#endif
+
+  // Open compressed video file.
+  if (filename.substr(filename.length() - 3, 3) == "ivf") {
+    video = new libvpx_test::IVFVideoSource(filename);
+  } else {
+    video = new libvpx_test::WebMVideoSource(filename);
+  }
+  ASSERT_TRUE(video != NULL);
+  video->Init();
+
+  // Construct md5 file name.
+  const std::string md5_filename = filename + ".md5";
+  OpenMD5File(md5_filename);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  delete video;
+}
+
+TEST_F(ExternalFrameBufferTest, MinFrameBuffers) {
+  // Minimum number of external frame buffers for VP9 is
+  // #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS.
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, EightJitterBuffers) {
+  // Number of buffers equals #VP9_MAXIMUM_REF_BUFFERS +
+  // #VPX_MAXIMUM_WORK_BUFFERS + eight jitter buffers.
+  const int jitter_buffers = 8;
+  const int num_buffers =
+      VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) {
+  // Minimum number of external frame buffers for VP9 is
+  // #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS. Most files will
+  // only use 5 frame buffers at one time.
+  const int num_buffers = 2;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NoRelease) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
+                                    do_not_release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NullRealloc) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(num_buffers, get_vp9_zero_frame_buffer,
+                                    release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, ReallocOneLessByte) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_one_less_byte_frame_buffer,
+                release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, NullGetFunction) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_INVALID_PARAM,
+            SetFrameBufferFunctions(num_buffers, NULL,
+                                    release_vp9_frame_buffer));
+}
+
+TEST_F(ExternalFrameBufferTest, NullReleaseFunction) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_INVALID_PARAM,
+            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, NULL));
+}
+
+TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());
+  ASSERT_EQ(VPX_CODEC_ERROR,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+}
+
+VP9_INSTANTIATE_TEST_CASE(ExternalFrameBufferMD5Test,
+                          ::testing::ValuesIn(libvpx_test::kVP9TestVectors));
+}  // namespace

diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h
index 3fbafbd..824a39d 100644
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h

@@ -94,14 +94,14 @@
   virtual const uint8_t *cxdata() const {
     return end_of_file_ ? NULL : compressed_frame_buf_;
   }
-  virtual const unsigned int frame_size() const { return frame_sz_; }
-  virtual const unsigned int frame_number() const { return frame_; }
+  virtual size_t frame_size() const { return frame_sz_; }
+  virtual unsigned int frame_number() const { return frame_; }
 
  protected:
   std::string file_name_;
   FILE *input_file_;
   uint8_t *compressed_frame_buf_;
-  unsigned int frame_sz_;
+  size_t frame_sz_;
   unsigned int frame_;
   bool end_of_file_;
 };

diff --git a/test/test.mk b/test/test.mk
index 624a621..cabfc67 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -36,6 +36,7 @@
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
 
 ## WebM Parsing
 NESTEGG_SRCS                           += ../nestegg/halloc/halloc.h

diff --git a/test/video_source.h b/test/video_source.h
index 3d01d39..6d1855a 100644
--- a/test/video_source.h
+++ b/test/video_source.h

@@ -184,9 +184,9 @@
 
   virtual const uint8_t *cxdata() const = 0;
 
-  virtual const unsigned int frame_size() const = 0;
+  virtual size_t frame_size() const = 0;
 
-  virtual const unsigned int frame_number() const = 0;
+  virtual unsigned int frame_number() const = 0;
 };
 
 }  // namespace libvpx_test

diff --git a/test/webm_video_source.h b/test/webm_video_source.h
index 53b0ba2..f21cf98 100644
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h

@@ -169,8 +169,8 @@
   virtual const uint8_t *cxdata() const {
     return end_of_file_ ? NULL : buf_;
   }
-  virtual const unsigned int frame_size() const { return buf_sz_; }
-  virtual const unsigned int frame_number() const { return frame_; }
+  virtual size_t frame_size() const { return buf_sz_; }
+  virtual unsigned int frame_number() const { return frame_; }
 
  protected:
   std::string file_name_;

diff --git a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
index dc84c30..3991957 100644
--- a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm

@@ -53,7 +53,7 @@
     orr     r6, r6, r7          ; differences of all 4 pixels
     ; calculate total sum
     adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; substract negative differences from sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -77,7 +77,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -101,7 +101,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -127,7 +127,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords

diff --git a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
index adc353d..915ee49 100644
--- a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm

@@ -51,7 +51,7 @@
     orr     r8, r8, r10         ; differences of all 4 pixels
     ; calculate total sum
     add    r4, r4, r6           ; add positive differences to sum
-    sub    r4, r4, r7           ; substract negative differences from sum
+    sub    r4, r4, r7           ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r7, r8              ; byte (two pixels) to halfwords
@@ -77,7 +77,7 @@
 
     ; calculate total sum
     add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; substract negative differences from sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r7, r8              ; byte (two pixels) to halfwords

diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
index dd2ce68..3668dc5 100644
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm

@@ -58,7 +58,7 @@
     orr     r6, r6, r7          ; differences of all 4 pixels
     ; calculate total sum
     adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; substract negative differences from sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -89,7 +89,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -120,7 +120,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -153,7 +153,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords

diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
index f972d9b..b4e0959 100644
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm

@@ -69,7 +69,7 @@
     orr     r6, r6, r7          ; differences of all 4 pixels
     ; calculate total sum
     adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; substract negative differences from sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -111,7 +111,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -153,7 +153,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -195,7 +195,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords

diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
index f5da9c0..10863e2 100644
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm

@@ -59,7 +59,7 @@
     orr     r6, r6, r7          ; differences of all 4 pixels
     ; calculate total sum
     adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; substract negative differences from sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -90,7 +90,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -121,7 +121,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords
@@ -154,7 +154,7 @@
 
     ; calculate total sum
     add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; substract negative differences from sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
 
     ; calculate sse
     uxtb16  r5, r6              ; byte (two pixels) to halfwords

diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
index f388d24..88a07b9 100644
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm

@@ -527,7 +527,7 @@
         pxor        mm7,        [GLOBAL(t80)]       ; unoffset
         ; mm7 = q1
 
-        ; tranpose and write back
+        ; transpose and write back
         ; mm1 =    72 62 52 42 32 22 12 02
         ; mm6 =    73 63 53 43 33 23 13 03
         ; mm3 =    74 64 54 44 34 24 14 04
@@ -1289,7 +1289,7 @@
         pxor        mm6, [GLOBAL(t80)]          ; mm6 = 71 61 51 41 31 21 11 01
         pxor        mm3, [GLOBAL(t80)]          ; mm3 = 76 66 56 46 36 26 15 06
 
-        ; tranpose and write back
+        ; transpose and write back
         movq        mm0,    [rdx]               ; mm0 = 70 60 50 40 30 20 10 00
         movq        mm1,    mm0                 ; mm0 = 70 60 50 40 30 20 10 00
 

diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index a66753b..1913abc 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm

@@ -958,7 +958,7 @@
         ; start work on filters
         B_FILTER 2
 
-        ; tranpose and write back - only work on q1, q0, p0, p1
+        ; transpose and write back - only work on q1, q0, p0, p1
         BV_TRANSPOSE
         ; store 16-line result
 
@@ -1023,7 +1023,7 @@
         ; start work on filters
         B_FILTER 2
 
-        ; tranpose and write back - only work on q1, q0, p0, p1
+        ; transpose and write back - only work on q1, q0, p0, p1
         BV_TRANSPOSE
 
         lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing

diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index afcda9f..98e5a71 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c

@@ -2444,10 +2444,10 @@
         find_next_key_frame(cpi, &this_frame_copy);
 
         /* Special case: Error error_resilient_mode mode does not make much
-         * sense for two pass but with its current meaning but this code is
+         * sense for two pass but with its current meaning this code is
          * designed to stop outlandish behaviour if someone does set it when
          * using two pass. It effectively disables GF groups. This is
-         * temporary code till we decide what should really happen in this
+         * temporary code until we decide what should really happen in this
          * case.
          */
         if (cpi->oxcf.error_resilient_mode)
@@ -2773,7 +2773,7 @@
         kf_group_intra_err += this_frame->intra_error;
         kf_group_coded_err += this_frame->coded_error;
 
-        /* load a the next frame's stats */
+        /* Load the next frame's stats. */
         vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
         input_stats(cpi, this_frame);
 

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 07138ec..7f39646 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c

@@ -2681,8 +2681,8 @@
     VP8_COMMON *cm = &cpi->common;
 
     /* Do we need to apply resampling for one pass cbr.
-     * In one pass this is more limited than in two pass cbr
-     * The test and any change is only made one per key frame sequence
+     * In one pass this is more limited than in two pass cbr.
+     * The test and any change is only made once per key frame sequence.
      */
     if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER))
     {
@@ -2705,7 +2705,7 @@
             cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
         }
 
-        /* Get the new hieght and width */
+        /* Get the new height and width */
         Scale2Ratio(cm->horiz_scale, &hr, &hs);
         Scale2Ratio(cm->vert_scale, &vr, &vs);
         new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;

diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 871b8d3..0b4c4cb 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c

@@ -929,6 +929,7 @@
         vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
         vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
         vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
+        NOT_IMPLEMENTED,
     },
     { /* encoder functions */
         NOT_IMPLEMENTED,

diff --git a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
index 388a7d7..72e933e 100644
--- a/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm

@@ -72,7 +72,7 @@
     ;   reg1 = output[first_offset]
     ;   reg2 = output[second_offset]
     ;   for proper address calculation, the last offset used when manipulating
-    ;   output, wethere reading or storing) must be passed in. use 0 for first
+    ;   output, whether reading or storing) must be passed in. use 0 for first
     ;   use.
     MACRO
     LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
@@ -88,7 +88,7 @@
     ;   output[first_offset] = reg1
     ;   output[second_offset] = reg2
     ;   for proper address calculation, the last offset used when manipulating
-    ;   output, wethere reading or storing) must be passed in. use 0 for first
+    ;   output, whether reading or storing) must be passed in. use 0 for first
     ;   use.
     MACRO
     STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
@@ -242,7 +242,7 @@
     ; TODO(cd): have special case to re-use constants when they are similar for
     ;           consecutive butterflies
     ; TODO(cd): have special case when both constants are the same, do the
-    ;           additions/substractions before the multiplies.
+    ;           additions/subtractions before the multiplies.
     ; generate the constants
     ;   generate scalar constants
     mov             r8,  #$first_constant  & 0xFF00
@@ -260,7 +260,7 @@
     vmull.s16 q11, $regB, d31
     vmull.s16 q12, $regC, d31
     ; (used) five for intermediate (q8-q12), one for constants (q15)
-    ; do some addition/substractions (to get back two register)
+    ; do some addition/subtractions (to get back two register)
     vsub.s32  q8, q8, q10
     vsub.s32  q9, q9, q11
     ; do more multiplications (ordered for maximum latency hiding)
@@ -268,7 +268,7 @@
     vmull.s16 q11, $regA, d30
     vmull.s16 q15, $regB, d30
     ; (used) six for intermediate (q8-q12, q15)
-    ; do more addition/substractions
+    ; do more addition/subtractions
     vadd.s32  q11, q12, q11
     vadd.s32  q10, q10, q15
     ; (used) four for intermediate (q8-q11)

diff --git a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
index 8cb913c..5fe2bba 100644
--- a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
+++ b/vp9/common/arm/neon/vp9_mb_lpf_neon.asm

@@ -439,6 +439,9 @@
     tst         r7, #1
     bxne        lr
 
+    orrs        r5, r5, r6                 ; Check for 0
+    orreq       r7, r7, #2                 ; Only do mbfilter branch
+
     ; mbfilter flat && mask branch
     ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's
     ; and using vibt on the q's?

diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 4a49964..dc9856f 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm

@@ -315,8 +315,8 @@
     vdup.u16            q2, r2
     vadd.s16            q1, q1, q3
     vadd.s16            q2, q2, q3
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vqmovun.s16         d0, q1
+    vqmovun.s16         d1, q2
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d1[0]}, [r0], r1
 
@@ -327,8 +327,8 @@
     vdup.u16            q2, r2
     vadd.s16            q1, q1, q3
     vadd.s16            q2, q2, q3
-    vqshrun.s16         d0, q1, #0
-    vqshrun.s16         d1, q2, #0
+    vqmovun.s16         d0, q1
+    vqmovun.s16         d1, q2
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d1[0]}, [r0], r1
     bx                  lr
@@ -372,10 +372,10 @@
     vadd.s16            q8, q3, q8
     vadd.s16            q9, q3, q9
 
-    vqshrun.s16         d0, q0, #0
-    vqshrun.s16         d1, q1, #0
-    vqshrun.s16         d2, q8, #0
-    vqshrun.s16         d3, q9, #0
+    vqmovun.s16         d0, q0
+    vqmovun.s16         d1, q1
+    vqmovun.s16         d2, q8
+    vqmovun.s16         d3, q9
 
     vst1.64             {d0}, [r0], r1
     vst1.64             {d1}, [r0], r1
@@ -394,10 +394,10 @@
     vadd.s16            q8, q3, q8
     vadd.s16            q9, q3, q9
 
-    vqshrun.s16         d0, q0, #0
-    vqshrun.s16         d1, q1, #0
-    vqshrun.s16         d2, q8, #0
-    vqshrun.s16         d3, q9, #0
+    vqmovun.s16         d0, q0
+    vqmovun.s16         d1, q1
+    vqmovun.s16         d2, q8
+    vqmovun.s16         d3, q9
 
     vst1.64             {d0}, [r0], r1
     vst1.64             {d1}, [r0], r1
@@ -445,10 +445,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vdup.16             q0, d20[2]                  ; proload next 2 rows data
     vdup.16             q8, d20[3]
     vst1.64             {d2,d3}, [r0], r1
@@ -459,10 +459,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vdup.16             q0, d21[0]                  ; proload next 2 rows data
     vdup.16             q8, d21[1]
     vst1.64             {d2,d3}, [r0], r1
@@ -472,10 +472,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vdup.16             q0, d21[2]                  ; proload next 2 rows data
     vdup.16             q8, d21[3]
     vst1.64             {d2,d3}, [r0], r1
@@ -486,12 +486,10 @@
     vadd.s16            q0, q0, q3
     vadd.s16            q11, q8, q2
     vadd.s16            q8, q8, q3
-    vqshrun.s16         d2, q1, #0
-    vqshrun.s16         d3, q0, #0
-    vqshrun.s16         d22, q11, #0
-    vqshrun.s16         d23, q8, #0
-    vdup.16             q0, d20[2]
-    vdup.16             q8, d20[3]
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
     vld1.8              {d18}, [r3]!                  ; preload 8 left into r12
     vmovl.u8            q10, d18
     vst1.64             {d2,d3}, [r0], r1
@@ -544,19 +542,19 @@
     vadd.s16            q13, q0, q9
     vadd.s16            q14, q0, q10
     vadd.s16            q15, q0, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vdup.16             q1, d6[2]
     vdup.16             q2, d6[3]
     vst1.64             {d24-d27}, [r0], r1
@@ -566,19 +564,19 @@
     vadd.s16            q13, q1, q9
     vadd.s16            q14, q1, q10
     vadd.s16            q15, q1, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vdup.16             q0, d7[0]
     vdup.16             q2, d7[1]
     vst1.64             {d24-d27}, [r0], r1
@@ -588,19 +586,19 @@
     vadd.s16            q13, q0, q9
     vadd.s16            q14, q0, q10
     vadd.s16            q15, q0, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vdup.16             q0, d7[2]
     vdup.16             q2, d7[3]
     vst1.64             {d24-d27}, [r0], r1
@@ -610,20 +608,20 @@
     vadd.s16            q13, q0, q9
     vadd.s16            q14, q0, q10
     vadd.s16            q15, q0, q11
-    vqshrun.s16         d0, q12, #0
-    vqshrun.s16         d1, q13, #0
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
     vadd.s16            q12, q2, q8
     vadd.s16            q13, q2, q9
-    vqshrun.s16         d2, q14, #0
-    vqshrun.s16         d3, q15, #0
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
     vadd.s16            q14, q2, q10
     vadd.s16            q15, q2, q11
     vst1.64             {d0-d3}, [r0], r1
-    vqshrun.s16         d24, q12, #0
-    vqshrun.s16         d25, q13, #0
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
     vld1.8              {d0}, [r3]!                   ; preload 8 left pixels
-    vqshrun.s16         d26, q14, #0
-    vqshrun.s16         d27, q15, #0
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
     vmovl.u8            q3, d0
     vst1.64             {d24-d27}, [r0], r1
 

diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index c22c621..6086323 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h

@@ -128,7 +128,7 @@
 
   uint8_t mode_context[MAX_REF_FRAMES];
 
-  unsigned char skip_coeff;    // 0=need to decode coeffs, 1=no coefficients
+  unsigned char skip;    // 0=need to decode coeffs, 1=no coefficients
   unsigned char segment_id;    // Segment id for this block.
 
   // Flags used for prediction status of various bit-stream signals

diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index 355ac1a..24c785f 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c

@@ -58,7 +58,7 @@
 
   print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
   print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
-  print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff));
+  print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip));
   print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
   print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
   print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));

diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 13e954e..bc12f9a 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c

@@ -16,7 +16,7 @@
 #include "vpx/vpx_integer.h"
 
 
-DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]) = {
+const uint8_t vp9_coefband_trans_8x8plus[1024] = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
   4, 4, 4, 4, 4, 5,
   // beyond MAXBAND_INDEX+1 all values are filled as 5
@@ -85,11 +85,11 @@
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 };
 
-DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]) = {
+const uint8_t vp9_coefband_trans_4x4[16] = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
 };
 
-DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]) = {
+const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = {
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 

diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index d6b380f..aab8b53 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h

@@ -42,7 +42,7 @@
 
 #define ENTROPY_NODES 11
 
-extern DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]);
 
 #define EOB_MODEL_TOKEN 3
 extern const vp9_tree_index vp9_coefmodel_tree[];
@@ -116,8 +116,8 @@
 // This macro is currently unused but may be used by certain implementations
 #define MAXBAND_INDEX 21
 
-extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]);
-extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]);
 
 static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
   return tx_size == TX_4X4 ? vp9_coefband_trans_4x4

diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 25cba7f..8921539 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c

@@ -465,8 +465,10 @@
     cm->frame_contexts[cm->frame_context_idx] = cm->fc;
   }
 
-  vpx_memset(cm->prev_mip, 0,
-             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
+  if (frame_is_intra_only(cm))
+    vpx_memset(cm->prev_mip, 0,
+               cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
+
   vpx_memset(cm->mip, 0,
              cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
 

diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 546f603..7474a88 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c

@@ -10,12 +10,9 @@
 
 #include <assert.h>
 
-#include "vpx_ports/mem.h"
-
 #include "vp9/common/vp9_filter.h"
 
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_bilinear_filters[SUBPEL_SHIFTS]) = {
+const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = {
   { 0, 0, 0, 128,   0, 0, 0, 0 },
   { 0, 0, 0, 120,   8, 0, 0, 0 },
   { 0, 0, 0, 112,  16, 0, 0, 0 },
@@ -35,8 +32,7 @@
 };
 
 // Lagrangian interpolation filter
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS] = {
   { 0,   0,   0, 128,   0,   0,   0,  0},
   { 0,   1,  -5, 126,   8,  -3,   1,  0},
   { -1,   3, -10, 122,  18,  -6,   2,  0},
@@ -56,8 +52,7 @@
 };
 
 // DCT based filter
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
+const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS] = {
   {0,   0,   0, 128,   0,   0,   0, 0},
   {-1,   3,  -7, 127,   8,  -3,   1, 0},
   {-2,   5, -13, 125,  17,  -6,   3, -1},
@@ -77,8 +72,7 @@
 };
 
 // freqmultiplier = 0.5
-DECLARE_ALIGNED(256, const InterpKernel,
-                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
+const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS] = {
   { 0,  0,  0, 128,  0,  0,  0,  0},
   {-3, -1, 32,  64, 38,  1, -3,  0},
   {-2, -2, 29,  63, 41,  2, -3,  0},

diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 15610d7..29d3867 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h

@@ -13,6 +13,8 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
 
 #ifdef __cplusplus
 extern "C" {
@@ -37,10 +39,14 @@
 
 const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter);
 
-extern const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS];
-extern const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
-extern const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
-extern const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
+DECLARE_ALIGNED(256, extern const InterpKernel,
+                vp9_bilinear_filters[SUBPEL_SHIFTS]);
+DECLARE_ALIGNED(256, extern const InterpKernel,
+                vp9_sub_pel_filters_8[SUBPEL_SHIFTS]);
+DECLARE_ALIGNED(256, extern const InterpKernel,
+                vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]);
+DECLARE_ALIGNED(256, extern const InterpKernel,
+                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]);
 
 // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
 // filter kernel as a 2 tap filter.

diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c
index d903ed6..dffeb8a 100644
--- a/vp9/common/vp9_frame_buffers.c
+++ b/vp9/common/vp9_frame_buffers.c

@@ -42,7 +42,7 @@
   int i;
   InternalFrameBufferList *const int_fb_list =
       (InternalFrameBufferList *)cb_priv;
-  if (int_fb_list == NULL || fb == NULL)
+  if (int_fb_list == NULL)
     return -1;
 
   // Find a free frame buffer.
@@ -73,12 +73,8 @@
 }
 
 int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
-  InternalFrameBuffer *int_fb;
+  InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
   (void)cb_priv;
-  if (fb == NULL)
-    return -1;
-
-  int_fb = (InternalFrameBuffer *)fb->priv;
   int_fb->in_use = 0;
   return 0;
 }

diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 07d7a92..04f8934 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c

@@ -496,7 +496,7 @@
   const BLOCK_SIZE block_size = mi->mbmi.sb_type;
   const TX_SIZE tx_size_y = mi->mbmi.tx_size;
   const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi);
-  const int skip = mi->mbmi.skip_coeff;
+  const int skip = mi->mbmi.skip;
   const int seg = mi->mbmi.segment_id;
   const int ref = mi->mbmi.ref_frame[0];
   const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]];
@@ -577,7 +577,7 @@
                          LOOP_FILTER_MASK *lfm) {
   const BLOCK_SIZE block_size = mi->mbmi.sb_type;
   const TX_SIZE tx_size_y = mi->mbmi.tx_size;
-  const int skip = mi->mbmi.skip_coeff;
+  const int skip = mi->mbmi.skip;
   const int seg = mi->mbmi.segment_id;
   const int ref = mi->mbmi.ref_frame[0];
   const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]];
@@ -937,8 +937,7 @@
     for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
       const MODE_INFO *mi = mi_8x8[c];
       const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type;
-      const int skip_this = mi[0].mbmi.skip_coeff
-                            && is_inter_block(&mi[0].mbmi);
+      const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi);
       // left edge of current unit is block/partition edge -> no skip
       const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ?
           !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1;

diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index ff02622..3b0d1e9 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c

@@ -196,7 +196,8 @@
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
-  const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
+  const MB_MODE_INFO *const prev_mbmi = cm->coding_use_prev_mi && prev_mi ?
+      &prev_mi->mbmi : NULL;
   int different_ref_found = 0;
   int context_counter = 0;
 

diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 97983c5..e6d6ea7 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h

@@ -224,6 +224,11 @@
   int error_resilient_mode;
   int frame_parallel_decoding_mode;
 
+  // Flag indicates if prev_mi can be used in coding:
+  //   0: encoder assumes decoder does not have prev_mi
+  //   1: encoder assumes decoder has and uses prev_mi
+  unsigned int coding_use_prev_mi;
+
   int log2_tile_cols, log2_tile_rows;
 
   // Private data associated with the frame buffer callbacks.
@@ -302,7 +307,6 @@
 static void set_prev_mi(VP9_COMMON *cm) {
   const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
                                        cm->height == cm->last_height &&
-                                       !cm->error_resilient_mode &&
                                        !cm->intra_only &&
                                        cm->last_show_frame;
   // Special case: set prev_mi to NULL when the previous mode info

diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index a172ba6..7baa9ee 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c

@@ -700,7 +700,7 @@
         char zz[4];
         int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED &&
                         mi[mb_index].mbmi.mode != SPLITMV &&
-                        mi[mb_index].mbmi.skip_coeff);
+                        mi[mb_index].mbmi.skip);
 
         if (cm->frame_type == KEY_FRAME)
           snprintf(zz, sizeof(zz) - 1, "a");

diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 487f00c..197bcb6 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c

@@ -353,10 +353,10 @@
   const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
   const int has_above = above_mbmi != NULL;
   const int has_left = left_mbmi != NULL;
-  int above_ctx = (has_above && !above_mbmi->skip_coeff) ? above_mbmi->tx_size
-                                                         : max_tx_size;
-  int left_ctx = (has_left && !left_mbmi->skip_coeff) ? left_mbmi->tx_size
-                                                      : max_tx_size;
+  int above_ctx = (has_above && !above_mbmi->skip) ? above_mbmi->tx_size
+                                                   : max_tx_size;
+  int left_ctx = (has_left && !left_mbmi->skip) ? left_mbmi->tx_size
+                                                : max_tx_size;
   if (!has_left)
     left_ctx = above_ctx;
 

diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 33ae5a8..6c7a0d3 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h

@@ -39,7 +39,7 @@
   return above_sip + left_sip;
 }
 
-static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
+static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
                                                 const MACROBLOCKD *xd) {
   return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
 }
@@ -47,8 +47,8 @@
 static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
   const MODE_INFO *const above_mi = get_above_mi(xd);
   const MODE_INFO *const left_mi = get_left_mi(xd);
-  const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip_coeff : 0;
-  const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0;
+  const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0;
+  const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0;
   return above_skip + left_skip;
 }
 

diff --git a/vp9/common/vp9_prob.c b/vp9/common/vp9_prob.c
index f9bc06e..a1befc6 100644
--- a/vp9/common/vp9_prob.c
+++ b/vp9/common/vp9_prob.c

@@ -10,7 +10,7 @@
 
 #include "vp9/common/vp9_prob.h"
 
-DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
+const uint8_t vp9_norm[256] = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 7576e7b..df603ad 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c

@@ -139,9 +139,6 @@
   return clamped_mv;
 }
 
-// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
-// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
-// sizes smaller than 16x16 yet.
 static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
                                    int bw, int bh,
                                    int x, int y, int w, int h,

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 878c751..a18ae9b 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -264,13 +264,13 @@
 specialize vp9_convolve_avg $sse2_x86inc neon dspr2
 
 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 sse2 ssse3 neon dspr2
+specialize vp9_convolve8 sse2 ssse3 avx2 neon dspr2
 
 prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2
+specialize vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2
 
 prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_vert sse2 ssse3 neon dspr2
+specialize vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2
 
 prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg sse2 ssse3 neon dspr2
@@ -386,7 +386,7 @@
 specialize vp9_variance4x4 mmx $sse2_x86inc
 
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc
+specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc avx2
 
 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
 specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc
@@ -416,7 +416,7 @@
 specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc
 
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc
+specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc avx2
 
 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
 specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc

diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 60018ea..1b4904c 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c

@@ -139,14 +139,79 @@
                            filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
   } \
 }
+#if HAVE_AVX2
+filter8_1dfunction vp9_filter_block1d16_v8_avx2;
+filter8_1dfunction vp9_filter_block1d16_h8_avx2;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+#else
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+#endif
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
+#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
+#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
+#define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3
+#define vp9_filter_block1d8_h2_avx2  vp9_filter_block1d8_h2_ssse3
+#define vp9_filter_block1d4_v2_avx2  vp9_filter_block1d4_v2_ssse3
+#define vp9_filter_block1d4_h2_avx2  vp9_filter_block1d4_h2_ssse3
+// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
 
+// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+FUN_CONV_2D(, avx2);
+#endif
 #if HAVE_SSSE3
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
+#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
+#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+#else
 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#endif
 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;

diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm
index a7f6930..91055b9f 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm

@@ -527,7 +527,7 @@
         pxor        mm7,        [GLOBAL(t80)]       ; unoffset
         ; mm7 = q1
 
-        ; tranpose and write back
+        ; transpose and write back
         ; mm1 =    72 62 52 42 32 22 12 02
         ; mm6 =    73 63 53 43 33 23 13 03
         ; mm3 =    74 64 54 44 34 24 14 04

diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
new file mode 100644
index 0000000..efa960c
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c

@@ -0,0 +1,545 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+#include "vpx_ports/mem.h"
+
+// filters for 16_h8 and 16_v8
+DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
+                                  unsigned int src_pixels_per_line,
+                                  unsigned char *output_ptr,
+                                  unsigned int  output_pitch,
+                                  unsigned int  output_height,
+                                  int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m256i srcReg32b1, srcReg32b2, filtersReg32;
+  unsigned int i;
+  unsigned int src_stride, dst_stride;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+#if defined (__GNUC__)
+#if ( __GNUC__ < 4 || (__GNUC__ == 4 && \
+(__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
+  filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
+#elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
+  filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
+#else
+  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+#endif
+#else
+  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+#endif
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32,
+                  _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x706u));
+
+  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  // multiple the size of the source and destination stride by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i-=2) {
+    // load the 2 strides of source
+    srcReg32b1 = _mm256_castsi128_si256(
+                 _mm_loadu_si128((__m128i *)(src_ptr-3)));
+    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+                 _mm_loadu_si128((__m128i *)
+                 (src_ptr+src_pixels_per_line-3)), 1);
+
+    // filter the source buffer
+    srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+    // filter the source buffer
+    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
+                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+    // reading 2 strides of the next 16 bytes
+    // (part of it was being read by earlier read)
+    srcReg32b2 = _mm256_castsi128_si256(
+                 _mm_loadu_si128((__m128i *)(src_ptr+5)));
+    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+                 _mm_loadu_si128((__m128i *)
+                 (src_ptr+src_pixels_per_line+5)), 1);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
+                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+    // filter the source buffer
+    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+    // filter the source buffer
+    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
+                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
+                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
+
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
+                                           srcRegFilt32b2_1);
+
+    src_ptr+=src_stride;
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr,
+    _mm256_castsi256_si128(srcRegFilt32b1_1));
+
+    // save the next 16 bits
+    _mm_store_si128((__m128i*)(output_ptr+output_pitch),
+    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+    output_ptr+=dst_stride;
+  }
+
+  // if the number of strides is odd.
+  // process only 16 bytes
+  if (i > 0) {
+    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+    __m128i srcRegFilt2, srcRegFilt3;
+
+    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+    // filter the source buffer
+    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
+                    _mm256_castsi256_si128(filt1Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
+                  _mm256_castsi256_si128(filt2Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
+                    _mm256_castsi256_si128(firstFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(secondFilters));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg1,
+                 _mm256_castsi256_si128(filt4Reg));
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1,
+                 _mm256_castsi256_si128(filt3Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
+                  _mm256_castsi256_si128(forthFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+    // reading the next 16 bytes
+    // (part of it was being read by earlier read)
+    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    // filter the source buffer
+    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
+                    _mm256_castsi256_si128(filt1Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
+                  _mm256_castsi256_si128(filt2Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
+                    _mm256_castsi256_si128(firstFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(secondFilters));
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
+                  _mm256_castsi256_si128(filt4Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
+                  _mm256_castsi256_si128(filt3Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
+                  _mm256_castsi256_si128(forthFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                    _mm256_castsi256_si128(addFilterReg64));
+
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                    _mm256_castsi256_si128(addFilterReg64));
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+  }
+}
+
+void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
+                                  unsigned int src_pitch,
+                                  unsigned char *output_ptr,
+                                  unsigned int out_pitch,
+                                  unsigned int output_height,
+                                  int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg64;
+  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+  __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  unsigned int i;
+  unsigned int src_stride, dst_stride;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the
+  // same data in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+#if defined (__GNUC__)
+#if ( __GNUC__ < 4 || (__GNUC__ == 4 && \
+(__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
+  filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
+#elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
+  filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
+#else
+  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+#endif
+#else
+  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+#endif
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32,
+                  _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x706u));
+
+  // multiple the size of the source and destination stride by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  // load 16 bytes 7 times in stride of src_pitch
+  srcReg32b1 = _mm256_castsi128_si256(
+               _mm_loadu_si128((__m128i *)(src_ptr)));
+  srcReg32b2 = _mm256_castsi128_si256(
+               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)));
+  srcReg32b3 = _mm256_castsi128_si256(
+               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)));
+  srcReg32b4 = _mm256_castsi128_si256(
+               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)));
+  srcReg32b5 = _mm256_castsi128_si256(
+               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)));
+  srcReg32b6 = _mm256_castsi128_si256(
+               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)));
+  srcReg32b7 = _mm256_castsi128_si256(
+               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)));
+
+  // have each consecutive loads on the same 256 register
+  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+               _mm256_castsi256_si128(srcReg32b2), 1);
+  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+               _mm256_castsi256_si128(srcReg32b3), 1);
+  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
+               _mm256_castsi256_si128(srcReg32b4), 1);
+  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
+               _mm256_castsi256_si128(srcReg32b5), 1);
+  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
+               _mm256_castsi256_si128(srcReg32b6), 1);
+  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
+               _mm256_castsi256_si128(srcReg32b7), 1);
+
+  // merge every two consecutive registers except the last one
+  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+  // save
+  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+
+  // save
+  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+
+  // save
+  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+  // save
+  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+
+  for (i = output_height; i > 1; i-=2) {
+     // load the last 2 loads of 16 bytes and have every two
+     // consecutive loads in the same 256 bit register
+     srcReg32b8 = _mm256_castsi128_si256(
+     _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)));
+     srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+     _mm256_castsi256_si128(srcReg32b8), 1);
+     srcReg32b9 = _mm256_castsi128_si256(
+     _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8)));
+     srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+     _mm256_castsi256_si128(srcReg32b9), 1);
+
+     // merge every two consecutive registers
+     // save
+     srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+     srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+     // add and saturate the results together
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);
+
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+     srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+
+     // add and saturate the results together
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_min_epi16(srcReg32b6, srcReg32b13));
+
+     // add and saturate the results together
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+                   _mm256_max_epi16(srcReg32b8, srcReg32b12));
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_max_epi16(srcReg32b6, srcReg32b13));
+
+
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
+
+     // shift by 7 bit each 16 bit
+     srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
+     srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
+
+     // shrink to 8 bit each 16 bits, the first lane contain the first
+     // convolve result and the second lane contain the second convolve
+     // result
+     srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+     src_ptr+=src_stride;
+
+     // save 16 bytes
+     _mm_store_si128((__m128i*)output_ptr,
+     _mm256_castsi256_si128(srcReg32b1));
+
+     // save the next 16 bits
+     _mm_store_si128((__m128i*)(output_ptr+out_pitch),
+     _mm256_extractf128_si256(srcReg32b1, 1));
+
+     output_ptr+=dst_stride;
+
+     // save part of the registers for next strides
+     srcReg32b10 = srcReg32b11;
+     srcReg32b1 = srcReg32b3;
+     srcReg32b11 = srcReg32b2;
+     srcReg32b3 = srcReg32b5;
+     srcReg32b2 = srcReg32b4;
+     srcReg32b5 = srcReg32b7;
+     srcReg32b7 = srcReg32b9;
+  }
+  if (i > 0) {
+    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+    // load the last 16 bytes
+    srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+
+    // merge the last 2 results together
+    srcRegFilt4 = _mm_unpacklo_epi8(
+                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+    srcRegFilt7 = _mm_unpackhi_epi8(
+                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+                  _mm256_castsi256_si128(firstFilters));
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
+                  _mm256_castsi256_si128(forthFilters));
+    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+                  _mm256_castsi256_si128(firstFilters));
+    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
+                  _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+                  _mm256_castsi256_si128(secondFilters));
+    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+                  _mm256_castsi256_si128(secondFilters));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+                  _mm256_castsi256_si128(thirdFilters));
+    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+                  _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+
+    // add and saturate the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));
+
+
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                  _mm256_castsi256_si128(addFilterReg64));
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+                  _mm256_castsi256_si128(addFilterReg64));
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+  }
+}

diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000..cf28d8d
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c

@@ -0,0 +1,490 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+// filters only for the 4_h8 convolution
+DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+// filters for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned char *output_ptr,
+                                         unsigned int output_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, srcReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter into the first lane
+  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bit in the filter into the first lane
+  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the seconds 16 bits in the filter into the second lane
+  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the forth 16 bits in the filter into the second lane
+  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+  // loading the local filters
+  thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
+  forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+    // filter the source buffer
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // extract the higher half of the lane
+    srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
+    srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
+
+    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+    // add and saturate all the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    src_ptr+=src_pixels_per_line;
+
+    // save only 4 bytes
+    *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned char *output_ptr,
+                                         unsigned int output_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+    // filter the source buffer
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
+    srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+    // add and saturate all the results together
+    minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+
+    srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // save only 8 bytes
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
+                                          unsigned int src_pixels_per_line,
+                                          unsigned char *output_ptr,
+                                          unsigned int output_pitch,
+                                          unsigned int output_height,
+                                          int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+    // filter the source buffer
+    srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+    // reading the next 16 bytes.
+    // (part of it was being read by earlier read)
+    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    // filter the source buffer
+    srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pitch,
+                                         unsigned char *output_ptr,
+                                         unsigned int out_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  for (i = 0; i < output_height; i++) {
+    // load the first 8 bytes
+    srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+    // load the next 8 bytes in stride of src_pitch
+    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
+    srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
+    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+
+    // merge the result together
+    srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+
+    // load the next 8 bytes in stride of src_pitch
+    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
+    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
+    srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
+    srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+
+    // merge the result together
+    srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+    // add and saturate the results together
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // save only 8 bytes convolve result
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}
+
+void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
+                                          unsigned int src_pitch,
+                                          unsigned char *output_ptr,
+                                          unsigned int out_pitch,
+                                          unsigned int output_height,
+                                          int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  // converting the 16 bit (short) to  8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  for (i = 0; i < output_height; i++) {
+    // load the first 16 bytes
+    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
+    // load the next 16 bytes in stride of src_pitch
+    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
+    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
+    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+
+    // merge the result together
+    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
+    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
+    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+    // load the next 16 bytes in stride of two/three src_pitch
+    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
+    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
+
+    // merge the result together
+    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
+
+    // load the next 16 bytes in stride of four/five src_pitch
+    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
+    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
+
+    // merge the result together
+    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
+    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));
+
+    // add and saturate the results together
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
+    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // save 16 bytes convolve result
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}

diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index d37afa5..791d0f2 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c

@@ -303,7 +303,7 @@
                           dst, pd->dst.stride, dst, pd->dst.stride,
                           x, y, plane);
 
-  if (!mi->mbmi.skip_coeff) {
+  if (!mi->mbmi.skip) {
     const int eob = vp9_decode_block_tokens(cm, xd, plane, block,
                                             plane_bsize, x, y, tx_size,
                                             args->r);
@@ -350,9 +350,9 @@
 
   xd->mi_8x8 = cm->mi_grid_visible + offset;
   xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset;
-  // Special case: if prev_mi is NULL, the previous mode info context
-  // cannot be used.
-  xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
+
+  xd->last_mi = cm->coding_use_prev_mi && cm->prev_mi ?
+      xd->prev_mi_8x8[0] : NULL;
 
   xd->mi_8x8[0] = xd->mi_stream + offset - tile_offset;
   xd->mi_8x8[0]->mbmi.sb_type = bsize;
@@ -397,7 +397,7 @@
   // Has to be called after set_offsets
   mbmi = &xd->mi_8x8[0]->mbmi;
 
-  if (mbmi->skip_coeff) {
+  if (mbmi->skip) {
     reset_skip_context(xd, bsize);
   } else {
     if (cm->seg.enabled)
@@ -421,12 +421,12 @@
     vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 
     // Reconstruction
-    if (!mbmi->skip_coeff) {
+    if (!mbmi->skip) {
       int eobtotal = 0;
       struct inter_args arg = { cm, xd, r, &eobtotal };
       vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
       if (!less8x8 && eobtotal == 0)
-        mbmi->skip_coeff = 1;  // skip loopfilter
+        mbmi->skip = 1;  // skip loopfilter
     }
   }
 
@@ -1203,9 +1203,11 @@
   }
 
   if (!cm->error_resilient_mode) {
+    cm->coding_use_prev_mi = 1;
     cm->refresh_frame_context = vp9_rb_read_bit(rb);
     cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
   } else {
+    cm->coding_use_prev_mi = 0;
     cm->refresh_frame_context = 0;
     cm->frame_parallel_decoding_mode = 1;
   }
@@ -1373,7 +1375,10 @@
   alloc_tile_storage(pbi, tile_rows, tile_cols);
 
   xd->mode_info_stride = cm->mode_info_stride;
-  set_prev_mi(cm);
+  if (cm->coding_use_prev_mi)
+    set_prev_mi(cm);
+  else
+    cm->prev_mi = NULL;
 
   setup_plane_dequants(cm, xd, cm->base_qindex);
   vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index c7fb71d..4de8db3 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -146,8 +146,8 @@
   return segment_id;
 }
 
-static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd,
-                           int segment_id, vp9_reader *r) {
+static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
+                     int segment_id, vp9_reader *r) {
   if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
@@ -169,7 +169,7 @@
   const BLOCK_SIZE bsize = mbmi->sb_type;
 
   mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r);
-  mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r);
+  mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
   mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, bsize, 1, r);
   mbmi->ref_frame[0] = INTRA_FRAME;
   mbmi->ref_frame[1] = NONE;
@@ -356,6 +356,11 @@
   mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
 }
 
+static INLINE int is_mv_valid(const MV *mv) {
+  return mv->row > MV_LOW && mv->row < MV_UPP &&
+         mv->col > MV_LOW && mv->col < MV_UPP;
+}
+
 static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
                             int_mv mv[2], int_mv ref_mv[2],
                             int_mv nearest_mv[2], int_mv near_mv[2],
@@ -367,14 +372,10 @@
     case NEWMV: {
       nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
                                             NULL : &cm->counts.mv;
-      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv,
-              &cm->fc.nmvc, mv_counts, allow_hp);
-      if (is_compound)
-        read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv,
-                &cm->fc.nmvc, mv_counts, allow_hp);
       for (i = 0; i < 1 + is_compound; ++i) {
-        ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW;
-        ret = ret && mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW;
+        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc.nmvc, mv_counts,
+                allow_hp);
+        ret = ret && is_mv_valid(&mv[i].as_mv);
       }
       break;
     }
@@ -520,10 +521,10 @@
   mbmi->mv[0].as_int = 0;
   mbmi->mv[1].as_int = 0;
   mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
-  mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r);
+  mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
   inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
   mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
-                               !mbmi->skip_coeff || !inter_block, r);
+                               !mbmi->skip || !inter_block, r);
 
   if (inter_block)
     read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index c8f334f..31ec069 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -34,7 +34,6 @@
 #include "vp9/encoder/vp9_write_bit_buffer.h"
 
 #ifdef ENTROPY_STATS
-vp9_coeff_stats tree_update_hist[TX_SIZES][PLANE_TYPES];
 extern unsigned int active_section;
 #endif
 
@@ -104,13 +103,13 @@
   }
 }
 
-static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m,
-                            vp9_writer *w) {
+static int write_skip(const VP9_COMP *cpi, int segment_id, MODE_INFO *m,
+                      vp9_writer *w) {
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
-    const int skip = m->mbmi.skip_coeff;
+    const int skip = m->mbmi.skip;
     vp9_write(w, skip, vp9_get_skip_prob(&cpi->common, xd));
     return skip;
   }
@@ -247,15 +246,15 @@
   const nmv_context *nmvc = &cm->fc.nmvc;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct segmentation *seg = &cm->seg;
+  const struct segmentation *const seg = &cm->seg;
   MB_MODE_INFO *const mi = &m->mbmi;
-  const MV_REFERENCE_FRAME rf = mi->ref_frame[0];
-  const MV_REFERENCE_FRAME sec_rf = mi->ref_frame[1];
+  const MV_REFERENCE_FRAME ref0 = mi->ref_frame[0];
+  const MV_REFERENCE_FRAME ref1 = mi->ref_frame[1];
   const MB_PREDICTION_MODE mode = mi->mode;
   const int segment_id = mi->segment_id;
-  int skip_coeff;
   const BLOCK_SIZE bsize = mi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
+  int skip;
 
 #ifdef ENTROPY_STATS
   active_section = 9;
@@ -273,18 +272,18 @@
     }
   }
 
-  skip_coeff = write_skip_coeff(cpi, segment_id, m, bc);
+  skip = write_skip(cpi, segment_id, m, bc);
 
   if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-    vp9_write(bc, rf != INTRA_FRAME, vp9_get_intra_inter_prob(cm, xd));
+    vp9_write(bc, ref0 != INTRA_FRAME, vp9_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
-      !(rf != INTRA_FRAME &&
-        (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
+      !(ref0 != INTRA_FRAME &&
+        (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
     write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc);
   }
 
-  if (rf == INTRA_FRAME) {
+  if (ref0 == INTRA_FRAME) {
 #ifdef ENTROPY_STATS
     active_section = 6;
 #endif
@@ -306,7 +305,7 @@
   } else {
     vp9_prob *mv_ref_p;
     encode_ref_frame(cpi, bc);
-    mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mode_context[rf]];
+    mv_ref_p = cm->fc.inter_mode_probs[mi->mode_context[ref0]];
 
 #ifdef ENTROPY_STATS
     active_section = 3;
@@ -316,7 +315,7 @@
     if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
         write_inter_mode(bc, mode, mv_ref_p);
-        ++cm->counts.inter_mode[mi->mode_context[rf]][INTER_OFFSET(mode)];
+        ++cm->counts.inter_mode[mi->mode_context[ref0]][INTER_OFFSET(mode)];
       }
     }
 
@@ -336,21 +335,19 @@
       for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
         for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
           const int j = idy * 2 + idx;
-          const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
-          write_inter_mode(bc, blockmode, mv_ref_p);
-          ++cm->counts.inter_mode[mi->mode_context[rf]]
-                                 [INTER_OFFSET(blockmode)];
-
-          if (blockmode == NEWMV) {
+          const MB_PREDICTION_MODE b_mode = m->bmi[j].as_mode;
+          write_inter_mode(bc, b_mode, mv_ref_p);
+          ++cm->counts.inter_mode[mi->mode_context[ref0]][INTER_OFFSET(b_mode)];
+          if (b_mode == NEWMV) {
 #ifdef ENTROPY_STATS
             active_section = 11;
 #endif
             vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv,
-                          &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp);
+                          &mi->ref_mvs[ref0][0].as_mv, nmvc, allow_hp);
 
             if (has_second_ref(mi))
               vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv,
-                            &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp);
+                            &mi->ref_mvs[ref1][0].as_mv, nmvc, allow_hp);
           }
         }
       }
@@ -359,11 +356,11 @@
       active_section = 5;
 #endif
       vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv,
-                    &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp);
+                    &mi->ref_mvs[ref0][0].as_mv, nmvc, allow_hp);
 
       if (has_second_ref(mi))
         vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv,
-                      &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp);
+                      &mi->ref_mvs[ref1][0].as_mv, nmvc, allow_hp);
     }
   }
 }
@@ -382,7 +379,7 @@
   if (seg->update_map)
     write_segment_id(bc, seg, m->mbmi.segment_id);
 
-  write_skip_coeff(cpi, segment_id, m, bc);
+  write_skip(cpi, segment_id, m, bc);
 
   if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
     write_selected_tx_size(cpi, m, m->mbmi.tx_size, m->mbmi.sb_type, bc);
@@ -550,16 +547,6 @@
             coef_probs[i][j][k][l][m] = get_binary_prob(
                                             coef_branch_ct[i][j][k][l][m][0],
                                             coef_branch_ct[i][j][k][l][m][1]);
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing) {
-            int t;
-            for (t = 0; t < ENTROPY_TOKENS; ++t)
-              context_counters[tx_size][i][j][k][l][t] +=
-                  coef_counts[i][j][k][l][t];
-            context_counters[tx_size][i][j][k][l][ENTROPY_TOKENS] +=
-                eob_branch_ct[i][j][k][l];
-          }
-#endif
         }
       }
     }
@@ -638,10 +625,6 @@
                 if (s > 0 && newp != *oldp)
                   u = 1;
                 vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
-                if (!cpi->dummy_packing)
-                  ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -693,10 +676,6 @@
                 updates += u;
                 if (u == 0 && updates == 0) {
                   noupdates_before_first++;
-#ifdef ENTROPY_STATS
-                  if (!cpi->dummy_packing)
-                    ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
                   continue;
                 }
                 if (u == 1 && updates == 1) {
@@ -707,10 +686,6 @@
                     vp9_write(bc, 0, upd);
                 }
                 vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
-                if (!cpi->dummy_packing)
-                  ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -1282,11 +1257,12 @@
     active_section = 7;
 #endif
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   first_part_size = write_compressed_header(cpi, data);
   data += first_part_size;
-  vp9_wb_write_literal(&saved_wb, first_part_size, 16);
+  // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
+  vp9_wb_write_literal(&saved_wb, (int)first_part_size, 16);
 
   data += encode_tiles(cpi, data);
 

diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index a9d168c..d523239 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c

@@ -47,7 +47,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -315,7 +315,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5432c1f..4ad90c6 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -365,8 +365,6 @@
                             (bw * bh);
 
     if (projected_rate < (target_rate / 4)) {
-      segment = 2;
-    } else if (projected_rate < (target_rate / 2)) {
       segment = 1;
     } else {
       segment = 0;
@@ -400,7 +398,6 @@
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
   MODE_INFO *mi_addr = xd->mi_8x8[0];
 
-  const int mb_mode_index = ctx->best_mode_index;
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
@@ -472,8 +469,8 @@
       cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i];
   }
 
-  if (frame_is_intra_only(cm)) {
 #if CONFIG_INTERNAL_STATS
+  if (frame_is_intra_only(cm)) {
     static const int kf_mode_index[] = {
       THR_DC        /*DC_PRED*/,
       THR_V_PRED    /*V_PRED*/,
@@ -486,12 +483,13 @@
       THR_D63_PRED  /*D63_PRED*/,
       THR_TM        /*TM_PRED*/,
     };
-    cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]]++;
-#endif
+    ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]];
   } else {
     // Note how often each mode chosen as best
-    cpi->mode_chosen_counts[mb_mode_index]++;
-
+    ++cpi->mode_chosen_counts[ctx->best_mode_index];
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
     if (is_inter_block(mbmi)) {
       if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) {
         int_mv best_mv[2];
@@ -555,8 +553,6 @@
   xd->mi_8x8 = cm->mi_grid_visible + idx_str;
   xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
 
-  // Special case: if prev_mi is NULL, the previous mode info context
-  // cannot be used.
   xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
 
   xd->mi_8x8[0] = cm->mi + idx_str;
@@ -631,7 +627,7 @@
   int orig_rdmult = x->rdmult;
   double rdmult_ratio;
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
   rdmult_ratio = 1.0;  // avoid uninitialized warnings
 
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
@@ -660,14 +656,25 @@
   x->skip_recode = 0;
 
   // Set to zero to make sure we do not use the previous encoded frame stats
-  xd->mi_8x8[0]->mbmi.skip_coeff = 0;
+  xd->mi_8x8[0]->mbmi.skip = 0;
 
   x->source_variance = get_sby_perpixel_variance(cpi, x, bsize);
 
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
     const int energy = bsize <= BLOCK_16X16 ? x->mb_energy
                                             : vp9_block_energy(cpi, x, bsize);
-    xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy);
+
+    if (cm->frame_type == KEY_FRAME ||
+        cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+      xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy);
+    } else {
+      const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
+                                                    : cm->last_frame_seg_map;
+      xd->mi_8x8[0]->mbmi.segment_id =
+        vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+
     rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
     vp9_mb_init_quantizer(cpi, x);
   }
@@ -676,16 +683,17 @@
     activity_masking(cpi, x);
 
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-    vp9_clear_system_state();  // __asm emms;
-    x->rdmult = round(x->rdmult * rdmult_ratio);
+    vp9_clear_system_state();
+    x->rdmult = (int)round(x->rdmult * rdmult_ratio);
   } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
     const int mi_offset = mi_row * cm->mi_cols + mi_col;
     unsigned char complexity = cpi->complexity_map[mi_offset];
-    const int is_edge = (mi_row == 0) || (mi_row == (cm->mi_rows - 1)) ||
-                        (mi_col == 0) || (mi_col == (cm->mi_cols - 1));
+    const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) ||
+                        (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2));
 
-    if (!is_edge && (complexity > 128))
+    if (!is_edge && (complexity > 128)) {
       x->rdmult = x->rdmult  + ((x->rdmult * (complexity - 128)) / 256);
+    }
   }
 
   // Find best coding mode & reconstruct the MB so it is available
@@ -705,10 +713,13 @@
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
     x->rdmult = orig_rdmult;
     if (*totalrate != INT_MAX) {
-      vp9_clear_system_state();  // __asm emms;
-      *totalrate = round(*totalrate * rdmult_ratio);
+      vp9_clear_system_state();
+      *totalrate = (int)round(*totalrate * rdmult_ratio);
     }
   }
+  else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+    x->rdmult = orig_rdmult;
+  }
 }
 
 static void update_stats(VP9_COMP *cpi) {
@@ -1029,38 +1040,19 @@
   }
   return 0;
 }
+
 static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
                          BLOCK_SIZE bsize, int output_enabled) {
   int i;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *const p = x->plane;
-  struct macroblockd_plane *const pd = xd->plane;
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
 
-  const int mb_mode_index = ctx->best_mode_index;
-  int max_plane;
-
-  max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
-  for (i = 0; i < max_plane; ++i) {
-    p[i].coeff = ctx->coeff_pbuf[i][1];
-    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
-    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
-    p[i].eobs = ctx->eobs_pbuf[i][1];
-  }
-
-  for (i = max_plane; i < MAX_MB_PLANE; ++i) {
-    p[i].coeff = ctx->coeff_pbuf[i][2];
-    p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
-    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
-    p[i].eobs = ctx->eobs_pbuf[i][2];
-  }
-
   x->skip = ctx->skip;
 
-  if (frame_is_intra_only(cm)) {
 #if CONFIG_INTERNAL_STATS
+  if (frame_is_intra_only(cm)) {
     static const int kf_mode_index[] = {
       THR_DC /*DC_PRED*/,
       THR_V_PRED /*V_PRED*/,
@@ -1074,10 +1066,12 @@
       THR_TM /*TM_PRED*/,
     };
     ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]];
-#endif
   } else {
     // Note how often each mode chosen as best
-    cpi->mode_chosen_counts[mb_mode_index]++;
+    ++cpi->mode_chosen_counts[ctx->best_mode_index];
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
     if (is_inter_block(mbmi)) {
       if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) {
         int_mv best_mv[2];
@@ -1116,8 +1110,8 @@
 }
 
 static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
-                      TOKENEXTRA **tp, int mi_row, int mi_col,
-                      int output_enabled, BLOCK_SIZE bsize) {
+                         TOKENEXTRA **tp, int mi_row, int mi_col,
+                         int output_enabled, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
@@ -1135,7 +1129,6 @@
     ctx = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
                                  mi_row, mi_col, bsize);
     subsize = mi_8x8[0]->mbmi.sb_type;
-
   } else {
     ctx = 0;
     subsize = BLOCK_4X4;
@@ -2087,7 +2080,7 @@
 
   for (y = 0; y < ymbs; y++) {
     for (x = 0; x < xmbs; x++) {
-      if (!mi_8x8[y * mis + x]->mbmi.skip_coeff)
+      if (!mi_8x8[y * mis + x]->mbmi.skip)
         return 0;
     }
   }
@@ -2246,7 +2239,7 @@
   mbmi->ref_frame[1] = INTRA_FRAME;
   mbmi->tx_size = max_txsize_lookup[bsize];
   mbmi->uv_mode = mode;
-  mbmi->skip_coeff = 0;
+  mbmi->skip = 0;
   mbmi->sb_type = bsize;
   mbmi->segment_id = 0;
 }
@@ -2260,68 +2253,44 @@
 }
 
 static void rtc_use_partition(VP9_COMP *cpi,
-                             const TileInfo *const tile,
-                             MODE_INFO **mi_8x8,
-                             TOKENEXTRA **tp, int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, int *rate, int64_t *dist,
-                             int do_recon) {
+                              const TileInfo *const tile,
+                              MODE_INFO **mi_8x8,
+                              TOKENEXTRA **tp, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, int *rate, int64_t *dist,
+                              int do_recon) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const int mis = cm->mode_info_stride;
-  int mi_width = num_8x8_blocks_wide_lookup[cpi->sf.always_this_block_size];
-  int mi_height = num_8x8_blocks_high_lookup[cpi->sf.always_this_block_size];
   int i, j;
   int chosen_rate = INT_MAX;
   int64_t chosen_dist = INT_MAX;
   MB_PREDICTION_MODE mode = DC_PRED;
-  int row8x8_remaining = tile->mi_row_end - mi_row;
-  int col8x8_remaining = tile->mi_col_end - mi_col;
-  int b32i;
-  for (b32i = 0; b32i < 4; b32i++) {
-    int b16i;
-    for (b16i = 0; b16i < 4; b16i++) {
-      int b8i;
-      int block_row = get_block_row(b32i, b16i, 0);
-      int block_col = get_block_col(b32i, b16i, 0);
-      int index = block_row * mis + block_col;
-      int rate;
-      int64_t dist;
+  int row8x8_remaining = MIN(MI_BLOCK_SIZE, tile->mi_row_end - mi_row);
+  int col8x8_remaining = MIN(MI_BLOCK_SIZE, tile->mi_col_end - mi_col);
 
-      // Find a partition size that fits
-      bsize = find_partition_size(cpi->sf.always_this_block_size,
-                                  (row8x8_remaining - block_row),
-                                  (col8x8_remaining - block_col),
-                                  &mi_height, &mi_width);
-      mi_8x8[index] = mi_8x8[0] + index;
+  int rows = mi_row + row8x8_remaining;
+  int cols = mi_col + col8x8_remaining;
 
-      set_mi_row_col(xd, tile, mi_row + block_row, mi_height,
-                     mi_col + block_col, mi_width, cm->mi_rows, cm->mi_cols);
+  int brate;
+  int64_t bdist;
+  *rate = 0;
+  *dist = 0;
 
-      xd->mi_8x8 = mi_8x8 + index;
+  // find prediction mode for each 8x8 block
+  for (j = mi_row; j < rows; ++j) {
+    for (i = mi_col; i < cols; ++i) {
+      set_offsets(cpi, tile, j, i, BLOCK_8X8);
 
-      if (cm->frame_type != KEY_FRAME) {
-        set_offsets(cpi, tile, mi_row + block_row, mi_col + block_col, bsize);
+      if (cm->frame_type != KEY_FRAME)
+        vp9_pick_inter_mode(cpi, x, tile, j, i, &brate, &bdist, BLOCK_8X8);
+      else
+        set_mode_info(&xd->mi_8x8[0]->mbmi, BLOCK_8X8, mode, j, i);
 
-        vp9_pick_inter_mode(cpi, x, tile,
-                            mi_row + block_row, mi_col + block_col,
-                            &rate, &dist, bsize);
-      } else {
-        set_mode_info(&mi_8x8[index]->mbmi, bsize, mode,
-                      mi_row + block_row, mi_col + block_col);
-      }
-
-      for (j = 0; j < mi_height; j++)
-        for (i = 0; i < mi_width; i++)
-          if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > i
-            && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > j) {
-            mi_8x8[index+ i + j * mis] = mi_8x8[index];
-          }
-
-      for (b8i = 0; b8i < 4; b8i++) {
-      }
+      *rate += brate;
+      *dist += bdist;
     }
   }
+
   encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64);
 
   *rate = chosen_rate;
@@ -2345,10 +2314,8 @@
 
     const int idx_str = cm->mode_info_stride * mi_row + mi_col;
     MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
-
     cpi->mb.source_variance = UINT_MAX;
-    set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-    set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
+
     rtc_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
                      &dummy_rate, &dummy_dist, 1);
   }
@@ -2406,6 +2373,22 @@
 
   set_prev_mi(cm);
 
+  if (cpi->sf.use_pick_mode) {
+    // Initialize internal buffer pointers for rtc coding, where non-RD
+    // mode decision is used and hence no buffer pointer swap needed.
+    int i;
+    struct macroblock_plane *const p = x->plane;
+    struct macroblockd_plane *const pd = xd->plane;
+    PICK_MODE_CONTEXT *ctx = &cpi->mb.sb64_context;
+
+    for (i = 0; i < MAX_MB_PLANE; ++i) {
+      p[i].coeff = ctx->coeff_pbuf[i][0];
+      p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+      pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+      p[i].eobs = ctx->eobs_pbuf[i][0];
+    }
+  }
+
   {
     struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
@@ -2425,7 +2408,7 @@
           // For each row of SBs in the frame
           vp9_tile_init(&tile, cm, tile_row, tile_col);
           for (mi_row = tile.mi_row_start;
-               mi_row < tile.mi_row_end; mi_row += 8) {
+               mi_row < tile.mi_row_end; mi_row += MI_BLOCK_SIZE) {
             if (cpi->sf.use_pick_mode)
               encode_rtc_sb_row(cpi, &tile, mi_row, &tp);
             else
@@ -2689,6 +2672,7 @@
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
+
   x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
                    (cpi->oxcf.aq_mode != COMPLEXITY_AQ) &&
                    !cpi->sf.use_pick_mode;
@@ -2723,7 +2707,7 @@
 
   if (!is_inter_block(mbmi)) {
     int plane;
-    mbmi->skip_coeff = 1;
+    mbmi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane)
       vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane);
     if (output_enabled)
@@ -2742,11 +2726,11 @@
   if (!is_inter_block(mbmi)) {
     vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   } else if (!x->skip) {
-    mbmi->skip_coeff = 1;
+    mbmi->skip = 1;
     vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
     vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   } else {
-    mbmi->skip_coeff = 1;
+    mbmi->skip = 1;
     if (output_enabled)
       cm->counts.skip[vp9_get_skip_context(xd)][1]++;
     reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
@@ -2756,7 +2740,7 @@
     if (cm->tx_mode == TX_MODE_SELECT &&
         mbmi->sb_type >= BLOCK_8X8  &&
         !(is_inter_block(mbmi) &&
-            (mbmi->skip_coeff ||
+            (mbmi->skip ||
              vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
       ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd),
                       &cm->counts.tx)[mbmi->tx_size];

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index ad28f31..19421aa 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c

@@ -32,22 +32,22 @@
 struct encode_b_args {
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
-  unsigned char *skip_coeff;
+  unsigned char *skip;
 };
 
 void vp9_subtract_block_c(int rows, int cols,
-                          int16_t *diff_ptr, ptrdiff_t diff_stride,
-                          const uint8_t *src_ptr, ptrdiff_t src_stride,
-                          const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+                          int16_t *diff, ptrdiff_t diff_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          const uint8_t *pred, ptrdiff_t pred_stride) {
   int r, c;
 
   for (r = 0; r < rows; r++) {
     for (c = 0; c < cols; c++)
-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+      diff[c] = src[c] - pred[c];
 
-    diff_ptr += diff_stride;
-    pred_ptr += pred_stride;
-    src_ptr  += src_stride;
+    diff += diff_stride;
+    pred += pred_stride;
+    src  += src_stride;
   }
 }
 
@@ -131,9 +131,9 @@
   const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
-  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block);
-  int16_t *qcoeff_ptr;
-  int16_t *dqcoeff_ptr;
+  const int16_t *coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   int eob = p->eobs[block], final_eob, sz = 0;
   const int i0 = 0;
   int rc, x, next, i;
@@ -143,7 +143,6 @@
   PLANE_TYPE type = pd->plane_type;
   int err_mult = plane_rd_mult[type];
   const int default_eob = 16 << (tx_size << 1);
-
   const int mul = 1 + (tx_size == TX_32X32);
   uint8_t token_cache[1024];
   const int16_t *dequant_ptr = pd->dequant;
@@ -153,8 +152,6 @@
   const int16_t *nb = so->neighbors;
 
   assert((!type && !plane) || (type && plane));
-  dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
-  qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
   assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
@@ -172,13 +169,13 @@
   next = eob;
   for (i = 0; i < eob; i++)
     token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
-        qcoeff_ptr[scan[i]]].token];
+        qcoeff[scan[i]]].token];
 
   for (i = eob; i-- > i0;) {
     int base_bits, d2, dx;
 
     rc = scan[i];
-    x = qcoeff_ptr[rc];
+    x = qcoeff[rc];
     /* Only add a trellis state for non-zero coefficients. */
     if (x) {
       int shortcut = 0;
@@ -203,7 +200,7 @@
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
       base_bits = *(vp9_dct_value_cost_ptr + x);
-      dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]);
+      dx = mul * (dqcoeff[rc] - coeff[rc]);
       d2 = dx * dx;
       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
       tokens[i][0].error = d2 + (best ? error1 : error0);
@@ -216,8 +213,8 @@
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
 
-      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) &&
-          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul +
+      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
+          (abs(x)*dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
                                          dequant_ptr[rc != 0]))
         shortcut = 1;
       else
@@ -306,16 +303,16 @@
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
   final_eob = i0 - 1;
-  vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2)));
-  vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2)));
+  vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
+  vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
   for (i = next; i < eob; i = next) {
     x = tokens[i][best].qc;
     if (x) {
       final_eob = i;
     }
     rc = scan[i];
-    qcoeff_ptr[rc] = x;
-    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul;
+    qcoeff[rc] = x;
+    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
 
     next = tokens[i][best].next;
     best = best_index[i][best];
@@ -348,6 +345,15 @@
                            pd->above_context, pd->left_context,
                            num_4x4_w, num_4x4_h);
 }
+
+static INLINE void fdct32x32(int rd_transform,
+                             const int16_t *src, int16_t *dst, int src_stride) {
+  if (rd_transform)
+    vp9_fdct32x32_rd(src, dst, src_stride);
+  else
+    vp9_fdct32x32(src, dst, src_stride);
+}
+
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -367,10 +373,7 @@
   switch (tx_size) {
     case TX_32X32:
       scan_order = &vp9_default_scan_orders[TX_32X32];
-      if (x->use_lp32x32fdct)
-        vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
-      else
-        vp9_fdct32x32(src_diff, coeff, diff_stride);
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, p->zbin_extra, eob, scan_order->scan,
@@ -439,7 +442,7 @@
   }
 
   if (p->eobs[block])
-    *(args->skip_coeff) = 0;
+    *(args->skip) = 0;
 
   if (x->skip_encode || p->eobs[block] == 0)
     return;
@@ -489,7 +492,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
 
   vp9_subtract_sby(x, bsize);
   if (x->optimize)
@@ -503,7 +506,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
 
   if (!x->skip_recode)
     vp9_subtract_sb(x, bsize);
@@ -536,10 +539,12 @@
   uint8_t *src, *dst;
   int16_t *src_diff;
   uint16_t *eob = &p->eobs[block];
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
   int i, j;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  dst = &pd->dst.buf[4 * (j * pd->dst.stride + i)];
-  src = &p->src.buf[4 * (j * p->src.stride + i)];
+  dst = &pd->dst.buf[4 * (j * dst_stride + i)];
+  src = &p->src.buf[4 * (j * src_stride + i)];
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
   // if (x->optimize)
@@ -551,22 +556,19 @@
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
                               x->skip_encode ? src : dst,
-                              x->skip_encode ? p->src.stride : pd->dst.stride,
-                              dst, pd->dst.stride, i, j, plane);
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
         vp9_subtract_block(32, 32, src_diff, diff_stride,
-                           src, p->src.stride, dst, pd->dst.stride);
-        if (x->use_lp32x32fdct)
-          vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
-        else
-          vp9_fdct32x32(src_diff, coeff, diff_stride);
+                           src, src_stride, dst, dst_stride);
+        fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
         vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                              p->quant, p->quant_shift, qcoeff, dqcoeff,
                              pd->dequant, p->zbin_extra, eob, scan_order->scan,
                              scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
-        vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_16X16:
       tx_type = get_tx_type_16x16(pd->plane_type, xd);
@@ -574,11 +576,11 @@
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
                               x->skip_encode ? src : dst,
-                              x->skip_encode ? p->src.stride : pd->dst.stride,
-                              dst, pd->dst.stride, i, j, plane);
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
         vp9_subtract_block(16, 16, src_diff, diff_stride,
-                           src, p->src.stride, dst, pd->dst.stride);
+                           src, src_stride, dst, dst_stride);
         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                        p->quant, p->quant_shift, qcoeff, dqcoeff,
@@ -586,7 +588,7 @@
                        scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
-        vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_8X8:
       tx_type = get_tx_type_8x8(pd->plane_type, xd);
@@ -594,11 +596,11 @@
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
                               x->skip_encode ? src : dst,
-                              x->skip_encode ? p->src.stride : pd->dst.stride,
-                              dst, pd->dst.stride, i, j, plane);
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
         vp9_subtract_block(8, 8, src_diff, diff_stride,
-                           src, p->src.stride, dst, pd->dst.stride);
+                           src, src_stride, dst, dst_stride);
         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
                        p->quant_shift, qcoeff, dqcoeff,
@@ -606,7 +608,7 @@
                        scan_order->iscan);
       }
       if (!x->skip_encode && *eob)
-        vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_4X4:
       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
@@ -618,12 +620,12 @@
 
       vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
                               x->skip_encode ? src : dst,
-                              x->skip_encode ? p->src.stride : pd->dst.stride,
-                              dst, pd->dst.stride, i, j, plane);
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
 
       if (!x->skip_recode) {
         vp9_subtract_block(4, 4, src_diff, diff_stride,
-                           src, p->src.stride, dst, pd->dst.stride);
+                           src, src_stride, dst, dst_stride);
         if (tx_type != DCT_DCT)
           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
         else
@@ -639,29 +641,29 @@
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
           // which is significant (not just an optimization) for the lossless
           // case.
-          xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob);
+          xd->itxm_add(dqcoeff, dst, dst_stride, *eob);
         else
-          vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type);
+          vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
       }
       break;
     default:
       assert(0);
   }
   if (*eob)
-    *(args->skip_coeff) = 0;
+    *(args->skip) = 0;
 }
 
 void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                            unsigned char *skip_coeff) {
-  struct encode_b_args arg = {x, NULL, skip_coeff};
+                            unsigned char *skip) {
+  struct encode_b_args arg = {x, NULL, skip};
   encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
 }
 
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  struct encode_b_args arg = {x, NULL, &xd->mi_8x8[0]->mbmi.skip_coeff};
+  struct encode_b_args arg = {x, NULL, &xd->mi_8x8[0]->mbmi.skip};
 
   vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra,
                                          &arg);

diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index cf7097d..515935f 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h

@@ -32,7 +32,7 @@
 
 void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                            unsigned char *skip_coeff);
+                            unsigned char *skip);
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 5efb00a..ddb901d 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -65,7 +65,7 @@
 
   double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
 
-  for (i = 0; i < QINDEX_RANGE; i++) {
+  for (i = 0; i < QINDEX_RANGE; ++i) {
     if (target_q <= vp9_convert_qindex_to_q(i)) {
       ret_val = i;
       break;
@@ -106,12 +106,12 @@
 }
 
 
-// Read frame stats at an offset from the current position
+// Read frame stats at an offset from the current position.
 static int read_frame_stats(const struct twopass_rc *p,
                             FIRSTPASS_STATS *frame_stats, int offset) {
   const FIRSTPASS_STATS *fps_ptr = p->stats_in;
 
-  // Check legality of offset
+  // Check legality of offset.
   if (offset >= 0) {
     if (&fps_ptr[offset] >= p->stats_in_end)
       return EOF;
@@ -144,7 +144,6 @@
 
 // TEMP debug code
 #if OUTPUT_FPF
-
   {
     FILE *fpfile;
     fpfile = fopen("firstpass.stt", "a");
@@ -377,7 +376,6 @@
   const int src_stride = x->plane[0].src.stride;
   const uint8_t *const ref = xd->plane[0].pre[0].buf;
   const int ref_stride = xd->plane[0].pre[0].stride;
-
   unsigned int sse;
   vp9_variance_fn_t fn = get_block_variance_fn(xd->mi_8x8[0]->mbmi.sb_type);
   fn(src, src_stride, ref, ref_stride, &sse);
@@ -398,18 +396,18 @@
   int new_mv_mode_penalty = 256;
   const int quart_frm = MIN(cpi->common.width, cpi->common.height);
 
-  // refine the motion search range accroding to the frame dimension
-  // for first pass test
+  // Refine the motion search range according to the frame dimension
+  // for first pass test.
   while ((quart_frm << sr) < MAX_FULL_PEL_VAL)
-    sr++;
+    ++sr;
 
   step_param += sr;
   further_steps -= sr;
 
-  // override the default variance function to use MSE
+  // Override the default variance function to use MSE.
   v_fn_ptr.vf = get_block_variance_fn(bsize);
 
-  // Initial step/diamond search centred on best mv
+  // Center the initial step/diamond search on best mv.
   tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
                                     step_param,
                                     x->sadperbit16, &num00, &v_fn_ptr,
@@ -424,15 +422,15 @@
     best_mv->col = tmp_mv.col;
   }
 
-  // Further step/diamond searches as necessary
+  // Carry out further step/diamond searches as necessary.
   n = num00;
   num00 = 0;
 
   while (n < further_steps) {
-    n++;
+    ++n;
 
     if (num00) {
-      num00--;
+      --num00;
     } else {
       tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
                                         step_param + n, x->sadperbit16,
@@ -497,14 +495,14 @@
   struct twopass_rc *const twopass = &cpi->twopass;
   const MV zero_mv = {0, 0};
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
   setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
   setup_dst_planes(xd, new_yv12, 0, 0);
 
   xd->mi_8x8 = cm->mi_grid_visible;
-  xd->mi_8x8[0] = cm->mi;  // required for vp9_frame_init_quantizer
+  xd->mi_8x8[0] = cm->mi;
 
   vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
@@ -521,34 +519,32 @@
   vp9_init_mv_probs(cm);
   vp9_initialize_rd_consts(cpi);
 
-  // tiling is ignored in the first pass
+  // Tiling is ignored in the first pass.
   vp9_tile_init(&tile, cm, 0, 0);
 
-  // for each macroblock row in image
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
     int_mv best_ref_mv;
 
     best_ref_mv.as_int = 0;
 
-    // reset above block coeffs
+    // Reset above block coeffs.
     xd->up_available = (mb_row != 0);
     recon_yoffset = (mb_row * recon_y_stride * 16);
     recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
 
     // Set up limit values for motion vectors to prevent them extending
-    // outside the UMV borders
+    // outside the UMV borders.
     x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
     x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
                     + BORDER_MV_PIXELS_B16;
 
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
       int this_error;
       const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
       double error_weight = 1.0;
       const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
 
-      vp9_clear_system_state();  // __asm emms;
+      vp9_clear_system_state();
 
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
@@ -566,15 +562,15 @@
         error_weight = vp9_vaq_inv_q_ratio(energy);
       }
 
-      // do intra 16x16 prediction
+      // Do intra 16x16 prediction.
       this_error = vp9_encode_intra(x, use_dc_pred);
       if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-        vp9_clear_system_state();  // __asm emms;
-        this_error *= error_weight;
+        vp9_clear_system_state();
+        this_error = (int)(this_error * error_weight);
       }
 
-      // intrapenalty below deals with situations where the intra and inter
-      // error scores are very low (eg a plain black frame).
+      // Intrapenalty below deals with situations where the intra and inter
+      // error scores are very low (e.g. a plain black frame).
       // We do not have special cases in first pass for 0,0 and nearest etc so
       // all inter modes carry an overhead cost estimate for the mv.
       // When the error score is very low this causes us to pick all or lots of
@@ -582,7 +578,7 @@
       // This penalty adds a cost matching that of a 0,0 mv to the intra case.
       this_error += intrapenalty;
 
-      // Cumulative intra error total
+      // Accumulate the intra error.
       intra_error += (int64_t)this_error;
 
       // Set up limit values for motion vectors to prevent them extending
@@ -590,23 +586,23 @@
       x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
       x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
 
-      // Other than for the first frame do a motion search
+      // Other than for the first frame do a motion search.
       if (cm->current_video_frame > 0) {
         int tmp_err, motion_error;
         int_mv mv, tmp_mv;
 
         xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
         motion_error = zz_motion_search(cpi, x);
-        // Simple 0,0 motion with no mv overhead
+        // Assume 0,0 motion with no mv overhead.
         mv.as_int = tmp_mv.as_int = 0;
 
         // Test last reference frame using the previous best mv as the
-        // starting point (best reference) for the search
+        // starting point (best reference) for the search.
         first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
                                  &motion_error);
         if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-          vp9_clear_system_state();  // __asm emms;
-          motion_error *= error_weight;
+          vp9_clear_system_state();
+          motion_error = (int)(motion_error * error_weight);
         }
 
         // If the current best reference mv is not centered on 0,0 then do a 0,0
@@ -616,8 +612,8 @@
           first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
                                    &tmp_err);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-            vp9_clear_system_state();  // __asm emms;
-            tmp_err *= error_weight;
+            vp9_clear_system_state();
+            tmp_err = (int)(tmp_err * error_weight);
           }
 
           if (tmp_err < motion_error) {
@@ -626,9 +622,9 @@
           }
         }
 
-        // Experimental search in an older reference frame
+        // Search in an older reference frame.
         if (cm->current_video_frame > 1) {
-          // Simple 0,0 motion with no mv overhead
+          // Assume 0,0 motion with no mv overhead.
           int gf_motion_error;
 
           xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
@@ -637,22 +633,22 @@
           first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
                                    &gf_motion_error);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-            vp9_clear_system_state();  // __asm emms;
-            gf_motion_error *= error_weight;
+            vp9_clear_system_state();
+            gf_motion_error = (int)(gf_motion_error * error_weight);
           }
 
           if (gf_motion_error < motion_error && gf_motion_error < this_error)
-            second_ref_count++;
+            ++second_ref_count;
 
-          // Reset to last frame as reference buffer
+          // Reset to last frame as reference buffer.
           xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
           xd->plane[1].pre[0].buf = lst_yv12->u_buffer + recon_uvoffset;
           xd->plane[2].pre[0].buf = lst_yv12->v_buffer + recon_uvoffset;
 
-          // In accumulating a score for the older reference frame
-          // take the best of the motion predicted score and
-          // the intra coded error (just as will be done for)
-          // accumulation of "coded_error" for the last frame.
+          // In accumulating a score for the older reference frame take the
+          // best of the motion predicted score and the intra coded error
+          // (just as will be done for) accumulation of "coded_error" for
+          // the last frame.
           if (gf_motion_error < this_error)
             sr_coded_error += gf_motion_error;
           else
@@ -660,17 +656,16 @@
         } else {
           sr_coded_error += motion_error;
         }
-        /* Intra assumed best */
+        // Start by assuming that intra mode is best.
         best_ref_mv.as_int = 0;
 
         if (motion_error <= this_error) {
-          // Keep a count of cases where the inter and intra were
-          // very close and very low. This helps with scene cut
-          // detection for example in cropped clips with black bars
-          // at the sides or top and bottom.
+          // Keep a count of cases where the inter and intra were very close
+          // and very low. This helps with scene cut detection for example in
+          // cropped clips with black bars at the sides or top and bottom.
           if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
               this_error < 2 * intrapenalty)
-            neutral_count++;
+            ++neutral_count;
 
           mv.as_mv.row *= 8;
           mv.as_mv.col *= 8;
@@ -687,43 +682,42 @@
           sum_mvc_abs += abs(mv.as_mv.col);
           sum_mvrs += mv.as_mv.row * mv.as_mv.row;
           sum_mvcs += mv.as_mv.col * mv.as_mv.col;
-          intercount++;
+          ++intercount;
 
           best_ref_mv.as_int = mv.as_int;
 
-          // Was the vector non-zero
           if (mv.as_int) {
-            mvcount++;
+            ++mvcount;
 
-            // Was it different from the last non zero vector
+            // Non-zero vector, was it different from the last non zero vector?
             if (mv.as_int != lastmv_as_int)
-              new_mv_count++;
+              ++new_mv_count;
             lastmv_as_int = mv.as_int;
 
-            // Does the Row vector point inwards or outwards
+            // Does the row vector point inwards or outwards?
             if (mb_row < cm->mb_rows / 2) {
               if (mv.as_mv.row > 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
               else if (mv.as_mv.row < 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
             } else if (mb_row > cm->mb_rows / 2) {
               if (mv.as_mv.row > 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
               else if (mv.as_mv.row < 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
             }
 
-            // Does the Row vector point inwards or outwards
+            // Does the col vector point inwards or outwards?
             if (mb_col < cm->mb_cols / 2) {
               if (mv.as_mv.col > 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
               else if (mv.as_mv.col < 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
             } else if (mb_col > cm->mb_cols / 2) {
               if (mv.as_mv.col > 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
               else if (mv.as_mv.col < 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
             }
           }
         }
@@ -732,7 +726,7 @@
       }
       coded_error += (int64_t)this_error;
 
-      // adjust to the next column of macroblocks
+      // Adjust to the next column of MBs.
       x->plane[0].src.buf += 16;
       x->plane[1].src.buf += uv_mb_height;
       x->plane[2].src.buf += uv_mb_height;
@@ -741,24 +735,24 @@
       recon_uvoffset += uv_mb_height;
     }
 
-    // adjust to the next row of mbs
+    // Adjust to the next row of MBs.
     x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
     x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride -
                            uv_mb_height * cm->mb_cols;
     x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride -
                            uv_mb_height * cm->mb_cols;
 
-    vp9_clear_system_state();  // __asm emms;
+    vp9_clear_system_state();
   }
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
   {
     FIRSTPASS_STATS fps;
 
     fps.frame = cm->current_video_frame;
-    fps.intra_error = intra_error >> 8;
-    fps.coded_error = coded_error >> 8;
-    fps.sr_coded_error = sr_coded_error >> 8;
+    fps.intra_error = (double)(intra_error >> 8);
+    fps.coded_error = (double)(coded_error >> 8);
+    fps.sr_coded_error = (double)(sr_coded_error >> 8);
     fps.ssim_weighted_pred_err = fps.coded_error * simple_weight(cpi->Source);
     fps.count = 1.0;
     fps.pcnt_inter = (double)intercount / cm->MBs;
@@ -792,14 +786,14 @@
     // cpi->source_time_stamp.
     fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
 
-    // don't want to do output stats with a stack variable!
+    // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;
     output_stats(cpi, cpi->output_pkt_list, &twopass->this_frame_stats);
     accumulate_stats(&twopass->total_stats, &fps);
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
-  // the prediction is good enough... but also dont allow it to lag too far
+  // the prediction is good enough... but also don't allow it to lag too far.
   if ((twopass->sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
        (twopass->this_frame_stats.pcnt_inter > 0.20) &&
@@ -808,9 +802,9 @@
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     twopass->sr_update_lag = 1;
   } else {
-    twopass->sr_update_lag++;
+    ++twopass->sr_update_lag;
   }
-  // swap frame pointers so last frame refers to the frame we just compressed
+  // Swap frame pointers so last frame refers to the frame we just compressed.
   swap_yv12(lst_yv12, new_yv12);
 
   vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
@@ -820,7 +814,7 @@
   if (cm->current_video_frame == 0)
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
 
-  // use this to see what the first pass reconstruction looks like
+  // Use this to see what the first pass reconstruction looks like.
   if (0) {
     char filename[512];
     FILE *recon_file;
@@ -836,15 +830,11 @@
     fclose(recon_file);
   }
 
-  cm->current_video_frame++;
+  ++cm->current_video_frame;
 }
 
-// Estimate a cost per mb attributable to overheads such as the coding of
-// modes and motion vectors.
-// Currently simplistic in its assumptions for testing.
-//
-
-
+// Estimate a cost per mb attributable to overheads such as the coding of modes
+// and motion vectors. This currently makes simplistic assumptions for testing.
 static double bitcost(double prob) {
   return -(log(prob) / log(2.0));
 }
@@ -867,18 +857,17 @@
   motion_cost = bitcost(av_pct_motion);
   intra_cost = bitcost(av_intra);
 
-  // Estimate of extra bits per mv overhead for mbs
-  // << 9 is the normalization to the (bits * 512) used in vp9_rc_bits_per_mb
+  // Estimate the number of extra bits per mv overhead for mbs. We shift (<< 9)
+  // to match the scaling of number of bits by 512.
   mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
 
-  // Crude estimate of overhead cost from modes
-  // << 9 is the normalization to (bits * 512) used in vp9_rc_bits_per_mb
+  // Produce a crude estimate of the overhead cost from modes. We shift (<< 9)
+  // to match the scaling of number of bits by 512.
   mode_cost =
     (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
            (av_pct_motion * motion_cost) +
            (av_intra * intra_cost)) * cpi->common.MBs) << 9;
 
-  // return mv_cost + mode_cost;
   // TODO(paulwilkins): Fix overhead costs for extended Q range.
 #endif
   return 0;
@@ -895,7 +884,7 @@
   const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.0125 + pt_low,
                                 pt_high);
 
-  // Calculate correction factor
+  // Calculate correction factor.
   if (power_term < 1.0)
     assert(error_term >= 0.0);
 
@@ -921,7 +910,7 @@
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
-  for (q = rc->best_quality; q < rc->worst_quality; q++) {
+  for (q = rc->best_quality; q < rc->worst_quality; ++q) {
     const double err_correction_factor = calc_correction_factor(err_per_mb,
                                              ERR_DIVISOR, 0.5, 0.90, q);
     const int bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
@@ -954,11 +943,11 @@
   twopass->total_stats = *twopass->stats_in_end;
   twopass->total_left_stats = twopass->total_stats;
 
-  // each frame can have a different duration, as the frame rate in the source
-  // isn't guaranteed to be constant.   The frame rate prior to the first frame
-  // encoded in the second pass is a guess.  However the sum duration is not.
-  // Its calculated based on the actual durations of all frames from the first
-  // pass.
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
   vp9_new_framerate(cpi, 10000000.0 * twopass->total_stats.count /
                         twopass->total_stats.duration);
 
@@ -969,18 +958,18 @@
   // Calculate a minimum intra value to be used in determining the IIratio
   // scores used in the second pass. We have this minimum to make sure
   // that clips that are static but "low complexity" in the intra domain
-  // are still boosted appropriately for KF/GF/ARF
+  // are still boosted appropriately for KF/GF/ARF.
   twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
   twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
 
-  // This variable monitors how far behind the second ref update is lagging
+  // This variable monitors how far behind the second ref update is lagging.
   twopass->sr_update_lag = 1;
 
   // Scan the first pass file and calculate an average Intra / Inter error score
   // ratio for the sequence.
   {
     double sum_iiratio = 0.0;
-    start_pos = twopass->stats_in;  // Note the starting "file" position.
+    start_pos = twopass->stats_in;
 
     while (input_stats(twopass, &this_frame) != EOF) {
       const double iiratio = this_frame.intra_error /
@@ -991,7 +980,6 @@
     twopass->avg_iiratio = sum_iiratio /
         DOUBLE_DIVIDE_CHECK((double)twopass->total_stats.count);
 
-    // Reset file position
     reset_fpf_position(twopass, start_pos);
   }
 
@@ -1001,7 +989,7 @@
     double av_error = twopass->total_stats.ssim_weighted_pred_err /
                       DOUBLE_DIVIDE_CHECK(twopass->total_stats.count);
 
-    start_pos = twopass->stats_in;  // Note starting "file" position
+    start_pos = twopass->stats_in;
 
     twopass->modified_error_total = 0.0;
     twopass->modified_error_min =
@@ -1022,8 +1010,8 @@
 void vp9_end_second_pass(VP9_COMP *cpi) {
 }
 
-// This function gives and estimate of how badly we believe
-// the prediction quality is decaying from frame to frame.
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
 static double get_prediction_decay_rate(const VP9_COMMON *cm,
                                         const FIRSTPASS_STATS *next_frame) {
   // Look at the observed drop in prediction quality between the last frame
@@ -1056,9 +1044,8 @@
     FIRSTPASS_STATS *position = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_next_frame;
 
-    // Look ahead a few frames to see if static condition
-    // persists...
-    for (j = 0; j < still_interval; j++) {
+    // Look ahead a few frames to see if static condition persists...
+    for (j = 0; j < still_interval; ++j) {
       if (EOF == input_stats(&cpi->twopass, &tmp_next_frame))
         break;
 
@@ -1068,7 +1055,7 @@
 
     reset_fpf_position(&cpi->twopass, position);
 
-    // Only if it does do we signal a transition to still
+    // Only if it does do we signal a transition to still.
     if (j == still_interval)
       trans_to_still = 1;
   }
@@ -1078,7 +1065,7 @@
 
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
-// reflect this
+// reflect this.
 static int detect_flash(const struct twopass_rc *twopass, int offset) {
   FIRSTPASS_STATS next_frame;
 
@@ -1091,7 +1078,7 @@
     // brief break in prediction (such as a flash) but subsequent frames
     // are reasonably well predicted by an earlier (pre flash) frame.
     // The recovery after a flash is indicated by a high pcnt_second_ref
-    // comapred to pcnt_inter.
+    // compared to pcnt_inter.
     if (next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
         next_frame.pcnt_second_ref >= 0.5)
       flash_detected = 1;
@@ -1100,7 +1087,7 @@
   return flash_detected;
 }
 
-// Update the motion related elements to the GF arf boost calculation
+// Update the motion related elements to the GF arf boost calculation.
 static void accumulate_frame_motion_stats(
   FIRSTPASS_STATS *this_frame,
   double *this_frame_mv_in_out,
@@ -1112,13 +1099,13 @@
   // Accumulate motion stats.
   motion_pct = this_frame->pcnt_motion;
 
-  // Accumulate Motion In/Out of frame stats
+  // Accumulate Motion In/Out of frame stats.
   *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
   *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
   *abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * motion_pct);
 
   // Accumulate a measure of how uniform (or conversely how random)
-  // the motion field is. (A ratio of absmv / mv)
+  // the motion field is (a ratio of absmv / mv).
   if (motion_pct > 0.05) {
     const double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
                            DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
@@ -1141,7 +1128,7 @@
                                double this_frame_mv_in_out) {
   double frame_boost;
 
-  // Underlying boost factor is based on inter intra error ratio
+  // Underlying boost factor is based on inter intra error ratio.
   if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
     frame_boost = (IIFACTOR * this_frame->intra_error /
                    DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
@@ -1149,13 +1136,12 @@
     frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
                    DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
 
-  // Increase boost for frames where new data coming into frame
-  // (eg zoom out). Slightly reduce boost if there is a net balance
-  // of motion out of the frame (zoom in).
-  // The range for this_frame_mv_in_out is -1.0 to +1.0
+  // Increase boost for frames where new data coming into frame (e.g. zoom out).
+  // Slightly reduce boost if there is a net balance of motion out of the frame
+  // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0.
   if (this_frame_mv_in_out > 0.0)
     frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
-  // In extreme case boost is halved
+  // In the extreme case the boost is halved.
   else
     frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
 
@@ -1177,12 +1163,12 @@
   int arf_boost;
   int flash_detected = 0;
 
-  // Search forward from the proposed arf/next gf position
-  for (i = 0; i < f_frames; i++) {
+  // Search forward from the proposed arf/next gf position.
+  for (i = 0; i < f_frames; ++i) {
     if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
-    // Update the motion related elements to the boost calculation
+    // Update the motion related elements to the boost calculation.
     accumulate_frame_motion_stats(&this_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
@@ -1193,7 +1179,7 @@
     flash_detected = detect_flash(twopass, i + offset) ||
                      detect_flash(twopass, i + offset + 1);
 
-    // Cumulative effect of prediction quality decay
+    // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
       decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
@@ -1206,7 +1192,7 @@
 
   *f_boost = (int)boost_score;
 
-  // Reset for backward looking loop
+  // Reset for backward looking loop.
   boost_score = 0.0;
   mv_ratio_accumulator = 0.0;
   decay_accumulator = 1.0;
@@ -1214,12 +1200,12 @@
   mv_in_out_accumulator = 0.0;
   abs_mv_in_out_accumulator = 0.0;
 
-  // Search backward towards last gf position
-  for (i = -1; i >= -b_frames; i--) {
+  // Search backward towards last gf position.
+  for (i = -1; i >= -b_frames; --i) {
     if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
-    // Update the motion related elements to the boost calculation
+    // Update the motion related elements to the boost calculation.
     accumulate_frame_motion_stats(&this_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
@@ -1230,7 +1216,7 @@
     flash_detected = detect_flash(twopass, i + offset) ||
                      detect_flash(twopass, i + offset + 1);
 
-    // Cumulative effect of prediction quality decay
+    // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
       decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
@@ -1280,8 +1266,7 @@
     return;
   }
 
-  // ARF Group: work out the ARF schedule.
-  // Mark ARF frames as negative.
+  // ARF Group: Work out the ARF schedule and mark ARF frames as negative.
   if (end < 0) {
     // printf("start:%d end:%d\n", -end, -end);
     // ARF frame is at the end of the range.
@@ -1404,14 +1389,14 @@
   double decay_accumulator = 1.0;
   double zero_motion_accumulator = 1.0;
 
-  double loop_decay_rate = 1.00;          // Starting decay rate
+  double loop_decay_rate = 1.00;
   double last_loop_decay_rate = 1.00;
 
   double this_frame_mv_in_out = 0.0;
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
   double mv_ratio_accumulator_thresh;
-  const int max_bits = frame_max_bits(cpi);     // Max for a single frame
+  const int max_bits = frame_max_bits(cpi);  // Max bits for a single frame.
 
   unsigned int allow_alt_ref = cpi->oxcf.play_alternate &&
                                cpi->oxcf.lag_in_frames;
@@ -1424,19 +1409,19 @@
 
   twopass->gf_group_bits = 0;
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   start_pos = twopass->stats_in;
 
   // Load stats for the current frame.
   mod_frame_err = calculate_modified_err(cpi, this_frame);
 
-  // Note the error of the frame at the start of the group (this will be
-  // the GF frame error if we code a normal gf
+  // Note the error of the frame at the start of the group. This will be
+  // the GF frame error if we code a normal gf.
   gf_first_frame_err = mod_frame_err;
 
   // If this is a key frame or the overlay from a previous arf then
-  // The error score / cost of this frame has already been accounted for.
+  // the error score / cost of this frame has already been accounted for.
   if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
     gf_group_err -= gf_first_frame_err;
 
@@ -1458,9 +1443,9 @@
 
   i = 0;
   while (i < twopass->static_scene_max_gf_interval && i < rc->frames_to_key) {
-    i++;    // Increment the loop counter
+    ++i;
 
-    // Accumulate error score of frames in this gf group
+    // Accumulate error score of frames in this gf group.
     mod_frame_err = calculate_modified_err(cpi, this_frame);
     gf_group_err += mod_frame_err;
 
@@ -1471,13 +1456,13 @@
     // quality back to an earlier frame is then restored.
     flash_detected = detect_flash(twopass, 0);
 
-    // Update the motion related elements to the boost calculation
+    // Update the motion related elements to the boost calculation.
     accumulate_frame_motion_stats(&next_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
                                   &mv_ratio_accumulator);
 
-    // Cumulative effect of prediction quality decay
+    // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
       last_loop_decay_rate = loop_decay_rate;
       loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
@@ -1490,8 +1475,8 @@
                                       next_frame.pcnt_motion;
       }
 
-      // Break clause to detect very still sections after motion
-      // (for example a static image after a fade or other transition).
+      // Break clause to detect very still sections after motion. For example,
+      // a static image after a fade or other transition.
       if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
                                      last_loop_decay_rate)) {
         allow_alt_ref = 0;
@@ -1499,16 +1484,16 @@
       }
     }
 
-    // Calculate a boost number for this frame
+    // Calculate a boost number for this frame.
     boost_score += (decay_accumulator *
        calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
 
     // Break out conditions.
     if (
-      // Break at cpi->max_gf_interval unless almost totally static
+      // Break at cpi->max_gf_interval unless almost totally static.
       (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||
       (
-        // Don't break out with a very short interval
+        // Don't break out with a very short interval.
         (i > MIN_GF_INTERVAL) &&
         ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
         (!flash_detected) &&
@@ -1527,10 +1512,10 @@
 
   twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
 
-  // Don't allow a gf too near the next kf
+  // Don't allow a gf too near the next kf.
   if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
     while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
-      i++;
+      ++i;
 
       if (EOF == input_stats(twopass, this_frame))
         break;
@@ -1560,20 +1545,14 @@
   else
     rc->baseline_gf_interval = i;
 
-  // Should we use the alternate reference frame
+  // Should we use the alternate reference frame.
   if (allow_alt_ref &&
       (i < cpi->oxcf.lag_in_frames) &&
       (i >= MIN_GF_INTERVAL) &&
-      // for real scene cuts (not forced kfs) dont allow arf very near kf.
+      // For real scene cuts (not forced kfs) don't allow arf very near kf.
       (rc->next_key_frame_forced ||
-        (i <= (rc->frames_to_key - MIN_GF_INTERVAL))) &&
-      ((next_frame.pcnt_inter > 0.75) ||
-       (next_frame.pcnt_second_ref > 0.5)) &&
-      ((mv_in_out_accumulator / (double)i > -0.2) ||
-       (mv_in_out_accumulator > -2.0)) &&
-      (boost_score > 100)) {
-
-    // Alternative boost calculation for alt ref
+      (i <= (rc->frames_to_key - MIN_GF_INTERVAL)))) {
+    // Calculate the boost for alt ref.
     rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
                                    &b_boost);
     rc->source_alt_ref_pending = 1;
@@ -1635,7 +1614,7 @@
 #endif
 #endif
 
-  // Calculate the bits to be allocated to the group as a whole
+  // Calculate the bits to be allocated to the group as a whole.
   if (twopass->kf_group_bits > 0 && twopass->kf_group_error_left > 0) {
     twopass->gf_group_bits = (int64_t)(cpi->twopass.kf_group_bits *
                 (gf_group_err / cpi->twopass.kf_group_error_left));
@@ -1647,11 +1626,11 @@
      twopass->kf_group_bits : twopass->gf_group_bits;
 
   // Clip cpi->twopass.gf_group_bits based on user supplied data rate
-  // variability limit (cpi->oxcf.two_pass_vbrmax_section)
+  // variability limit, cpi->oxcf.two_pass_vbrmax_section.
   if (twopass->gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
     twopass->gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
 
-  // Reset the file position
+  // Reset the file position.
   reset_fpf_position(twopass, start_pos);
 
   // Assign  bits to the arf or gf.
@@ -1663,7 +1642,7 @@
 
     int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100;
 
-    // Set max and minimum boost and hence minimum allocation
+    // Set max and minimum boost and hence minimum allocation.
     boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
 
     if (rc->source_alt_ref_pending && i == 0)
@@ -1671,7 +1650,7 @@
     else
       allocation_chunks = (rc->baseline_gf_interval * 100) + (boost - 100);
 
-    // Prevent overflow
+    // Prevent overflow.
     if (boost > 1023) {
       int divisor = boost >> 10;
       boost /= divisor;
@@ -1679,13 +1658,13 @@
     }
 
     // Calculate the number of bits to be spent on the gf or arf based on
-    // the boost number
+    // the boost number.
     gf_bits = (int)((double)boost * (twopass->gf_group_bits /
                   (double)allocation_chunks));
 
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
-    // based on the error score of the frame itself
+    // based on the error score of the frame itself.
     if (rc->baseline_gf_interval < 1 ||
         mod_frame_err < gf_group_err / (double)rc->baseline_gf_interval) {
       double alt_gf_grp_bits = (double)twopass->kf_group_bits  *
@@ -1709,7 +1688,7 @@
         gf_bits = alt_gf_bits;
     }
 
-    // Dont allow a negative value for gf_bits
+    // Don't allow a negative value for gf_bits.
     if (gf_bits < 0)
       gf_bits = 0;
 
@@ -1719,27 +1698,27 @@
     if (i == 1 ||
         (!rc->source_alt_ref_pending &&
          cpi->common.frame_type != KEY_FRAME)) {
-      // Per frame bit target for this frame
+      // Calculate the per frame bit target for this frame.
       vp9_rc_set_frame_target(cpi, gf_bits);
     }
   }
 
   {
-    // Adjust KF group bits and error remaining
+    // Adjust KF group bits and error remaining.
     twopass->kf_group_error_left -= (int64_t)gf_group_err;
     twopass->kf_group_bits -= twopass->gf_group_bits;
 
     if (twopass->kf_group_bits < 0)
       twopass->kf_group_bits = 0;
 
-    // If this is an arf update we want to remove the score for the
-    // overlay frame at the end which will usually be very cheap to code.
-    // The overlay frame has already in effect been coded so we want to spread
-    // the remaining bits amoung the other frames/
+    // If this is an arf update we want to remove the score for the overlay
+    // frame at the end which will usually be very cheap to code.
+    // The overlay frame has already, in effect, been coded so we want to spread
+    // the remaining bits among the other frames.
     // For normal GFs remove the score for the GF itself unless this is
     // also a key frame in which case it has already been accounted for.
     if (rc->source_alt_ref_pending) {
-      twopass->gf_group_error_left = (int64_t)gf_group_err - mod_frame_err;
+      twopass->gf_group_error_left = (int64_t)(gf_group_err - mod_frame_err);
     } else if (cpi->common.frame_type != KEY_FRAME) {
       twopass->gf_group_error_left = (int64_t)(gf_group_err
                                                    - gf_first_frame_err);
@@ -1753,7 +1732,7 @@
       twopass->gf_group_bits = 0;
 
     // This condition could fail if there are two kfs very close together
-    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+    // despite MIN_GF_INTERVAL and would cause a divide by 0 in the
     // calculation of alt_extra_bits.
     if (rc->baseline_gf_interval >= 3) {
       const int boost = rc->source_alt_ref_pending ? b_boost : rc->gfu_boost;
@@ -1773,7 +1752,7 @@
     zero_stats(&sectionstats);
     reset_fpf_position(twopass, start_pos);
 
-    for (i = 0; i < rc->baseline_gf_interval; i++) {
+    for (i = 0; i < rc->baseline_gf_interval; ++i) {
       input_stats(twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
@@ -1829,20 +1808,18 @@
                              const FIRSTPASS_STATS *next_frame) {
   int is_viable_kf = 0;
 
-  // Does the frame satisfy the primary criteria of a key frame
-  //      If so, then examine how well it predicts subsequent frames
+  // Does the frame satisfy the primary criteria of a key frame?
+  // If so, then examine how well it predicts subsequent frames.
   if ((this_frame->pcnt_second_ref < 0.10) &&
       (next_frame->pcnt_second_ref < 0.10) &&
       ((this_frame->pcnt_inter < 0.05) ||
-       (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
+       (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < 0.35) &&
         ((this_frame->intra_error /
           DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
         ((fabs(last_frame->coded_error - this_frame->coded_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
-          .40) ||
+              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > 0.40) ||
          (fabs(last_frame->intra_error - this_frame->intra_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
-          .40) ||
+              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > 0.40) ||
          ((next_frame->intra_error /
            DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
     int i;
@@ -1856,37 +1833,34 @@
 
     local_next_frame = *next_frame;
 
-    // Note the starting file position so we can reset to it
+    // Note the starting file position so we can reset to it.
     start_pos = cpi->twopass.stats_in;
 
-    // Examine how well the key frame predicts subsequent frames
-    for (i = 0; i < 16; i++) {
+    // Examine how well the key frame predicts subsequent frames.
+    for (i = 0; i < 16; ++i) {
       double next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
                              DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
 
       if (next_iiratio > RMAX)
         next_iiratio = RMAX;
 
-      // Cumulative effect of decay in prediction quality
+      // Cumulative effect of decay in prediction quality.
       if (local_next_frame.pcnt_inter > 0.85)
         decay_accumulator *= local_next_frame.pcnt_inter;
       else
         decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
 
-      // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
-
-      // Keep a running total
+      // Keep a running total.
       boost_score += (decay_accumulator * next_iiratio);
 
-      // Test various breakout clauses
+      // Test various breakout clauses.
       if ((local_next_frame.pcnt_inter < 0.05) ||
           (next_iiratio < 1.5) ||
           (((local_next_frame.pcnt_inter -
              local_next_frame.pcnt_neutral) < 0.20) &&
            (next_iiratio < 3.0)) ||
           ((boost_score - old_boost_score) < 3.0) ||
-          (local_next_frame.intra_error < 200)
-         ) {
+          (local_next_frame.intra_error < 200)) {
         break;
       }
 
@@ -1933,23 +1907,23 @@
 
   vp9_zero(next_frame);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   start_position = twopass->stats_in;
   cpi->common.frame_type = KEY_FRAME;
 
-  // is this a forced key frame by interval
+  // Is this a forced key frame by interval.
   rc->this_key_frame_forced = rc->next_key_frame_forced;
 
-  // Clear the alt ref active flag as this can never be active on a key frame
+  // Clear the alt ref active flag as this can never be active on a key frame.
   rc->source_alt_ref_active = 0;
 
-  // Kf is always a gf so clear frames till next gf counter
+  // KF is always a GF so clear frames till next gf counter.
   rc->frames_till_gf_update_due = 0;
 
   rc->frames_to_key = 1;
 
-  // Take a copy of the initial frame details
+  // Take a copy of the initial frame details.
   first_frame = *this_frame;
 
   twopass->kf_group_bits = 0;        // Total bits available to kf group
@@ -1957,75 +1931,74 @@
 
   kf_mod_err = calculate_modified_err(cpi, this_frame);
 
-  // find the next keyframe
+  // Find the next keyframe.
   i = 0;
   while (twopass->stats_in < twopass->stats_in_end) {
-    // Accumulate kf group error
+    // Accumulate kf group error.
     kf_group_err += calculate_modified_err(cpi, this_frame);
 
-    // load a the next frame's stats
+    // Load the next frame's stats.
     last_frame = *this_frame;
     input_stats(twopass, this_frame);
 
     // Provided that we are not at the end of the file...
     if (cpi->oxcf.auto_key &&
         lookup_next_frame_stats(twopass, &next_frame) != EOF) {
-      // Normal scene cut check
+      // Check for a scene cut.
       if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
         break;
 
-
-      // How fast is prediction quality decaying
+      // How fast is the prediction quality decaying?
       loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
 
       // We want to know something about the recent past... rather than
-      // as used elsewhere where we are concened with decay in prediction
+      // as used elsewhere where we are concerned with decay in prediction
       // quality since the last GF or KF.
       recent_loop_decay[i % 8] = loop_decay_rate;
       decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++)
+      for (j = 0; j < 8; ++j)
         decay_accumulator *= recent_loop_decay[j];
 
       // Special check for transition or high motion followed by a
-      // to a static scene.
+      // static scene.
       if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i,
                                      loop_decay_rate, decay_accumulator))
         break;
 
-      // Step on to the next frame
-      rc->frames_to_key++;
+      // Step on to the next frame.
+      ++rc->frames_to_key;
 
       // If we don't have a real key frame within the next two
-      // forcekeyframeevery intervals then break out of the loop.
+      // key_frame_frequency intervals then break out of the loop.
       if (rc->frames_to_key >= 2 * (int)cpi->key_frame_frequency)
         break;
     } else {
-      rc->frames_to_key++;
+      ++rc->frames_to_key;
     }
-    i++;
+    ++i;
   }
 
   // If there is a max kf interval set by the user we must obey it.
   // We already breakout of the loop above at 2x max.
-  // This code centers the extra kf if the actual natural
-  // interval is between 1x and 2x
+  // This code centers the extra kf if the actual natural interval
+  // is between 1x and 2x.
   if (cpi->oxcf.auto_key &&
       rc->frames_to_key > (int)cpi->key_frame_frequency) {
     FIRSTPASS_STATS tmp_frame;
 
     rc->frames_to_key /= 2;
 
-    // Copy first frame details
+    // Copy first frame details.
     tmp_frame = first_frame;
 
-    // Reset to the start of the group
+    // Reset to the start of the group.
     reset_fpf_position(twopass, start_position);
 
     kf_group_err = 0;
 
-    // Rescan to get the correct error data for the forced kf group
-    for (i = 0; i < rc->frames_to_key; i++) {
-      // Accumulate kf group errors
+    // Rescan to get the correct error data for the forced kf group.
+    for (i = 0; i < rc->frames_to_key; ++i) {
+      // Accumulate kf group errors.
       kf_group_err += calculate_modified_err(cpi, &tmp_frame);
 
       // Load the next frame's stats.
@@ -2038,22 +2011,22 @@
     rc->next_key_frame_forced = 0;
   }
 
-  // Special case for the last key frame of the file
+  // Special case for the last key frame of the file.
   if (twopass->stats_in >= twopass->stats_in_end) {
-    // Accumulate kf group error
+    // Accumulate kf group error.
     kf_group_err += calculate_modified_err(cpi, this_frame);
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
   if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
-    // Max for a single normal frame (not key frame)
+    // Maximum number of bits for a single normal frame (not key frame).
     int max_bits = frame_max_bits(cpi);
 
-    // Maximum bits for the kf group
+    // Maximum number of bits allocated to the key frame group.
     int64_t max_grp_bits;
 
     // Default allocation based on bits left and relative
-    // complexity of the section
+    // complexity of the section.
     twopass->kf_group_bits = (int64_t)(twopass->bits_left *
        (kf_group_err / twopass->modified_error_left));
 
@@ -2064,7 +2037,7 @@
   } else {
     twopass->kf_group_bits = 0;
   }
-  // Reset the first pass file position
+  // Reset the first pass file position.
   reset_fpf_position(twopass, start_position);
 
   // Determine how big to make this keyframe based on how well the subsequent
@@ -2073,7 +2046,7 @@
   boost_score = 0.0;
 
   // Scan through the kf group collating various stats.
-  for (i = 0; i < rc->frames_to_key; i++) {
+  for (i = 0; i < rc->frames_to_key; ++i) {
     double r;
 
     if (EOF == input_stats(twopass, &next_frame))
@@ -2098,7 +2071,7 @@
       if (r > RMAX)
         r = RMAX;
 
-      // How fast is prediction quality decaying
+      // How fast is prediction quality decaying.
       if (!detect_flash(twopass, 0)) {
         loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
         decay_accumulator *= loop_decay_rate;
@@ -2116,7 +2089,7 @@
     zero_stats(&sectionstats);
     reset_fpf_position(twopass, start_position);
 
-    for (i = 0; i < rc->frames_to_key; i++) {
+    for (i = 0; i < rc->frames_to_key; ++i) {
       input_stats(twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
@@ -2127,10 +2100,10 @@
         DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
   }
 
-  // Reset the first pass file position
+  // Reset the first pass file position.
   reset_fpf_position(twopass, start_position);
 
-  // Work out how many bits to allocate for the key frame itself
+  // Work out how many bits to allocate for the key frame itself.
   if (1) {
     int kf_boost = (int)boost_score;
     int allocation_chunks;
@@ -2147,25 +2120,26 @@
     rc->kf_boost = kf_boost;
     twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
 
-    // We do three calculations for kf size.
-    // The first is based on the error score for the whole kf group.
-    // The second (optionally) on the key frames own error if this is
-    // smaller than the average for the group.
-    // The final one insures that the frame receives at least the
-    // allocation it would have received based on its own error score vs
-    // the error score remaining
-    // Special case if the sequence appears almost totaly static
-    // In this case we want to spend almost all of the bits on the
-    // key frame.
-    // cpi->rc.frames_to_key-1 because key frame itself is taken
-    // care of by kf_boost.
+    // Key frame size depends on:
+    // (1) the error score for the whole key frame group,
+    // (2) the key frames' own error if this is smaller than the
+    //     average for the group (optional),
+    // (3) insuring that the frame receives at least the allocation it would
+    //     have received based on its own error score vs the error score
+    //     remaining.
+    // Special case:
+    // If the sequence appears almost totally static we want to spend almost
+    // all of the bits on the key frame.
+    //
+    // We use (cpi->rc.frames_to_key - 1) below because the key frame itself is
+    // taken care of by kf_boost.
     if (zero_motion_accumulator >= 0.99) {
       allocation_chunks = ((rc->frames_to_key - 1) * 10) + kf_boost;
     } else {
       allocation_chunks = ((rc->frames_to_key - 1) * 100) + kf_boost;
     }
 
-    // Prevent overflow
+    // Prevent overflow.
     if (kf_boost > 1028) {
       int divisor = kf_boost >> 10;
       kf_boost /= divisor;
@@ -2175,7 +2149,7 @@
     twopass->kf_group_bits = (twopass->kf_group_bits < 0) ? 0
            : twopass->kf_group_bits;
 
-    // Calculate the number of bits to be spent on the key frame
+    // Calculate the number of bits to be spent on the key frame.
     twopass->kf_bits = (int)((double)kf_boost *
         ((double)twopass->kf_group_bits / allocation_chunks));
 
@@ -2194,9 +2168,9 @@
       if (twopass->kf_bits > alt_kf_bits)
         twopass->kf_bits = alt_kf_bits;
     } else {
-    // Else if it is much harder than other frames in the group make sure
-    // it at least receives an allocation in keeping with its relative
-    // error score
+      // Else if it is much harder than other frames in the group make sure
+      // it at least receives an allocation in keeping with its relative
+      // error score.
       alt_kf_bits = (int)((double)twopass->bits_left * (kf_mod_err /
                DOUBLE_DIVIDE_CHECK(twopass->modified_error_left)));
 
@@ -2209,7 +2183,7 @@
     vp9_rc_set_frame_target(cpi, twopass->kf_bits);
   }
 
-  // Note the total error score of the kf group minus the key frame itself
+  // Note the total error score of the kf group minus the key frame itself.
   twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
 
   // Adjust the count of total modified error left.
@@ -2227,7 +2201,7 @@
   } else {
     cm->frame_type = INTER_FRAME;
   }
-  // Do not use periodic key frames
+  // Do not use periodic key frames.
   cpi->rc.frames_to_key = INT_MAX;
 }
 
@@ -2266,13 +2240,6 @@
     twopass->active_worst_quality = tmp_q;
     rc->ni_av_qi = tmp_q;
     rc->avg_q = vp9_convert_qindex_to_q(tmp_q);
-
-    // Limit the maxq value returned subsequently.
-    // This increases the risk of overspend or underspend if the initial
-    // estimate for the clip is bad, but helps prevent excessive
-    // variation in Q, especially near the end of a clip
-    // where for example a small overspend may cause Q to crash
-    // adjust_maxq_qrange(cpi);
   }
   vp9_zero(this_frame);
   if (EOF == input_stats(twopass, &this_frame))
@@ -2281,19 +2248,19 @@
   this_frame_intra_error = this_frame.intra_error;
   this_frame_coded_error = this_frame.coded_error;
 
-  // keyframe and section processing !
+  // Keyframe and section processing.
   if (rc->frames_to_key == 0 ||
       (cm->frame_flags & FRAMEFLAGS_KEY)) {
-    // Define next KF group and assign bits to it
+    // Define next KF group and assign bits to it.
     this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
   } else {
     cm->frame_type = INTER_FRAME;
   }
 
-  // Is this a GF / ARF (Note that a KF is always also a GF)
+  // Is this frame a GF / ARF? (Note: a key frame is always also a GF).
   if (rc->frames_till_gf_update_due == 0) {
-    // Define next gf group and assign bits to it
+    // Define next gf group and assign bits to it.
     this_frame_copy = this_frame;
 
 #if CONFIG_MULTIPLE_ARF
@@ -2308,7 +2275,8 @@
 
     if (twopass->gf_zeromotion_pct > 995) {
       // As long as max_thresh for encode breakout is small enough, it is ok
-      // to enable it for show frame, i.e. set allow_encode_breakout to 2.
+      // to enable it for show frame, i.e. set allow_encode_breakout to
+      // ENCODE_BREAKOUT_LIMITED.
       if (!cm->show_frame)
         cpi->allow_encode_breakout = ENCODE_BREAKOUT_DISABLED;
       else
@@ -2318,8 +2286,8 @@
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     cpi->refresh_golden_frame = 1;
   } else {
-    // Otherwise this is an ordinary frame
-    // Assign bits from those allocated to the GF group
+    // Otherwise this is an ordinary frame.
+    // Assign bits from those allocated to the GF group.
     this_frame_copy =  this_frame;
     assign_std_frame_bits(cpi, &this_frame_copy);
   }
@@ -2341,7 +2309,7 @@
     target = vp9_rc_clamp_pframe_target_size(cpi, rc->this_frame_target);
   vp9_rc_set_frame_target(cpi, target);
 
-  // Update the total stats remaining structure
+  // Update the total stats remaining structure.
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }
 
@@ -2351,7 +2319,7 @@
 #else
   cpi->twopass.bits_left -= 8 * bytes_used;
   // Update bits left to the kf and gf groups to account for overshoot or
-  // undershoot on these frames
+  // undershoot on these frames.
   if (cm->frame_type == KEY_FRAME) {
     cpi->twopass.kf_group_bits += cpi->rc.this_frame_target -
         cpi->rc.projected_frame_size;

diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index c500986..44c1f90 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c

@@ -29,7 +29,6 @@
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
-  unsigned int best_err;
 
   const int tmp_col_min = x->mv_col_min;
   const int tmp_col_max = x->mv_col_max;
@@ -48,27 +47,22 @@
   ref_full.row = ref_mv->row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit,
-                            0, &v_fn_ptr, 0, ref_mv, dst_mv);
+  vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0,
+                 ref_mv, dst_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
   {
     int distortion;
     unsigned int sse;
-    best_err = cpi->find_fractional_mv_step(
-        x, dst_mv, ref_mv,
-        cpi->common.allow_high_precision_mv,
-        x->errorperbit, &v_fn_ptr,
-        0, cpi->sf.subpel_iters_per_step, NULL, NULL,
-        & distortion, &sse);
+    cpi->find_fractional_mv_step(
+        x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+        &v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, NULL, NULL, &distortion,
+        &sse);
   }
 
   vp9_set_mbmode_and_mvs(xd, NEWMV, dst_mv);
   vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
-  best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                          xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-                          INT_MAX);
 
   /* restore UMV window */
   x->mv_col_min = tmp_col_min;
@@ -76,7 +70,9 @@
   x->mv_row_min = tmp_row_min;
   x->mv_row_max = tmp_row_max;
 
-  return best_err;
+  return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+          xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+          INT_MAX);
 }
 
 static int do_16x16_motion_search(VP9_COMP *cpi, const int_mv *ref_mv,
@@ -355,7 +351,7 @@
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
       // If any of the blocks in the sequence failed then the MB
       // goes in segment 0
-      if (arf_not_zz[mi_row/2*cm->mb_cols + mi_col/2]) {
+      if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) {
         ncnt[0]++;
         cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
       } else {
@@ -423,7 +419,7 @@
                                golden_ref, cpi->Source);
   }
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   separate_arf_mbs(cpi);
 }

diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 62b33e4..10dee52e 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c

@@ -855,6 +855,184 @@
                             square_num_candidates, square_candidates);
 };
 
+// Number of candidates in first hex search
+#define FIRST_HEX_CANDIDATES 6
+// Index of previous hex search's best match
+#define PRE_BEST_CANDIDATE 6
+// Number of candidates in following hex search
+#define NEXT_HEX_CANDIDATES 3
+// Number of candidates in refining search
+#define REFINE_CANDIDATES 4
+
+int vp9_fast_hex_search(const MACROBLOCK *x,
+                        MV *ref_mv,
+                        int search_param,
+                        int sad_per_bit,
+                        const vp9_variance_fn_ptr_t *vfp,
+                        int use_mvcost,
+                        const MV *center_mv,
+                        MV *best_mv) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  static const MV hex[FIRST_HEX_CANDIDATES] = {
+    { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0}
+  };
+  static const MV next_chkpts[PRE_BEST_CANDIDATE][NEXT_HEX_CANDIDATES] = {
+    {{ -2, 0}, { -1, -2}, {1, -2}},
+    {{ -1, -2}, {1, -2}, {2, 0}},
+    {{1, -2}, {2, 0}, {1, 2}},
+    {{2, 0}, {1, 2}, { -1, 2}},
+    {{1, 2}, { -1, 2}, { -2, 0}},
+    {{ -1, 2}, { -2, 0}, { -1, -2}}
+  };
+  static const MV neighbors[REFINE_CANDIDATES] = {
+      {0, -1}, { -1, 0}, {1, 0}, {0, 1}
+  };
+  int i, j;
+
+  const uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  int br, bc;
+  MV this_mv;
+  unsigned int bestsad = 0x7fffffff;
+  unsigned int thissad;
+  const uint8_t *base_offset;
+  const uint8_t *this_offset;
+  int k = -1;
+  int best_site = -1;
+  const int max_hex_search = 512;
+  const int max_dia_search = 32;
+
+  const int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+
+  // Adjust ref_mv to make sure it is within MV range
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  br = ref_mv->row;
+  bc = ref_mv->col;
+
+  // Check the start point
+  base_offset = xd->plane[0].pre[0].buf;
+  this_offset = base_offset + (br * in_what_stride) + bc;
+  this_mv.row = br;
+  this_mv.col = bc;
+  bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost,
+                             sad_per_bit);
+
+  // Initial 6-point hex search
+  if (check_bounds(x, br, bc, 2)) {
+    for (i = 0; i < FIRST_HEX_CANDIDATES; i++) {
+      this_mv.row = br + hex[i].row;
+      this_mv.col = bc + hex[i].col;
+      this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col;
+      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                         bestsad);
+      CHECK_BETTER
+    }
+  } else {
+    for (i = 0; i < FIRST_HEX_CANDIDATES; i++) {
+      this_mv.row = br + hex[i].row;
+      this_mv.col = bc + hex[i].col;
+      if (!is_mv_in(x, &this_mv))
+        continue;
+      this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col;
+      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                         bestsad);
+      CHECK_BETTER
+    }
+  }
+
+  // Continue hex search if we find a better match in first round
+  if (best_site != -1) {
+    br += hex[best_site].row;
+    bc += hex[best_site].col;
+    k = best_site;
+
+    // Allow search covering maximum MV range
+    for (j = 1; j < max_hex_search; j++) {
+      best_site = -1;
+
+      if (check_bounds(x, br, bc, 2)) {
+        for (i = 0; i < 3; i++) {
+          this_mv.row = br + next_chkpts[k][i].row;
+          this_mv.col = bc + next_chkpts[k][i].col;
+          this_offset = base_offset + (this_mv.row * in_what_stride) +
+              this_mv.col;
+          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                             bestsad);
+          CHECK_BETTER
+        }
+      } else {
+        for (i = 0; i < 3; i++) {
+          this_mv.row = br + next_chkpts[k][i].row;
+          this_mv.col = bc + next_chkpts[k][i].col;
+          if (!is_mv_in(x, &this_mv))
+            continue;
+          this_offset = base_offset + (this_mv.row * in_what_stride) +
+              this_mv.col;
+          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                             bestsad);
+          CHECK_BETTER
+        }
+      }
+
+      if (best_site == -1) {
+        break;
+      } else {
+        br += next_chkpts[k][best_site].row;
+        bc += next_chkpts[k][best_site].col;
+        k += 5 + best_site;
+        if (k >= 12) k -= 12;
+        else if (k >= 6) k -= 6;
+      }
+    }
+  }
+
+  // Check 4 1-away neighbors
+  for (j = 0; j < max_dia_search; j++) {
+    best_site = -1;
+
+    if (check_bounds(x, br, bc, 1)) {
+      for (i = 0; i < REFINE_CANDIDATES; i++) {
+        this_mv.row = br + neighbors[i].row;
+        this_mv.col = bc + neighbors[i].col;
+        this_offset = base_offset + (this_mv.row * in_what_stride) +
+            this_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                           bestsad);
+        CHECK_BETTER
+      }
+    } else {
+      for (i = 0; i < REFINE_CANDIDATES; i++) {
+        this_mv.row = br + neighbors[i].row;
+        this_mv.col = bc + neighbors[i].col;
+        if (!is_mv_in(x, &this_mv))
+          continue;
+        this_offset = base_offset + (this_mv.row * in_what_stride) +
+            this_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
+                           bestsad);
+        CHECK_BETTER
+      }
+    }
+
+    if (best_site == -1) {
+      break;
+    } else {
+      br += neighbors[best_site].row;
+      bc += neighbors[best_site].col;
+    }
+  }
+
+  best_mv->row = br;
+  best_mv->col = bc;
+
+  return bestsad;
+}
+
 #undef CHECK_BETTER
 
 int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
@@ -871,10 +1049,10 @@
 
   MV this_mv;
 
-  int bestsad = INT_MAX;
+  unsigned int bestsad = INT_MAX;
   int ref_row, ref_col;
 
-  int thissad;
+  unsigned int thissad;
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
 
   const int *mvjsadcost = x->nmvjointsadcost;
@@ -1289,62 +1467,57 @@
 
 int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
-                           int sadpb, int further_steps,
-                           int do_refine,
+                           int sadpb, int further_steps, int do_refine,
                            const vp9_variance_fn_ptr_t *fn_ptr,
-                           const MV *ref_mv, int_mv *dst_mv) {
-  int_mv temp_mv;
-  int thissme, n, num00;
-  int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv.as_mv,
-                                        step_param, sadpb, &num00,
+                           const MV *ref_mv, MV *dst_mv) {
+  MV temp_mv;
+  int thissme, n, num00 = 0;
+  int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
+                                        step_param, sadpb, &n,
                                         fn_ptr, x->nmvjointcost,
                                         x->mvcost, ref_mv);
-  dst_mv->as_int = temp_mv.as_int;
+  *dst_mv = temp_mv;
 
-  n = num00;
-  num00 = 0;
-
-  /* If there won't be more n-step search, check to see if refining search is
-   * needed. */
+  // If there won't be more n-step search, check to see if refining search is
+  // needed.
   if (n > further_steps)
     do_refine = 0;
 
   while (n < further_steps) {
-    n++;
+    ++n;
 
     if (num00) {
       num00--;
     } else {
-      thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv.as_mv,
+      thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
                                         step_param + n, sadpb, &num00,
                                         fn_ptr, x->nmvjointcost, x->mvcost,
                                         ref_mv);
 
-      /* check to see if refining search is needed. */
-      if (num00 > (further_steps - n))
+      // check to see if refining search is needed.
+      if (num00 > further_steps - n)
         do_refine = 0;
 
       if (thissme < bestsme) {
         bestsme = thissme;
-        dst_mv->as_int = temp_mv.as_int;
+        *dst_mv = temp_mv;
       }
     }
   }
 
-  /* final 1-away diamond refining search */
-  if (do_refine == 1) {
-    int search_range = 8;
-    int_mv best_mv;
-    best_mv.as_int = dst_mv->as_int;
-    thissme = cpi->refining_search_sad(x, &best_mv.as_mv, sadpb, search_range,
+  // final 1-away diamond refining search
+  if (do_refine) {
+    const int search_range = 8;
+    MV best_mv = *dst_mv;
+    thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range,
                                        fn_ptr, x->nmvjointcost, x->mvcost,
                                        ref_mv);
-
     if (thissme < bestsme) {
       bestsme = thissme;
-      dst_mv->as_int = best_mv.as_int;
+      *dst_mv = best_mv;
     }
   }
+
   return bestsme;
 }
 

diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index e1d6abe..ff4b1df 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h

@@ -46,7 +46,7 @@
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
                            const vp9_variance_fn_ptr_t *fn_ptr,
-                           const MV *ref_mv, int_mv *dst_mv);
+                           const MV *ref_mv, MV *dst_mv);
 
 int vp9_hex_search(const MACROBLOCK *x,
                    MV *ref_mv,
@@ -75,6 +75,14 @@
                       int use_mvcost,
                       const MV *center_mv,
                       MV *best_mv);
+int vp9_fast_hex_search(const MACROBLOCK *x,
+                        MV *ref_mv,
+                        int search_param,
+                        int sad_per_bit,
+                        const vp9_variance_fn_ptr_t *vfp,
+                        int use_mvcost,
+                        const MV *center_mv,
+                        MV *best_mv);
 
 typedef int (fractional_mv_step_fp) (
     const MACROBLOCK *x,

diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index e972355..e11232e 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c

@@ -96,7 +96,7 @@
 void vp9_init_quantizer(VP9_COMP *cpi);
 
 static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
-  {1.0, 1.5, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
 
 static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
   switch (mode) {
@@ -154,20 +154,22 @@
 }
 
 static void dealloc_compressor_data(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
   // Delete sementation map
   vpx_free(cpi->segmentation_map);
-  cpi->segmentation_map = 0;
-  vpx_free(cpi->common.last_frame_seg_map);
-  cpi->common.last_frame_seg_map = 0;
+  cpi->segmentation_map = NULL;
+  vpx_free(cm->last_frame_seg_map);
+  cm->last_frame_seg_map = NULL;
   vpx_free(cpi->coding_context.last_frame_seg_map_copy);
-  cpi->coding_context.last_frame_seg_map_copy = 0;
+  cpi->coding_context.last_frame_seg_map_copy = NULL;
 
   vpx_free(cpi->complexity_map);
   cpi->complexity_map = 0;
   vpx_free(cpi->active_map);
   cpi->active_map = 0;
 
-  vp9_free_frame_buffers(&cpi->common);
+  vp9_free_frame_buffers(cm);
 
   vp9_free_frame_buffer(&cpi->last_frame_uf);
   vp9_free_frame_buffer(&cpi->scaled_source);
@@ -194,19 +196,20 @@
 // to a target value
 // target q value
 int vp9_compute_qdelta(const VP9_COMP *cpi, double qstart, double qtarget) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int start_index = rc->worst_quality;
+  int target_index = rc->worst_quality;
   int i;
-  int start_index = cpi->rc.worst_quality;
-  int target_index = cpi->rc.worst_quality;
 
   // Convert the average q value to an index.
-  for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
+  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
     start_index = i;
     if (vp9_convert_qindex_to_q(i) >= qstart)
       break;
   }
 
   // Convert the q target to an index
-  for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
+  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
     target_index = i;
     if (vp9_convert_qindex_to_q(i) >= qtarget)
       break;
@@ -218,28 +221,23 @@
 // Computes a q delta (in "q index" terms) to get from a starting q value
 // to a value that should equate to thegiven rate ratio.
 
-int vp9_compute_qdelta_by_rate(VP9_COMP *cpi,
-                               double base_q_index, double rate_target_ratio) {
+static int compute_qdelta_by_rate(VP9_COMP *cpi, int base_q_index,
+                                  double rate_target_ratio) {
   int i;
-  int base_bits_per_mb;
-  int target_bits_per_mb;
   int target_index = cpi->rc.worst_quality;
 
-  // Make SURE use of floating point in this function is safe.
-  vp9_clear_system_state();
-
   // Look up the current projected bits per block for the base index
-  base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type,
-                                        base_q_index, 1.0);
+  const int base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type,
+                                            base_q_index, 1.0);
 
   // Find the target bits per mb based on the base value and given ratio.
-  target_bits_per_mb = rate_target_ratio * base_bits_per_mb;
+  const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
 
   // Convert the q target to an index
-  for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
+  for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; ++i) {
     target_index = i;
-    if (vp9_rc_bits_per_mb(cpi->common.frame_type,
-                           i, 1.0) <= target_bits_per_mb )
+    if (vp9_rc_bits_per_mb(cpi->common.frame_type, i, 1.0) <=
+            target_bits_per_mb )
       break;
   }
 
@@ -249,11 +247,8 @@
 // This function sets up a set of segments with delta Q values around
 // the baseline frame quantizer.
 static void setup_in_frame_q_adj(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  struct segmentation *seg = &cm->seg;
-  // double q_ratio;
-  int segment;
-  int qindex_delta;
+  VP9_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
 
   // Make SURE use of floating point in this function is safe.
   vp9_clear_system_state();
@@ -261,13 +256,14 @@
   if (cm->frame_type == KEY_FRAME ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    int segment;
+
     // Clear down the segment map
     vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
 
     // Clear down the complexity map used for rd
     vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
 
-    // Enable segmentation
     vp9_enable_segmentation((VP9_PTR)cpi);
     vp9_clearall_segfeatures(seg);
 
@@ -278,9 +274,8 @@
     vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
 
     // Use some of the segments for in frame Q adjustment
-    for (segment = 1; segment < 3; segment++) {
-      qindex_delta =
-        vp9_compute_qdelta_by_rate(cpi, cm->base_qindex,
+    for (segment = 1; segment < 2; segment++) {
+      const int qindex_delta = compute_qdelta_by_rate(cpi, cm->base_qindex,
                                    in_frame_q_adj_ratio[segment]);
       vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
       vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
@@ -288,8 +283,8 @@
   }
 }
 static void configure_static_seg_features(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  struct segmentation *seg = &cm->seg;
+  VP9_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
 
   int high_q = (int)(cpi->rc.avg_q > 48.0);
   int qi_delta;
@@ -433,13 +428,13 @@
 
 static void update_reference_segmentation_map(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
+  uint8_t *cache_ptr = cm->last_frame_seg_map;
   int row, col;
-  MODE_INFO **mi_8x8, **mi_8x8_ptr = cm->mi_grid_visible;
-  uint8_t *cache_ptr = cm->last_frame_seg_map, *cache;
 
   for (row = 0; row < cm->mi_rows; row++) {
-    mi_8x8 = mi_8x8_ptr;
-    cache = cache_ptr;
+    MODE_INFO **mi_8x8 = mi_8x8_ptr;
+    uint8_t *cache = cache_ptr;
     for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
       cache[0] = mi_8x8[0]->mbmi.segment_id;
     mi_8x8_ptr += cm->mode_info_stride;
@@ -496,6 +491,18 @@
   sf->thresh_mult[THR_D207_PRED] += 2500;
   sf->thresh_mult[THR_D63_PRED] += 2500;
 
+  // disable using golden frame modes if golden frames are not being used
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX) {
+    sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARG    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWG     ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
+  }
+
   /* disable frame modes if flags not set */
   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
     sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
@@ -850,6 +857,8 @@
     }
     sf->frame_parameter_update = 0;
     sf->encode_breakout_thresh = 1000;
+
+    sf->search_method = FAST_HEX;
   }
   if (speed >= 6) {
     sf->always_this_block_size = BLOCK_16X16;
@@ -868,8 +877,10 @@
   if (speed < 0)
     speed = -speed;
 
+#if CONFIG_INTERNAL_STATS
   for (i = 0; i < MAX_MODES; ++i)
     cpi->mode_chosen_counts[i] = 0;
+#endif
 
   // best quality defaults
   sf->frame_parameter_update = 1;
@@ -965,16 +976,17 @@
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
+  const VP9_CONFIG *oxcf = &cpi->oxcf;
 
-  cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height,
+  cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
                                       cm->subsampling_x, cm->subsampling_y,
-                                      cpi->oxcf.lag_in_frames);
+                                      oxcf->lag_in_frames);
   if (!cpi->lookahead)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
 
   if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
-                               cpi->oxcf.width, cpi->oxcf.height,
+                               oxcf->width, oxcf->height,
                                cm->subsampling_x, cm->subsampling_y,
                                VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -1101,7 +1113,7 @@
 
 void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
   VP9_COMMON *const cm = &cpi->common;
-  int64_t vbr_max_bits;
+  int vbr_max_bits;
 
   if (framerate < 0.1)
     framerate = 30;
@@ -1125,10 +1137,10 @@
   // be acheived because of a user specificed max q (e.g. when the user
   // specifies lossless encode.
   //
-  vbr_max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth *
-                  (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+  vbr_max_bits = (int)(((int64_t)cpi->rc.av_per_frame_bandwidth *
+      cpi->oxcf.two_pass_vbrmax_section) / 100);
   cpi->rc.max_frame_bandwidth =
-    MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+      MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
 
   // Set Maximum gf/arf interval
   cpi->rc.max_gf_interval = 16;
@@ -1149,7 +1161,7 @@
     cpi->rc.max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
 }
 
-static int64_t rescale(int val, int64_t num, int denom) {
+static int64_t rescale(int64_t val, int64_t num, int denom) {
   int64_t llnum = num;
   int64_t llden = denom;
   int64_t llval = val;
@@ -1202,9 +1214,12 @@
     lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] * 1000;
     bitrate_alloc = (float)lc->target_bandwidth / (float)target_bandwidth;
     // Update buffer-related quantities.
-    lc->starting_buffer_level = oxcf->starting_buffer_level * bitrate_alloc;
-    lc->optimal_buffer_level = oxcf->optimal_buffer_level * bitrate_alloc;
-    lc->maximum_buffer_size = oxcf->maximum_buffer_size * bitrate_alloc;
+    lc->starting_buffer_level =
+        (int64_t)(oxcf->starting_buffer_level * bitrate_alloc);
+    lc->optimal_buffer_level =
+        (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc);
+    lc->maximum_buffer_size =
+        (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc);
     lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
     lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size);
     // Update framerate-related quantities.
@@ -1236,8 +1251,8 @@
     int prev_layer_target_bandwidth =
         oxcf->ts_target_bitrate[temporal_layer - 1] * 1000;
     lc->avg_frame_size =
-        (int)(lc->target_bandwidth - prev_layer_target_bandwidth) /
-        (lc->framerate - prev_layer_framerate);
+        (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
+              (lc->framerate - prev_layer_framerate));
   }
 }
 
@@ -1265,7 +1280,7 @@
   int temporal_layer = cpi->svc.temporal_layer_id;
   LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer];
   lc->rc = cpi->rc;
-  lc->target_bandwidth = cpi->oxcf.target_bandwidth;
+  lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth;
   lc->starting_buffer_level = cpi->oxcf.starting_buffer_level;
   lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level;
   lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size;
@@ -1366,6 +1381,9 @@
 
   cpi->oxcf = *oxcf;
 
+  if (cpi->oxcf.cpu_used == -6)
+    cpi->oxcf.play_alternate = 0;
+
   switch (cpi->oxcf.mode) {
       // Real time and one pass deprecated in test code base
     case MODE_GOODQUALITY:
@@ -1482,10 +1500,10 @@
 
   if (cpi->svc.number_temporal_layers > 1 &&
       cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
-    update_layer_context_change_config(cpi, cpi->oxcf.target_bandwidth);
+    update_layer_context_change_config(cpi, (int)cpi->oxcf.target_bandwidth);
   }
 
-  cpi->speed = cpi->oxcf.cpu_used;
+  cpi->speed = abs(cpi->oxcf.cpu_used);
 
   if (cpi->oxcf.lag_in_frames == 0) {
     // Force allow_lag to 0 if lag_in_frames is 0.
@@ -1559,6 +1577,7 @@
   int num_pix = num_4x4_blk << 4;
   int i, k;
   ctx->num_4x4_blk = num_4x4_blk;
+
   CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
                   vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
   for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -1602,7 +1621,6 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x  = &cpi->mb;
 
-
   for (i = 0; i < BLOCK_SIZES; ++i) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
     const int num_4x4_h = num_4x4_blocks_high_lookup[i];
@@ -2020,10 +2038,12 @@
                   / time_encoded;
 
       if (cpi->b_calculate_psnr) {
-        const double total_psnr = vp9_mse2psnr(cpi->total_samples, 255.0,
-                                               cpi->total_sq_error);
-        const double totalp_psnr = vp9_mse2psnr(cpi->totalp_samples, 255.0,
-                                                cpi->totalp_sq_error);
+        const double total_psnr =
+            vp9_mse2psnr((double)cpi->total_samples, 255.0,
+                         (double)cpi->total_sq_error);
+        const double totalp_psnr =
+            vp9_mse2psnr((double)cpi->totalp_samples, 255.0,
+                         (double)cpi->totalp_sq_error);
         const double total_ssim = 100 * pow(cpi->summed_quality /
                                                 cpi->summed_weights, 8.0);
         const double totalp_ssim = 100 * pow(cpi->summedp_quality /
@@ -2199,12 +2219,12 @@
     const int w = widths[i];
     const int h = heights[i];
     const uint32_t samples = w * h;
-    const double sse = calc_plane_error(a_planes[i], a_strides[i],
-                                        b_planes[i], b_strides[i],
-                                        w, h);
+    const uint64_t sse = calc_plane_error(a_planes[i], a_strides[i],
+                                          b_planes[i], b_strides[i],
+                                          w, h);
     psnr->sse[1 + i] = sse;
     psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, sse);
+    psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, (double) sse);
 
     total_sse += sse;
     total_samples += samples;
@@ -2212,7 +2232,7 @@
 
   psnr->sse[0] = total_sse;
   psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vp9_mse2psnr(total_samples, 255.0, total_sse);
+  psnr->psnr[0] = vp9_mse2psnr((double)total_samples, 255.0, (double)total_sse);
 }
 
 static void generate_psnr_packet(VP9_COMP *cpi) {
@@ -2725,7 +2745,7 @@
   FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
   int recon_err;
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
 
@@ -2769,8 +2789,6 @@
 
     for (i = 0; i < MAX_MODES; ++i)
       fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-    for (i = 0; i < MAX_REFS; ++i)
-      fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]);
 
     fprintf(fmodes, "\n");
 
@@ -2784,7 +2802,7 @@
                                        uint8_t *dest,
                                        int q) {
   VP9_COMMON *const cm = &cpi->common;
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
   vp9_set_quantizer(cpi, q);
 
   // Set up entropy context depending on frame type. The decoder mandates
@@ -2795,7 +2813,7 @@
   if (cm->frame_type == KEY_FRAME) {
     vp9_setup_key_frame(cpi);
   } else {
-    if (!cm->intra_only && !cm->error_resilient_mode) {
+    if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) {
       cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
     }
     vp9_setup_inter_frame(cpi);
@@ -2813,7 +2831,7 @@
   // Update the skip mb flag probabilities based on the distribution
   // seen in the last encoder iteration.
   // update_base_skip_probs(cpi);
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 }
 
 static void encode_with_recode_loop(VP9_COMP *cpi,
@@ -2823,6 +2841,7 @@
                                     int bottom_index,
                                     int top_index) {
   VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
   int loop_count = 0;
   int loop = 0;
   int overshoot_seen = 0;
@@ -2832,12 +2851,12 @@
   int frame_under_shoot_limit;
 
   // Decide frame size bounds
-  vp9_rc_compute_frame_size_bounds(cpi, cpi->rc.this_frame_target,
+  vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
                                    &frame_under_shoot_limit,
                                    &frame_over_shoot_limit);
 
   do {
-    vp9_clear_system_state();  // __asm emms;
+    vp9_clear_system_state();
 
     vp9_set_quantizer(cpi, q);
 
@@ -2850,7 +2869,7 @@
       if (cm->frame_type == KEY_FRAME) {
         vp9_setup_key_frame(cpi);
       } else {
-        if (!cm->intra_only && !cm->error_resilient_mode) {
+        if (!cm->intra_only && !cm->error_resilient_mode && !cpi->use_svc) {
           cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
         }
         vp9_setup_inter_frame(cpi);
@@ -2872,7 +2891,7 @@
     // seen in the last encoder iteration.
     // update_base_skip_probs(cpi);
 
-    vp9_clear_system_state();  // __asm emms;
+    vp9_clear_system_state();
 
     // Dummy pack of the bitstream using up to date stats to get an
     // accurate estimate of output frame size to determine if we need
@@ -2883,7 +2902,7 @@
       if (!cpi->sf.use_pick_mode)
         vp9_pack_bitstream(cpi, dest, size);
 
-      cpi->rc.projected_frame_size = (*size) << 3;
+      rc->projected_frame_size = (int)(*size) << 3;
       vp9_restore_coding_context(cpi);
 
       if (frame_over_shoot_limit == 0)
@@ -2894,8 +2913,8 @@
       loop = 0;
     } else {
       if ((cm->frame_type == KEY_FRAME) &&
-           cpi->rc.this_key_frame_forced &&
-           (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth)) {
+           rc->this_key_frame_forced &&
+           (rc->projected_frame_size < rc->max_frame_bandwidth)) {
         int last_q = q;
         int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
 
@@ -2908,9 +2927,9 @@
         // The key frame is not good enough or we can afford
         // to make it better without undue risk of popping.
         if ((kf_err > high_err_target &&
-             cpi->rc.projected_frame_size <= frame_over_shoot_limit) ||
+             rc->projected_frame_size <= frame_over_shoot_limit) ||
             (kf_err > low_err_target &&
-             cpi->rc.projected_frame_size <= frame_under_shoot_limit)) {
+             rc->projected_frame_size <= frame_under_shoot_limit)) {
           // Lower q_high
           q_high = q > q_low ? q - 1 : q_low;
 
@@ -2918,7 +2937,7 @@
           q = (q * high_err_target) / kf_err;
           q = MIN(q, (q_high + q_low) >> 1);
         } else if (kf_err < low_err_target &&
-                   cpi->rc.projected_frame_size >= frame_under_shoot_limit) {
+                   rc->projected_frame_size >= frame_under_shoot_limit) {
           // The key frame is much better than the previous frame
           // Raise q_low
           q_low = q < q_high ? q + 1 : q_high;
@@ -2944,10 +2963,10 @@
         // Update correction factor & compute new Q to try...
 
         // Frame is too large
-        if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) {
+        if (rc->projected_frame_size > rc->this_frame_target) {
           // Special case if the projected size is > the max allowed.
-          if (cpi->rc.projected_frame_size >= cpi->rc.max_frame_bandwidth)
-            q_high = cpi->rc.worst_quality;
+          if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+            q_high = rc->worst_quality;
 
           // Raise Qlow as to at least the current value
           q_low = q < q_high ? q + 1 : q_high;
@@ -2961,12 +2980,12 @@
             // Update rate_correction_factor unless
             vp9_rc_update_rate_correction_factors(cpi, 0);
 
-            q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+            q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                    bottom_index, MAX(q_high, top_index));
 
             while (q < q_low && retries < 10) {
               vp9_rc_update_rate_correction_factors(cpi, 0);
-              q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+              q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                      bottom_index, MAX(q_high, top_index));
               retries++;
             }
@@ -2982,7 +3001,7 @@
             q = (q_high + q_low) / 2;
           } else {
             vp9_rc_update_rate_correction_factors(cpi, 0);
-            q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+            q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                    bottom_index, top_index);
             // Special case reset for qlow for constrained quality.
             // This should only trigger where there is very substantial
@@ -2995,7 +3014,7 @@
 
             while (q > q_high && retries < 10) {
               vp9_rc_update_rate_correction_factors(cpi, 0);
-              q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+              q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                      bottom_index, top_index);
               retries++;
             }
@@ -3014,8 +3033,8 @@
     }
 
     // Special case for overlay frame.
-    if (cpi->rc.is_src_frame_alt_ref &&
-        (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth))
+    if (rc->is_src_frame_alt_ref &&
+        rc->projected_frame_size < rc->max_frame_bandwidth)
       loop = 0;
 
     if (loop) {
@@ -3083,8 +3102,8 @@
   int top_index;
   int bottom_index;
 
-  SPEED_FEATURES *const sf = &cpi->sf;
-  unsigned int max_mv_def = MIN(cm->width, cm->height);
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const unsigned int max_mv_def = MIN(cm->width, cm->height);
   struct segmentation *const seg = &cm->seg;
 
   set_ext_overrides(cpi);
@@ -3151,7 +3170,11 @@
     cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
     cm->frame_parallel_decoding_mode =
       (cpi->oxcf.frame_parallel_decoding_mode != 0);
+
+    // By default, encoder assumes decoder can use prev_mi.
+    cm->coding_use_prev_mi = 1;
     if (cm->error_resilient_mode) {
+      cm->coding_use_prev_mi = 0;
       cm->frame_parallel_decoding_mode = 1;
       cm->reset_frame_context = 0;
       cm->refresh_frame_context = 0;
@@ -3219,7 +3242,7 @@
   if (!frame_is_intra_only(cm)) {
     cm->interp_filter = DEFAULT_INTERP_FILTER;
     /* TODO: Decide this more intelligently */
-    set_high_precision_mv(cpi, (q < HIGH_PRECISION_MV_QTHRESH));
+    set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
   }
 
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
@@ -3412,6 +3435,7 @@
 static void check_initial_width(VP9_COMP *cpi, int subsampling_x,
                                 int subsampling_y) {
   VP9_COMMON *const cm = &cpi->common;
+
   if (!cpi->initial_width) {
     cm->subsampling_x = subsampling_x;
     cm->subsampling_y = subsampling_y;
@@ -3425,12 +3449,12 @@
 int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
-  VP9_COMP              *cpi = (VP9_COMP *) ptr;
-  VP9_COMMON             *cm = &cpi->common;
-  struct vpx_usec_timer  timer;
-  int                    res = 0;
-  const int    subsampling_x = sd->uv_width  < sd->y_width;
-  const int    subsampling_y = sd->uv_height < sd->y_height;
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
+  VP9_COMMON *cm = &cpi->common;
+  struct vpx_usec_timer timer;
+  int res = 0;
+  const int subsampling_x = sd->uv_width  < sd->y_width;
+  const int subsampling_y = sd->uv_height < sd->y_height;
 
   check_initial_width(cpi, subsampling_x, subsampling_y);
   vpx_usec_timer_start(&timer);
@@ -3662,7 +3686,7 @@
   *size = 0;
 
   // Clear down mmx registers
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   /* find a free buffer for the new frame, releasing the reference previously
    * held.
@@ -3707,8 +3731,9 @@
   xd->interp_kernel = vp9_get_interp_kernel(
       DEFAULT_INTERP_FILTER == SWITCHABLE ? EIGHTTAP : DEFAULT_INTERP_FILTER);
 
-  if (cpi->oxcf.aq_mode == VARIANCE_AQ)
+  if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
     vp9_vaq_init();
+  }
 
   if (cpi->use_svc) {
     SvcEncode(cpi, size, dest, frame_flags);
@@ -3748,7 +3773,7 @@
 #if CONFIG_INTERNAL_STATS
 
   if (cpi->pass != 1) {
-    cpi->bytes += *size;
+    cpi->bytes += (int)(*size);
 
     if (cm->show_frame) {
       cpi->count++;
@@ -3823,22 +3848,23 @@
 
 int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
                               vp9_ppflags_t *flags) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
+  VP9_COMP *cpi = (VP9_COMP *)comp;
+  VP9_COMMON *cm = &cpi->common;
 
-  if (!cpi->common.show_frame) {
+  if (!cm->show_frame) {
     return -1;
   } else {
     int ret;
 #if CONFIG_VP9_POSTPROC
-    ret = vp9_post_proc_frame(&cpi->common, dest, flags);
+    ret = vp9_post_proc_frame(cm, dest, flags);
 #else
 
-    if (cpi->common.frame_to_show) {
-      *dest = *cpi->common.frame_to_show;
-      dest->y_width = cpi->common.width;
-      dest->y_height = cpi->common.height;
-      dest->uv_width = cpi->common.width >> cpi->common.subsampling_x;
-      dest->uv_height = cpi->common.height >> cpi->common.subsampling_y;
+    if (cm->frame_to_show) {
+      *dest = *cm->frame_to_show;
+      dest->y_width = cm->width;
+      dest->y_height = cm->height;
+      dest->uv_width = cm->width >> cm->subsampling_x;
+      dest->uv_height = cm->height >> cm->subsampling_y;
       ret = 0;
     } else {
       ret = -1;

diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 1ab1814..7bcceed 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h

@@ -136,7 +136,8 @@
   NSTEP = 1,
   HEX = 2,
   BIGDIA = 3,
-  SQUARE = 4
+  SQUARE = 4,
+  FAST_HEX = 5
 } SEARCH_METHODS;
 
 typedef enum {
@@ -445,7 +446,7 @@
   YV12_BUFFER_CONFIG *un_scaled_source;
   YV12_BUFFER_CONFIG scaled_source;
 
-  unsigned int key_frame_frequency;
+  int key_frame_frequency;
 
   int gold_is_last;  // gold same as last frame ( short circuit gold searches)
   int alt_is_last;  // Alt same as last ( short circuit altref search)
@@ -486,12 +487,6 @@
   // Ambient reconstruction err target for force key frames
   int ambient_err;
 
-  unsigned int mode_chosen_counts[MAX_MODES];
-  unsigned int sub8x8_mode_chosen_counts[MAX_REFS];
-  int64_t mode_skip_mask;
-  int ref_frame_mask;
-  int set_ref_frame_mask;
-
   int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
   int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
   int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS];
@@ -589,6 +584,8 @@
   int fixed_divide[512];
 
 #if CONFIG_INTERNAL_STATS
+  unsigned int mode_chosen_counts[MAX_MODES];
+
   int    count;
   double total_y;
   double total_u;

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 3b4a7f6..0d0e59e 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -98,11 +98,8 @@
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                   sadpb, further_steps, 1,
-                                   &cpi->fn_ptr[bsize],
-                                   &ref_mv.as_mv, tmp_mv);
-
+  vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1,
+                         &cpi->fn_ptr[bsize], &ref_mv.as_mv, &tmp_mv->as_mv);
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
   x->mv_row_min = tmp_row_min;
@@ -130,11 +127,52 @@
   // calculate the bit cost on motion vector
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-
-
   return bestsme;
 }
 
+static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    const TileInfo *const tile,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    int_mv *tmp_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+  int ref = mbmi->ref_frame[0];
+  int_mv ref_mv = mbmi->ref_mvs[ref][0];
+  int dis;
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
+                                                                        ref);
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+
+    setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  tmp_mv->as_mv.col >>= 3;
+  tmp_mv->as_mv.row >>= 3;
+
+  cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+                               cpi->common.allow_high_precision_mv,
+                               x->errorperbit,
+                               &cpi->fn_ptr[bsize],
+                               cpi->sf.subpel_force_stop,
+                               cpi->sf.subpel_iters_per_step,
+                               x->nmvjointcost, x->mvcost,
+                               &dis, &x->pred_sse[ref]);
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+}
+
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
 int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -154,7 +192,7 @@
                                     VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
   int64_t this_rd;
-  int64_t cost[4]= { 0, 100, 150,  205 };
+  int64_t cost[4]= { 0, 50, 75, 100 };
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -164,6 +202,7 @@
 
   // initialize mode decisions
   *returnrate = INT_MAX;
+  *returndistortion = INT64_MAX;
   vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
   mbmi->sb_type = bsize;
   mbmi->ref_frame[0] = NONE;
@@ -172,7 +211,7 @@
                       tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   mbmi->interp_filter = cpi->common.interp_filter == SWITCHABLE ?
                         EIGHTTAP : cpi->common.interp_filter;
-  mbmi->skip_coeff = 0;
+  mbmi->skip = 0;
   mbmi->segment_id = 0;
 
   for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
@@ -203,9 +242,6 @@
       int64_t dist;
 
       if (this_mode == NEWMV) {
-        if (this_rd < 300)
-          continue;
-
         x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
             full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
                                      &frame_mv[NEWMV][ref_frame], &rate_mv);
@@ -228,7 +264,14 @@
     }
   }
 
-  // TODO(jingning) sub-pixel motion search, if NEWMV is chosen
+  // Perform sub-pixel motion search, if NEWMV is chosen
+  if (mbmi->mode == NEWMV) {
+    ref_frame = mbmi->ref_frame[0];
+    sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+                            &frame_mv[NEWMV][ref_frame]);
+    mbmi->mv[0].as_int = frame_mv[NEWMV][ref_frame].as_int;
+    xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+  }
 
   // TODO(jingning) intra prediction search, if the best SAD is above a certain
   // threshold.

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 862573f..372c362 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -26,7 +26,7 @@
                       const int16_t *dequant_ptr,
                       int zbin_oq_value, uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan) {
-  int i, non_zero_count = count, eob = -1;
+  int i, non_zero_count = (int)count, eob = -1;
   const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
                          zbin_ptr[1] + zbin_oq_value };
   const int nzbins[2] = { zbins[0] * -1,
@@ -37,7 +37,7 @@
 
   if (!skip_block) {
     // Pre-scan pass
-    for (i = count - 1; i >= 0; i--) {
+    for (i = (int)count - 1; i >= 0; i--) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
 

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 2427dbe..f78ebfe 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c

@@ -354,7 +354,7 @@
   int projected_size_based_on_q = 0;
 
   // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
@@ -499,12 +499,10 @@
   // (at buffer = critical level).
   const VP9_CONFIG *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
-  // int active_worst_quality = rc->active_worst_quality;
-  // Maximum limit for down adjustment, ~20%.
   // Buffer level below which we push active_worst to worst_quality.
-  int critical_level = oxcf->optimal_buffer_level >> 2;
+  int64_t critical_level = oxcf->optimal_buffer_level >> 2;
+  int64_t buff_lvl_step = 0;
   int adjustment = 0;
-  int buff_lvl_step = 0;
   int active_worst_quality;
   if (cpi->common.frame_type == KEY_FRAME)
     return rc->worst_quality;
@@ -516,10 +514,11 @@
                                rc->avg_frame_qindex[KEY_FRAME] * 3 / 2);
   if (rc->buffer_level > oxcf->optimal_buffer_level) {
     // Adjust down.
+    // Maximum limit for down adjustment, ~30%.
     int max_adjustment_down = active_worst_quality / 3;
     if (max_adjustment_down) {
-      buff_lvl_step = (int)((oxcf->maximum_buffer_size -
-          oxcf->optimal_buffer_level) / max_adjustment_down);
+      buff_lvl_step = ((oxcf->maximum_buffer_size -
+                        oxcf->optimal_buffer_level) / max_adjustment_down);
       if (buff_lvl_step)
         adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
                             buff_lvl_step);
@@ -530,9 +529,10 @@
     if (critical_level) {
       buff_lvl_step = (oxcf->optimal_buffer_level - critical_level);
       if (buff_lvl_step) {
-        adjustment = (rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
-                         (oxcf->optimal_buffer_level - rc->buffer_level) /
-                             buff_lvl_step;
+        adjustment =
+            (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
+                  (oxcf->optimal_buffer_level - rc->buffer_level) /
+                  buff_lvl_step);
       }
       active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
     }
@@ -1151,7 +1151,7 @@
 
   cm->last_frame_type = cm->frame_type;
   // Update rate control heuristics
-  rc->projected_frame_size = (bytes_used << 3);
+  rc->projected_frame_size = (int)(bytes_used << 3);
 
   // Post encode loop adjustment of Q prediction.
   vp9_rc_update_rate_correction_factors(
@@ -1310,11 +1310,12 @@
   const VP9_CONFIG *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
   const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level;
-  const int one_pct_bits = 1 + oxcf->optimal_buffer_level / 100;
+  const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100;
   int min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
                              FRAME_OVERHEAD_BITS);
   int target = rc->av_per_frame_bandwidth;
-  if (cpi->use_svc && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+  if (cpi->svc.number_temporal_layers > 1 &&
+      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
     // Note that for layers, av_per_frame_bandwidth is the cumulative
     // per-frame-bandwidth. For the target size of this frame, use the
     // layer average frame size (i.e., non-cumulative per-frame-bw).
@@ -1325,11 +1326,11 @@
   }
   if (diff > 0) {
     // Lower the target bandwidth for this frame.
-    const int pct_low = MIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    const int pct_low = (int)MIN(diff / one_pct_bits, oxcf->under_shoot_pct);
     target -= (target * pct_low) / 200;
   } else if (diff < 0) {
     // Increase the target bandwidth for this frame.
-    const int pct_high = MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
     target += (target * pct_high) / 200;
   }
   return MAX(min_frame_target, target);
@@ -1337,9 +1338,11 @@
 
 static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   const RATE_CONTROL *rc = &cpi->rc;
+  int target;
 
   if (cpi->common.current_video_frame == 0) {
-    return cpi->oxcf.starting_buffer_level / 2;
+    target = ((cpi->oxcf.starting_buffer_level / 2) > INT_MAX)
+      ? INT_MAX : (int)(cpi->oxcf.starting_buffer_level / 2);
   } else {
     const int initial_boost = 32;
     int kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
@@ -1347,8 +1350,9 @@
       kf_boost = (int)(kf_boost * rc->frames_since_key /
                        (cpi->output_framerate / 2));
     }
-    return ((16 + kf_boost) * rc->av_per_frame_bandwidth) >> 4;
+    target = ((16 + kf_boost) * rc->av_per_frame_bandwidth) >> 4;
   }
+  return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
 void vp9_rc_get_svc_params(VP9_COMP *cpi) {

diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 551b6c3..5dbc7d1 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h

@@ -34,17 +34,17 @@
   double key_frame_rate_correction_factor;
   double gf_rate_correction_factor;
 
-  unsigned int frames_since_golden;
-  unsigned int frames_till_gf_update_due;  // Count down till next GF
-  unsigned int max_gf_interval;
-  unsigned int baseline_gf_interval;
-  unsigned int frames_to_key;
-  unsigned int frames_since_key;
-  unsigned int this_key_frame_forced;
-  unsigned int next_key_frame_forced;
-  unsigned int source_alt_ref_pending;
-  unsigned int source_alt_ref_active;
-  unsigned int is_src_frame_alt_ref;
+  int frames_since_golden;
+  int frames_till_gf_update_due;
+  int max_gf_interval;
+  int baseline_gf_interval;
+  int frames_to_key;
+  int frames_since_key;
+  int this_key_frame_forced;
+  int next_key_frame_forced;
+  int source_alt_ref_pending;
+  int source_alt_ref_active;
+  int is_src_frame_alt_ref;
 
   int av_per_frame_bandwidth;     // Average frame size target for clip
   int min_frame_bandwidth;        // Minimum allocation used for any frame
@@ -57,8 +57,8 @@
   double tot_q;
   double avg_q;
 
-  int buffer_level;
-  int bits_off_target;
+  int64_t buffer_level;
+  int64_t bits_off_target;
 
   int decimation_factor;
   int decimation_count;

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 03def20..70f9fab 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -274,7 +274,7 @@
   MACROBLOCK *x = &cpi->mb;
   int qindex, i;
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   // Further tests required to see if optimum is different
   // for key frames, golden frames and arf frames.
@@ -415,9 +415,10 @@
     *dist = 0;
   } else {
     int d_q10, r_q10;
-    uint64_t xsq_q10_64 =
+    const uint64_t xsq_q10_64 =
         ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
-    int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ? MAX_XSQ_Q10 : xsq_q10_64;
+    const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ?
+                        MAX_XSQ_Q10 : (int)xsq_q10_64;
     model_rd_norm(xsq_q10, &r_q10, &d_q10);
     *rate = (n * r_q10 + 2) >> 2;
     *dist = (var * (int64_t)d_q10 + 512) >> 10;
@@ -430,7 +431,9 @@
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
-  int i, rate_sum = 0, dist_sum = 0;
+  int i;
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
   int ref = xd->mi_8x8[0]->mbmi.ref_frame[0];
   unsigned int sse;
 
@@ -444,20 +447,33 @@
 
     if (i == 0)
       x->pred_sse[ref] = sse;
-    if (cpi->sf.use_pick_mode) {
-      dist_sum += (int)sse;
+
+    // Fast approximate the modelling function.
+    if (cpi->speed > 4) {
+      int64_t rate;
+      int64_t dist;
+      int64_t square_error = sse;
+      int quantizer = (pd->dequant[1] >> 3);
+
+      if (quantizer < 120)
+        rate = (square_error * (280 - quantizer)) >> 8;
+      else
+        rate = 0;
+      dist = (square_error * quantizer) >> 8;
+      rate_sum += rate;
+      dist_sum += dist;
     } else {
       int rate;
       int64_t dist;
       model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
                                pd->dequant[1] >> 3, &rate, &dist);
       rate_sum += rate;
-      dist_sum += (int)dist;
+      dist_sum += dist;
     }
   }
 
-  *out_rate_sum = rate_sum;
-  *out_dist_sum = (int64_t)dist_sum << 4;
+  *out_rate_sum = (int)rate_sum;
+  *out_dist_sum = dist_sum << 4;
 }
 
 static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
@@ -650,8 +666,7 @@
     return;
 
   if (!is_inter_block(mbmi))
-    vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size,
-                           &mbmi->skip_coeff);
+    vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
   else
     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
@@ -845,6 +860,11 @@
   }
 }
 
+static int64_t scaled_rd_cost(int rdmult, int rddiv,
+                              int rate, int64_t dist, double scale) {
+  return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale);
+}
+
 static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
                                           int (*r)[2], int *rate,
                                           int64_t *d, int64_t *distortion,
@@ -882,10 +902,13 @@
         r[n][1] += vp9_cost_one(tx_probs[m]);
     }
     if (s[n]) {
-      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]) * scale;
+      rd[n][0] = rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, s1, d[n],
+                                           scale);
     } else {
-      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]) * scale;
-      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]) * scale;
+      rd[n][0] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][0] + s0, d[n],
+                                scale);
+      rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][1] + s0, d[n],
+                                scale);
     }
     if (rd[n][1] < best_rd) {
       best_rd = rd[n][1];
@@ -1820,7 +1843,7 @@
             bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                              sadpb, further_steps, 0, v_fn_ptr,
                                              &bsi->ref_mv->as_mv,
-                                             new_mv);
+                                             &new_mv->as_mv);
           }
 
           // Should we do a full search (best quality only)
@@ -2425,7 +2448,11 @@
   // Further step/diamond searches as necessary
   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
-  if (cpi->sf.search_method == HEX) {
+  if (cpi->sf.search_method == FAST_HEX) {
+    bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb,
+                                  &cpi->fn_ptr[bsize], 1,
+                                  &ref_mv.as_mv, &tmp_mv->as_mv);
+  } else if (cpi->sf.search_method == HEX) {
     bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
                              &cpi->fn_ptr[bsize], 1,
                              &ref_mv.as_mv, &tmp_mv->as_mv);
@@ -2441,7 +2468,7 @@
     bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                      sadpb, further_steps, 1,
                                      &cpi->fn_ptr[bsize],
-                                     &ref_mv.as_mv, tmp_mv);
+                                     &ref_mv.as_mv, &tmp_mv->as_mv);
   }
 
   x->mv_col_min = tmp_col_min;
@@ -2687,6 +2714,8 @@
       int_mv tmp_mv;
       single_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
                            &tmp_mv, &rate_mv);
+      if (tmp_mv.as_int == INVALID_MV)
+        return INT64_MAX;
       *rate2 += rate_mv;
       frame_mv[refs[0]].as_int =
           xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
@@ -2906,8 +2935,8 @@
       const unsigned int max_thresh = (cpi->allow_encode_breakout ==
                                       ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
       // The encode_breakout input
-      const unsigned int min_thresh = ((x->encode_breakout << 4) > max_thresh) ?
-                                      max_thresh : (x->encode_breakout << 4);
+      const unsigned int min_thresh =
+          MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
 
       // Calculate threshold according to dequant value.
       thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
@@ -3137,6 +3166,8 @@
   const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
   int best_skip2 = 0;
+  int ref_frame_mask = 0;
+  int mode_skip_mask = 0;
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -3170,13 +3201,12 @@
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  cpi->ref_frame_mask = 0;
   for (ref_frame = LAST_FRAME;
        ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
     int i;
     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
       if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
-        cpi->ref_frame_mask |= (1 << ref_frame);
+        ref_frame_mask |= (1 << ref_frame);
         break;
       }
     }
@@ -3210,28 +3240,28 @@
       if (mode_index == (cpi->sf.mode_skip_start + 1)) {
         switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
           case INTRA_FRAME:
-            cpi->mode_skip_mask = 0;
+            mode_skip_mask = 0;
             break;
           case LAST_FRAME:
-            cpi->mode_skip_mask = LAST_FRAME_MODE_MASK;
+            mode_skip_mask = LAST_FRAME_MODE_MASK;
             break;
           case GOLDEN_FRAME:
-            cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK;
+            mode_skip_mask = GOLDEN_FRAME_MODE_MASK;
             break;
           case ALTREF_FRAME:
-            cpi->mode_skip_mask = ALT_REF_MODE_MASK;
+            mode_skip_mask = ALT_REF_MODE_MASK;
             break;
           case NONE:
           case MAX_REF_FRAMES:
             assert(0 && "Invalid Reference frame");
         }
       }
-      if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
+      if (mode_skip_mask & (1 << mode_index))
         continue;
     }
 
     // Skip if the current reference frame has been masked off
-    if (cpi->ref_frame_mask & (1 << ref_frame) && this_mode != NEWMV)
+    if (ref_frame_mask & (1 << ref_frame) && this_mode != NEWMV)
       continue;
 
     // Test best rd so far against threshold for trying this mode.
@@ -3756,6 +3786,8 @@
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
+  int ref_frame_mask = 0;
+  int mode_skip_mask = 0;
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
@@ -3791,13 +3823,12 @@
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  cpi->ref_frame_mask = 0;
   for (ref_frame = LAST_FRAME;
        ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
     int i;
     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
       if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
-        cpi->ref_frame_mask |= (1 << ref_frame);
+        ref_frame_mask |= (1 << ref_frame);
         break;
       }
     }
@@ -3830,23 +3861,23 @@
       if (mode_index == 3) {
         switch (vp9_ref_order[best_mode_index].ref_frame[0]) {
           case INTRA_FRAME:
-            cpi->mode_skip_mask = 0;
+            mode_skip_mask = 0;
             break;
           case LAST_FRAME:
-            cpi->mode_skip_mask = 0x0010;
+            mode_skip_mask = 0x0010;
             break;
           case GOLDEN_FRAME:
-            cpi->mode_skip_mask = 0x0008;
+            mode_skip_mask = 0x0008;
             break;
           case ALTREF_FRAME:
-            cpi->mode_skip_mask = 0x0000;
+            mode_skip_mask = 0x0000;
             break;
           case NONE:
           case MAX_REF_FRAMES:
             assert(0 && "Invalid Reference frame");
         }
       }
-      if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
+      if (mode_skip_mask & (1 << mode_index))
         continue;
     }
 

diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index ca11dda..0040477 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c

@@ -29,7 +29,6 @@
 #include "vpx_scale/vpx_scale.h"
 
 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
-#define ALT_REF_SUBPEL_ENABLED 1  // dis/enable subpel in MC AltRef filtering
 
 static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             uint8_t *y_mb_ptr,
@@ -160,11 +159,9 @@
 
   /*cpi->sf.search_method == HEX*/
   // Ignore mv costing by sending NULL pointer instead of cost arrays
-  bestsme = vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
-                           &cpi->fn_ptr[BLOCK_16X16],
-                           0, &best_ref_mv1, ref_mv);
+  vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+                 &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
 
-#if ALT_REF_SUBPEL_ENABLED
   // Try sub-pixel MC?
   // if (bestsme > error_thresh && bestsme < INT_MAX)
   {
@@ -180,7 +177,6 @@
                                            NULL, NULL,
                                            &distortion, &sse);
   }
-#endif
 
   // Restore input state
   x->plane[0].src = src;

diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 0f68d83..2be00ff 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c

@@ -214,7 +214,7 @@
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
-  const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
+  const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;
   const scan_order *so;
@@ -241,7 +241,7 @@
   while (c < eob) {
     int v = 0;
     int skip_eob = 0;
-    v = qcoeff_ptr[scan[c]];
+    v = qcoeff[scan[c]];
 
     while (!v) {
       add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob,
@@ -252,12 +252,13 @@
       token_cache[scan[c]] = 0;
       ++c;
       pt = get_coef_context(nb, token_cache, c);
-      v = qcoeff_ptr[scan[c]];
+      v = qcoeff[scan[c]];
     }
 
     add_token(&t, coef_probs[band[c]][pt],
               vp9_dct_value_tokens_ptr[v].extra,
-              vp9_dct_value_tokens_ptr[v].token, skip_eob,
+              (uint8_t)vp9_dct_value_tokens_ptr[v].token,
+              (uint8_t)skip_eob,
               counts[band[c]][pt]);
     eob_branch[band[c]][pt] += !skip_eob;
 
@@ -314,7 +315,7 @@
   const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
                                               SEG_LVL_SKIP);
   struct tokenize_b_args arg = {cpi, xd, t, cpi->mb.token_cache};
-  if (mbmi->skip_coeff) {
+  if (mbmi->skip) {
     if (!dry_run)
       cm->counts.skip[ctx][1] += skip_inc;
     reset_skip_context(xd, bsize);

diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c
index 1f9cb87..600029b 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_vaq.c

@@ -19,8 +19,8 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-#define ENERGY_MIN (-3)
-#define ENERGY_MAX (3)
+#define ENERGY_MIN (-1)
+#define ENERGY_MAX (1)
 #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN +  1)
 #define ENERGY_IN_BOUNDS(energy)\
   assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
@@ -44,7 +44,7 @@
 double vp9_vaq_rdmult_ratio(int energy) {
   ENERGY_IN_BOUNDS(energy);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   return RDMULT_RATIO(energy);
 }
@@ -52,7 +52,7 @@
 double vp9_vaq_inv_q_ratio(int energy) {
   ENERGY_IN_BOUNDS(energy);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   return Q_RATIO(-energy);
 }
@@ -63,9 +63,9 @@
 
   assert(ENERGY_SPAN <= MAX_SEGMENTS);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
-  base_ratio = 1.8;
+  base_ratio = 1.5;
 
   for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
     Q_RATIO(i) = pow(base_ratio, i/3.0);
@@ -75,35 +75,39 @@
 void vp9_vaq_frame_setup(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   struct segmentation *seg = &cm->seg;
-  int base_q = vp9_convert_qindex_to_q(cm->base_qindex);
-  int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
-                                        cm->y_dc_delta_q);
+  const double base_q = vp9_convert_qindex_to_q(cm->base_qindex);
+  const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
+                                              cm->y_dc_delta_q);
   int i;
 
-  vp9_enable_segmentation((VP9_PTR)cpi);
-  vp9_clearall_segfeatures(seg);
+  if (cm->frame_type == KEY_FRAME ||
+      cpi->refresh_alt_ref_frame ||
+      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    vp9_enable_segmentation((VP9_PTR)cpi);
+    vp9_clearall_segfeatures(seg);
 
-  seg->abs_delta = SEGMENT_DELTADATA;
+    seg->abs_delta = SEGMENT_DELTADATA;
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
-  for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
-    int qindex_delta, segment_rdmult;
+    for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
+      int qindex_delta, segment_rdmult;
 
-    if (Q_RATIO(i) == 1) {
-      // No need to enable SEG_LVL_ALT_Q for this segment
-      RDMULT_RATIO(i) = 1;
-      continue;
+      if (Q_RATIO(i) == 1) {
+        // No need to enable SEG_LVL_ALT_Q for this segment
+        RDMULT_RATIO(i) = 1;
+        continue;
+      }
+
+      qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i));
+      vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
+      vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
+
+      segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
+                                           cm->y_dc_delta_q);
+
+      RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
     }
-
-    qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i));
-    vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
-    vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
-
-    segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
-                                         cm->y_dc_delta_q);
-
-    RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
   }
 }
 
@@ -137,11 +141,8 @@
   double energy;
   unsigned int var = block_variance(cpi, x, bs);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
-  // if (var <= 1000)
-  //   return 0;
-
-  energy = 0.9*(logf(var + 1) - 10.0);
-  return clamp(round(energy), ENERGY_MIN, ENERGY_MAX);
+  energy = 0.9 * (log(var + 1.0) - 10.0);
+  return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
 }

diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vp9/encoder/vp9_write_bit_buffer.h
index 5958b48..1795e05 100644
--- a/vp9/encoder/vp9_write_bit_buffer.h
+++ b/vp9/encoder/vp9_write_bit_buffer.h

@@ -29,7 +29,7 @@
 }
 
 static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
-  const int off = wb->bit_offset;
+  const int off = (int)wb->bit_offset;
   const int p = off / CHAR_BIT;
   const int q = CHAR_BIT - 1 - off % CHAR_BIT;
   if (q == CHAR_BIT -1) {

diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
index 2b82d97..b5269ed 100644
--- a/vp9/encoder/x86/vp9_dct_avx2.c
+++ b/vp9/encoder/x86/vp9_dct_avx2.c

@@ -16,7 +16,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -46,7 +46,7 @@
     in3 = _mm_slli_epi16(in3, 4);
     // if (i == 0 && input[0]) input[0] += 1;
     {
-      // The mask will only contain wether the first value is zero, all
+      // The mask will only contain whether the first value is zero, all
       // other comparison will fail as something shifted by 4 (above << 4)
       // can never be equal to one. To increment in the non-zero case, we
       // add the mask and one for the first element:
@@ -59,7 +59,7 @@
   }
   // Do the two transform/transpose passes
   for (pass = 0; pass < 2; ++pass) {
-    // Transform 1/2: Add/substract
+    // Transform 1/2: Add/subtract
     const __m128i r0 = _mm_add_epi16(in0, in3);
     const __m128i r1 = _mm_add_epi16(in1, in2);
     const __m128i r2 = _mm_sub_epi16(in1, in2);
@@ -317,7 +317,7 @@
   for (pass = 0; pass < 2; pass++) {
     // To store results of each pass before the transpose.
     __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/substract
+    // Add/subtract
     const __m128i q0 = _mm_add_epi16(in0, in7);
     const __m128i q1 = _mm_add_epi16(in1, in6);
     const __m128i q2 = _mm_add_epi16(in2, in5);
@@ -328,7 +328,7 @@
     const __m128i q7 = _mm_sub_epi16(in0, in7);
     // Work on first four results
     {
-      // Add/substract
+      // Add/subtract
       const __m128i r0 = _mm_add_epi16(q0, q3);
       const __m128i r1 = _mm_add_epi16(q1, q2);
       const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -390,7 +390,7 @@
       // Combine
       const __m128i r0 = _mm_packs_epi32(s0, s1);
       const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/substract
+      // Add/subtract
       const __m128i x0 = _mm_add_epi16(q4, r0);
       const __m128i x1 = _mm_sub_epi16(q4, r0);
       const __m128i x2 = _mm_sub_epi16(q7, r1);
@@ -1071,7 +1071,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -1228,7 +1228,7 @@
       }
       // Work on the first eight values; fdct8(input, even_results);
       {
-        // Add/substract
+        // Add/subtract
         const __m128i q0 = _mm_add_epi16(input0, input7);
         const __m128i q1 = _mm_add_epi16(input1, input6);
         const __m128i q2 = _mm_add_epi16(input2, input5);
@@ -1239,7 +1239,7 @@
         const __m128i q7 = _mm_sub_epi16(input0, input7);
         // Work on first four results
         {
-          // Add/substract
+          // Add/subtract
           const __m128i r0 = _mm_add_epi16(q0, q3);
           const __m128i r1 = _mm_add_epi16(q1, q2);
           const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -1303,7 +1303,7 @@
           // Combine
           const __m128i r0 = _mm_packs_epi32(s0, s1);
           const __m128i r1 = _mm_packs_epi32(s2, s3);
-          // Add/substract
+          // Add/subtract
           const __m128i x0 = _mm_add_epi16(q4, r0);
           const __m128i x1 = _mm_sub_epi16(q4, r0);
           const __m128i x2 = _mm_sub_epi16(q7, r1);

diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 852cf86..f3735eb 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c

@@ -16,7 +16,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -47,7 +47,7 @@
     in1 = _mm_slli_epi16(in1, 4);
     // if (i == 0 && input[0]) input[0] += 1;
     {
-      // The mask will only contain wether the first value is zero, all
+      // The mask will only contain whether the first value is zero, all
       // other comparison will fail as something shifted by 4 (above << 4)
       // can never be equal to one. To increment in the non-zero case, we
       // add the mask and one for the first element:
@@ -60,7 +60,7 @@
   }
   // Do the two transform/transpose passes
   for (pass = 0; pass < 2; ++pass) {
-    // Transform 1/2: Add/substract
+    // Transform 1/2: Add/subtract
     const __m128i r0 = _mm_add_epi16(in0, in1);
     const __m128i r1 = _mm_sub_epi16(in0, in1);
     const __m128i r2 = _mm_unpacklo_epi64(r0, r1);
@@ -315,7 +315,7 @@
   for (pass = 0; pass < 2; pass++) {
     // To store results of each pass before the transpose.
     __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/substract
+    // Add/subtract
     const __m128i q0 = _mm_add_epi16(in0, in7);
     const __m128i q1 = _mm_add_epi16(in1, in6);
     const __m128i q2 = _mm_add_epi16(in2, in5);
@@ -326,7 +326,7 @@
     const __m128i q7 = _mm_sub_epi16(in0, in7);
     // Work on first four results
     {
-      // Add/substract
+      // Add/subtract
       const __m128i r0 = _mm_add_epi16(q0, q3);
       const __m128i r1 = _mm_add_epi16(q1, q2);
       const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -388,7 +388,7 @@
       // Combine
       const __m128i r0 = _mm_packs_epi32(s0, s1);
       const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/substract
+      // Add/subtract
       const __m128i x0 = _mm_add_epi16(q4, r0);
       const __m128i x1 = _mm_sub_epi16(q4, r0);
       const __m128i x2 = _mm_sub_epi16(q7, r1);
@@ -1069,7 +1069,7 @@
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -1226,7 +1226,7 @@
       }
       // Work on the first eight values; fdct8(input, even_results);
       {
-        // Add/substract
+        // Add/subtract
         const __m128i q0 = _mm_add_epi16(input0, input7);
         const __m128i q1 = _mm_add_epi16(input1, input6);
         const __m128i q2 = _mm_add_epi16(input2, input5);
@@ -1237,7 +1237,7 @@
         const __m128i q7 = _mm_sub_epi16(input0, input7);
         // Work on first four results
         {
-          // Add/substract
+          // Add/subtract
           const __m128i r0 = _mm_add_epi16(q0, q3);
           const __m128i r1 = _mm_add_epi16(q1, q2);
           const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -1301,7 +1301,7 @@
           // Combine
           const __m128i r0 = _mm_packs_epi32(s0, s1);
           const __m128i r1 = _mm_packs_epi32(s2, s3);
-          // Add/substract
+          // Add/subtract
           const __m128i x0 = _mm_add_epi16(q4, r0);
           const __m128i x1 = _mm_sub_epi16(q4, r0);
           const __m128i x2 = _mm_sub_epi16(q7, r1);

diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
new file mode 100644
index 0000000..b8bfa89
--- /dev/null
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c

@@ -0,0 +1,641 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vpx_ports/mem.h"
+#include "vp9/encoder/vp9_variance.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
+  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
+  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
+  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
+  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
+  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
+  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
+  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
+  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
+};
+
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset,
+                                             int y_offset,
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             int height,
+                                             unsigned int *sse) {
+  __m256i src_reg,  dst_reg,  exp_src_lo,  exp_src_hi,  exp_dst_lo,  exp_dst_hi;
+  __m256i sse_reg,  sum_reg,  sse_reg_hi,  res_cmp,  sum_reg_lo,  sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
+
+  if (x_offset == 0) {
+    // x_offset = 0 and y_offset = 0
+    if (y_offset == 0) {
+      for (i = 0; i < height ; i++) {
+        // load source and destination
+        src_reg = _mm256_loadu_si256((__m256i const *) (src));
+        dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+        // expend each byte to 2 bytes
+        exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+        exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+        exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+        exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+        // source - dest
+        exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+        exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+        // calculate sum
+        sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+        exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+        sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+        exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+        // calculate sse
+        sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+        sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = 8
+    } else if (y_offset == 8) {
+        __m256i src_next_reg;
+        for (i = 0; i < height ; i++) {
+          // load source + next source + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *)
+                                         (src + src_stride));
+          dst_reg = _mm256_load_si256((__m256i const *) (dst));
+          // average between current and next stride source
+          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+          // expend each byte to 2 bytes
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+        __m256i filter, pw8, src_next_reg;
+#if (ARCH_X86_64)
+        int64_t y_offset64;
+        y_offset64 = y_offset;
+        y_offset64 <<= 5;
+        filter = _mm256_load_si256(
+            (__m256i const *)(bilinear_filters_avx2 + y_offset64));
+#else
+        y_offset <<= 5;
+        filter = _mm256_load_si256(
+            (__m256i const *)(bilinear_filters_avx2 + y_offset));
+#endif
+        pw8 = _mm256_set1_epi16(8);
+        for (i = 0; i < height ; i++) {
+          // load current and next source + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *)
+                          (src + src_stride));
+          dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+          // merge current and next source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+          // add 8 to the source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // expand each byte to 2 byte in the destination
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+    }
+  // x_offset = 8  and y_offset = 0
+  } else if (x_offset == 8) {
+      if (y_offset == 0) {
+        __m256i src_next_reg;
+        for (i = 0; i < height ; i++) {
+          // load source and another source starting from the next
+          // following byte + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+          // average between source and the next byte following source
+          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+          // expand each byte to 2 bytes
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+      // x_offset = 8  and y_offset = 8
+      } else if (y_offset == 8) {
+          __m256i src_next_reg, src_avg;
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+
+          // average between source and the next byte following source
+          src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_load_si256((__m256i const *) (dst));
+            // average between source and the next byte following source
+            src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            // average between previous average to current average
+            src_avg = _mm256_avg_epu8(src_avg, src_reg);
+            // expand each byte to 2 bytes
+            exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg);
+
+            // save current source average
+            src_avg = src_reg;
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            dst+= dst_stride;
+          }
+      // x_offset = 8  and y_offset = bilin interpolation
+      } else {
+          __m256i filter, pw8, src_next_reg, src_avg;
+#if (ARCH_X86_64)
+          int64_t y_offset64;
+          y_offset64 = y_offset;
+          y_offset64 <<= 5;
+          filter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + y_offset64));
+#else
+          y_offset <<= 5;
+          filter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + y_offset));
+#endif
+          pw8 = _mm256_set1_epi16(8);
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          // average between source and the next byte following source
+          src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_load_si256((__m256i const *) (dst));
+            // average between source and the next byte following source
+            src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+            // merge previous average and current average
+            exp_src_lo = _mm256_unpacklo_epi8(src_avg, src_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_avg, src_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+            // add 8 to the source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide the source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            // save current source average
+            src_avg = src_reg;
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            dst+= dst_stride;
+          }
+      }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+      if (y_offset == 0) {
+        __m256i filter, pw8, src_next_reg;
+#if (ARCH_X86_64)
+        int64_t x_offset64;
+        x_offset64 = x_offset;
+        x_offset64 <<= 5;
+        filter = _mm256_load_si256(
+            (__m256i const *)(bilinear_filters_avx2 + x_offset64));
+#else
+        x_offset <<= 5;
+        filter = _mm256_load_si256(
+            (__m256i const *)(bilinear_filters_avx2 + x_offset));
+#endif
+        pw8 = _mm256_set1_epi16(8);
+        for (i = 0; i < height ; i++) {
+          // load source and another source starting from the next
+          // following byte + destination
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+          // merge current and next source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+          // add 8 to source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide the source by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // expand each byte to 2 bytes
+          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+          // source - dest
+          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+          // calculate sum
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+          // calculate sse
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+          src+= src_stride;
+          dst+= dst_stride;
+        }
+      // x_offset = bilin interpolation and y_offset = 8
+      } else if (y_offset == 8) {
+          __m256i filter, pw8, src_next_reg, src_pack;
+#if (ARCH_X86_64)
+          int64_t x_offset64;
+          x_offset64 = x_offset;
+          x_offset64 <<= 5;
+          filter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + x_offset64));
+#else
+          x_offset <<= 5;
+          filter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + x_offset));
+#endif
+          pw8 = _mm256_set1_epi16(8);
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+
+          // merge current and next stride source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+          // add 8 to source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide source by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // convert each 16 bit to 8 bit to each low and high lane source
+          src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+            // merge current and next stride source
+            exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
+
+            // add 8 to source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // convert each 16 bit to 8 bit to each low and high lane source
+            src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+            // average between previous pack to the current
+            src_pack = _mm256_avg_epu8(src_pack, src_reg);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg);
+
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // calculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            // save previous pack
+            src_pack = src_reg;
+            dst+= dst_stride;
+          }
+      // x_offset = bilin interpolation and y_offset = bilin interpolation
+      } else {
+          __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+#if (ARCH_X86_64)
+          int64_t x_offset64, y_offset64;
+          x_offset64 = x_offset;
+          x_offset64 <<= 5;
+          y_offset64 = y_offset;
+          y_offset64 <<= 5;
+          xfilter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + x_offset64));
+          yfilter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + y_offset64));
+#else
+          x_offset <<= 5;
+          xfilter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + x_offset));
+          y_offset <<= 5;
+          yfilter = _mm256_load_si256(
+              (__m256i const *)(bilinear_filters_avx2 + y_offset));
+#endif
+          pw8 = _mm256_set1_epi16(8);
+          // load source and another source starting from the next
+          // following byte
+          src_reg = _mm256_loadu_si256((__m256i const *) (src));
+          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+          // merge current and next stride source
+          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+          // filter the source
+          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter);
+          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter);
+
+          // add 8 to the source
+          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+          // divide the source by 16
+          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+          // convert each 16 bit to 8 bit to each low and high lane source
+          src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+          for (i = 0; i < height ; i++) {
+            src+= src_stride;
+            // load source and another source starting from the next
+            // following byte + destination
+            src_reg = _mm256_loadu_si256((__m256i const *) (src));
+            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
+            dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+            // merge current and next stride source
+            exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter);
+
+            // add 8 to source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // convert each 16 bit to 8 bit to each low and high lane source
+            src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+            // merge previous pack to current pack source
+            exp_src_lo = _mm256_unpacklo_epi8(src_pack, src_reg);
+            exp_src_hi = _mm256_unpackhi_epi8(src_pack, src_reg);
+
+            // filter the source
+            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, yfilter);
+            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, yfilter);
+
+            // expand each byte to 2 bytes
+            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
+            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
+
+            // add 8 to source
+            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
+            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
+
+            // divide source by 16
+            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
+            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+            // source - dest
+            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
+            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
+
+            // caculate sum
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
+            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
+            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
+            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
+
+            // calculate sse
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
+            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+            src_pack = src_reg;
+            dst+= dst_stride;
+          }
+      }
+  }
+  // sum < 0
+  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);
+  // save the next 8 bytes of each lane of sse
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);
+  // merge the result of sum < 0  with sum to add sign to the next 16 bits
+  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);
+  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);
+  // add each 8 bytes from every lane of sse and sum
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);
+  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);
+
+  // save the next 4 bytes of each lane sse
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);
+  // save the next 8 bytes of each lane of sum
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);
+
+  // add the first 4 bytes to the next 4 bytes sse
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);
+  // add the first 8 bytes to the next 8 bytes
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);
+  // extract the low lane and the high lane and add the results
+  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +
+                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1));
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);
+  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+  return sum;
+}

diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index c9b90d5..02007a3 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c

@@ -42,6 +42,18 @@
   int *Sum
 );
 
+unsigned int vp9_sub_pixel_variance32xh_avx2
+(
+  const uint8_t *src,
+  int src_stride,
+  int x_offset,
+  int y_offset,
+  const uint8_t *dst,
+  int dst_stride,
+  int height,
+  unsigned int *sse
+);
+
 static void variance_avx2(const unsigned char *src_ptr, int  source_stride,
                         const unsigned char *ref_ptr, int  recon_stride,
                         int  w, int  h, unsigned int *sse, int *sum,
@@ -155,3 +167,43 @@
   *sse = var;
   return (var - (((int64_t)avg * avg) >> 11));
 }
+
+unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse_ptr) {
+  // processing 32 elements in parallel
+  unsigned int sse;
+  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                           y_offset, dst, dst_stride,
+                                           64, &sse);
+  // processing the next 32 elements in parallel
+  unsigned int sse2;
+  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+                                            x_offset, y_offset,
+                                            dst + 32, dst_stride,
+                                            64, &sse2);
+  se += se2;
+  sse += sse2;
+  *sse_ptr = sse;
+  return sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse_ptr) {
+  // processing 32 element in parallel
+  unsigned int sse;
+  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                           y_offset, dst, dst_stride,
+                                           32, &sse);
+  *sse_ptr = sse;
+  return sse - (((int64_t)se * se) >> 10);
+}

diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 85e83b8..a448b3c 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk

@@ -79,6 +79,8 @@
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
+VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index f5d5b24..39b683e 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c

@@ -178,7 +178,7 @@
 
   RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
   if (cfg->ts_number_layers > 1) {
-    int i;
+    unsigned int i;
     for (i = 1; i < cfg->ts_number_layers; ++i) {
       if (cfg->ts_target_bitrate[i] < cfg->ts_target_bitrate[i-1]) {
         ERROR("ts_target_bitrate entries are not increasing");
@@ -264,7 +264,7 @@
 
 static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
                                        vpx_codec_enc_cfg_t cfg,
-                                       struct vp9_extracfg vp8_cfg) {
+                                       struct vp9_extracfg vp9_cfg) {
   oxcf->version = cfg.g_profile;
   oxcf->width   = cfg.g_w;
   oxcf->height  = cfg.g_h;
@@ -305,11 +305,11 @@
     oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
 
   oxcf->target_bandwidth         = cfg.rc_target_bitrate;
-  oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
+  oxcf->rc_max_intra_bitrate_pct = vp9_cfg.rc_max_intra_bitrate_pct;
 
   oxcf->best_allowed_q          = cfg.rc_min_quantizer;
   oxcf->worst_allowed_q         = cfg.rc_max_quantizer;
-  oxcf->cq_level                = vp8_cfg.cq_level;
+  oxcf->cq_level                = vp9_cfg.cq_level;
   oxcf->fixed_q = -1;
 
   oxcf->under_shoot_pct         = cfg.rc_undershoot_pct;
@@ -330,30 +330,30 @@
   // oxcf->kf_min_dist         = cfg.kf_min_dis;
   oxcf->key_freq               = cfg.kf_max_dist;
 
-  oxcf->cpu_used               =  vp8_cfg.cpu_used;
-  oxcf->encode_breakout        =  vp8_cfg.static_thresh;
-  oxcf->play_alternate         =  vp8_cfg.enable_auto_alt_ref;
-  oxcf->noise_sensitivity      =  vp8_cfg.noise_sensitivity;
-  oxcf->sharpness              =  vp8_cfg.sharpness;
+  oxcf->cpu_used               =  vp9_cfg.cpu_used;
+  oxcf->encode_breakout        =  vp9_cfg.static_thresh;
+  oxcf->play_alternate         =  vp9_cfg.enable_auto_alt_ref;
+  oxcf->noise_sensitivity      =  vp9_cfg.noise_sensitivity;
+  oxcf->sharpness              =  vp9_cfg.sharpness;
 
   oxcf->two_pass_stats_in      =  cfg.rc_twopass_stats_in;
-  oxcf->output_pkt_list        =  vp8_cfg.pkt_list;
+  oxcf->output_pkt_list        =  vp9_cfg.pkt_list;
 
-  oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
-  oxcf->arnr_strength   = vp8_cfg.arnr_strength;
-  oxcf->arnr_type       = vp8_cfg.arnr_type;
+  oxcf->arnr_max_frames = vp9_cfg.arnr_max_frames;
+  oxcf->arnr_strength   = vp9_cfg.arnr_strength;
+  oxcf->arnr_type       = vp9_cfg.arnr_type;
 
-  oxcf->tuning = vp8_cfg.tuning;
+  oxcf->tuning = vp9_cfg.tuning;
 
-  oxcf->tile_columns = vp8_cfg.tile_columns;
-  oxcf->tile_rows    = vp8_cfg.tile_rows;
+  oxcf->tile_columns = vp9_cfg.tile_columns;
+  oxcf->tile_rows    = vp9_cfg.tile_rows;
 
-  oxcf->lossless = vp8_cfg.lossless;
+  oxcf->lossless = vp9_cfg.lossless;
 
   oxcf->error_resilient_mode         = cfg.g_error_resilient;
-  oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode;
+  oxcf->frame_parallel_decoding_mode = vp9_cfg.frame_parallel_decoding_mode;
 
-  oxcf->aq_mode = vp8_cfg.aq_mode;
+  oxcf->aq_mode = vp9_cfg.aq_mode;
 
   oxcf->ss_number_layers = cfg.ss_number_layers;
 
@@ -365,7 +365,7 @@
     memcpy(oxcf->ts_rate_decimator, cfg.ts_rate_decimator,
            sizeof(cfg.ts_rate_decimator));
   } else if (oxcf->ts_number_layers == 1) {
-    oxcf->ts_target_bitrate[0] = oxcf->target_bandwidth;
+    oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth;
     oxcf->ts_rate_decimator[0] = 1;
   }
 
@@ -639,7 +639,7 @@
 
     *x++ = marker;
     for (i = 0; i < ctx->pending_frame_count; i++) {
-      int this_sz = ctx->pending_frame_sizes[i];
+      unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i];
 
       for (j = 0; j <= mag; j++) {
         *x++ = this_sz & 0xff;
@@ -1045,11 +1045,11 @@
   cpi->svc.temporal_layer_id = data->temporal_layer_id;
   // Checks on valid layer_id input.
   if (cpi->svc.temporal_layer_id < 0 ||
-      cpi->svc.temporal_layer_id >= ctx->cfg.ts_number_layers) {
+      cpi->svc.temporal_layer_id >= (int)ctx->cfg.ts_number_layers) {
     return VPX_CODEC_INVALID_PARAM;
   }
   if (cpi->svc.spatial_layer_id < 0 ||
-      cpi->svc.spatial_layer_id >= ctx->cfg.ss_number_layers) {
+      cpi->svc.spatial_layer_id >= (int)ctx->cfg.ss_number_layers) {
     return VPX_CODEC_INVALID_PARAM;
   }
   return VPX_CODEC_OK;

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 41750de..76cbebf 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c

@@ -60,6 +60,11 @@
   int                     img_setup;
   int                     img_avail;
   int                     invert_tile_order;
+
+  // External frame buffer info to save for VP9 common.
+  void *ext_priv;  // Private data associated with the external frame buffers.
+  vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
 };
 
 static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si,
@@ -209,7 +214,7 @@
                        ? sizeof(vp9_stream_info_t)
                        : sizeof(vpx_codec_stream_info_t);
   memcpy(si, &ctx->si, sz);
-  si->sz = sz;
+  si->sz = (unsigned int)sz;
 
   return VPX_CODEC_OK;
 }
@@ -300,16 +305,22 @@
         VP9D_COMP *const pbi = (VP9D_COMP*)optr;
         VP9_COMMON *const cm = &pbi->common;
 
-        cm->get_fb_cb = vp9_get_frame_buffer;
-        cm->release_fb_cb = vp9_release_frame_buffer;
-
         // Set index to not initialized.
         cm->new_fb_idx = -1;
 
-        if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
-          vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                             "Failed to initialize internal frame buffers");
-        cm->cb_priv = &cm->int_frame_buffers;
+        if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+          cm->get_fb_cb = ctx->get_ext_fb_cb;
+          cm->release_fb_cb = ctx->release_ext_fb_cb;
+          cm->cb_priv = ctx->ext_priv;
+        } else {
+          cm->get_fb_cb = vp9_get_frame_buffer;
+          cm->release_fb_cb = vp9_release_frame_buffer;
+
+          if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to initialize internal frame buffers");
+          cm->cb_priv = &cm->int_frame_buffers;
+        }
 
         ctx->pbi = optr;
       }
@@ -350,7 +361,11 @@
 
     if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp,
                                        &time_end_stamp, &flags)) {
+      VP9D_COMP *const pbi = (VP9D_COMP*)ctx->pbi;
+      VP9_COMMON *const cm = &pbi->common;
       yuvconfig2image(&ctx->img, &sd, user_priv);
+
+      ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
       ctx->img_avail = 1;
     }
   }
@@ -447,7 +462,7 @@
     while (data_start < data_end && *data_start == 0)
       data_start++;
 
-    data_sz = data_end - data_start;
+    data_sz = (unsigned int)(data_end - data_start);
   } while (data_start < data_end);
   return res;
 }
@@ -470,6 +485,24 @@
   return img;
 }
 
+static vpx_codec_err_t vp9_set_fb_fn(
+    vpx_codec_alg_priv_t *ctx,
+    vpx_get_frame_buffer_cb_fn_t cb_get,
+    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  if (cb_get == NULL || cb_release == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  } else if (ctx->pbi == NULL) {
+    // If the decoder has already been initialized, do not accept changes to
+    // the frame buffer functions.
+    ctx->get_ext_fb_cb = cb_get;
+    ctx->release_ext_fb_cb = cb_release;
+    ctx->ext_priv = cb_priv;
+    return VPX_CODEC_OK;
+  }
+
+  return VPX_CODEC_ERROR;
+}
+
 static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
                                         vpx_codec_mmap_t *mmap,
                                         vpx_codec_iter_t *iter) {
@@ -703,7 +736,8 @@
 CODEC_INTERFACE(vpx_codec_vp9_dx) = {
   "WebM Project VP9 Decoder" VERSION_STRING,
   VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC,
+  VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC |
+      VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER,
   /* vpx_codec_caps_t          caps; */
   vp9_init,         /* vpx_codec_init_fn_t       init; */
   vp9_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
@@ -715,6 +749,7 @@
     vp9_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
     vp9_decode,       /* vpx_codec_decode_fn_t     decode; */
     vp9_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
+    vp9_set_fb_fn,    /* vpx_codec_set_fb_fn_t     set_fb_fn; */
   },
   { // NOLINT
     /* encoder functions */

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index c0d973b..27dd6f6 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -86,6 +86,7 @@
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 

diff --git a/vp9_spatial_scalable_encoder.c b/vp9_spatial_scalable_encoder.c
index bbbe7ed..9f526b0 100644
--- a/vp9_spatial_scalable_encoder.c
+++ b/vp9_spatial_scalable_encoder.c

@@ -75,10 +75,10 @@
 static const uint32_t default_kf_dist = 100;
 
 typedef struct {
-  char *output_filename;
+  const char *input_filename;
+  const char *output_filename;
   uint32_t frames_to_code;
   uint32_t frames_to_skip;
-  struct VpxInputContext input_ctx;
 } AppInput;
 
 static const char *exec_name;
@@ -94,8 +94,10 @@
 static void parse_command_line(int argc, const char **argv_,
                                AppInput *app_input, SvcContext *svc_ctx,
                                vpx_codec_enc_cfg_t *enc_cfg) {
-  struct arg arg;
-  char **argv, **argi, **argj;
+  struct arg arg = {0};
+  char **argv = NULL;
+  char **argi = NULL;
+  char **argj = NULL;
   vpx_codec_err_t res;
 
   // initialize SvcContext with parameters that will be passed to vpx_svc_init
@@ -162,7 +164,7 @@
   if (argv[0] == NULL || argv[1] == 0) {
     usage_exit();
   }
-  app_input->input_ctx.filename = argv[0];
+  app_input->input_filename = argv[0];
   app_input->output_filename = argv[1];
   free(argv);
 
@@ -196,6 +198,7 @@
   vpx_codec_err_t res;
   int pts = 0;            /* PTS starts at 0 */
   int frame_duration = 1; /* 1 timebase tick per frame */
+  FILE *infile = NULL;
 
   memset(&svc_ctx, 0, sizeof(svc_ctx));
   svc_ctx.log_print = 1;
@@ -206,8 +209,8 @@
   if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32))
     die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
 
-  if (!(app_input.input_ctx.file = fopen(app_input.input_ctx.filename, "rb")))
-    die("Failed to open %s for reading\n", app_input.input_ctx.filename);
+  if (!(infile = fopen(app_input.input_filename, "rb")))
+    die("Failed to open %s for reading\n", app_input.input_filename);
 
   // Initialize codec
   if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) !=
@@ -229,13 +232,13 @@
     die("Failed to open %s for writing\n", app_input.output_filename);
 
   // skip initial frames
-  for (i = 0; i < app_input.frames_to_skip; ++i) {
-    read_yuv_frame(&app_input.input_ctx, &raw);
-  }
+  for (i = 0; i < app_input.frames_to_skip; ++i)
+    vpx_img_read(&raw, infile);
 
   // Encode frames
   while (frame_cnt < app_input.frames_to_code) {
-    if (read_yuv_frame(&app_input.input_ctx, &raw)) break;
+    if (!vpx_img_read(&raw, infile))
+      break;
 
     res = vpx_svc_encode(&svc_ctx, &codec, &raw, pts, frame_duration,
                          VPX_DL_REALTIME);
@@ -255,7 +258,7 @@
 
   printf("Processed %d frames\n", frame_cnt);
 
-  fclose(app_input.input_ctx.file);
+  fclose(infile);
   if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
 
   vpx_video_writer_close(writer);

diff --git a/vpx/exports_dec b/vpx/exports_dec
index ed121f7..3ce1499 100644
--- a/vpx/exports_dec
+++ b/vpx/exports_dec

@@ -6,4 +6,5 @@
 text vpx_codec_peek_stream_info
 text vpx_codec_register_put_frame_cb
 text vpx_codec_register_put_slice_cb
+text vpx_codec_set_frame_buffer_functions
 text vpx_codec_set_mem_map

diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 0f42a1d..51ca65e 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h

@@ -59,7 +59,7 @@
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define VPX_CODEC_INTERNAL_ABI_VERSION (4) /**<\hideinitializer*/
+#define VPX_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/
 
 typedef struct vpx_codec_alg_priv  vpx_codec_alg_priv_t;
 typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t;
@@ -218,6 +218,36 @@
 typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx,
                                                  vpx_codec_iter_t     *iter);
 
+/*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers functions to be called when libvpx needs a frame buffer
+ * to decode the current frame and a function to be called when libvpx does
+ * not internally reference the frame buffer. This set function must
+ * be called before the first call to decode or libvpx will assume the
+ * default behavior of allocating frame buffers internally.
+ *
+ * \param[in] ctx          Pointer to this instance's context
+ * \param[in] cb_get       Pointer to the get callback function
+ * \param[in] cb_release   Pointer to the release callback function
+ * \param[in] cb_priv      Callback's private data
+ *
+ * \retval #VPX_CODEC_OK
+ *     External frame buffers will be used by libvpx.
+ * \retval #VPX_CODEC_INVALID_PARAM
+ *     One or more of the callbacks were NULL.
+ * \retval #VPX_CODEC_ERROR
+ *     Decoder context not initialized, or algorithm not capable of
+ *     using external frame buffers.
+ *
+ * \note
+ * When decoding VP9, the application may be required to pass in at least
+ * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame
+ * buffers.
+ */
+typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)(
+    vpx_codec_alg_priv_t *ctx,
+    vpx_get_frame_buffer_cb_fn_t cb_get,
+    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
 
 /*\brief eXternal Memory Allocation memory map get iterator
  *
@@ -308,6 +338,7 @@
     vpx_codec_get_si_fn_t     get_si;      /**< \copydoc ::vpx_codec_get_si_fn_t */
     vpx_codec_decode_fn_t     decode;      /**< \copydoc ::vpx_codec_decode_fn_t */
     vpx_codec_get_frame_fn_t  get_frame;   /**< \copydoc ::vpx_codec_get_frame_fn_t */
+    vpx_codec_set_fb_fn_t     set_fb_fn;   /**< \copydoc ::vpx_codec_set_fb_fn_t */
   } dec;
   struct vpx_codec_enc_iface {
     vpx_codec_enc_cfg_map_t           *cfg_maps;      /**< \copydoc ::vpx_codec_enc_cfg_map_t */

diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 4f5ba6f..adce476 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c

@@ -23,11 +23,13 @@
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 
-#if defined(__MINGW32__) && !defined(MINGW_HAS_SECURE_API)
+#ifdef __MINGW32__
 #define strtok_r strtok_s
+#ifndef MINGW_HAS_SECURE_API
 // proto from /usr/x86_64-w64-mingw32/include/sec_api/string_s.h
 _CRTIMP char *__cdecl strtok_s(char *str, const char *delim, char **context);
-#endif
+#endif  /* MINGW_HAS_SECURE_API */
+#endif  /* __MINGW32__ */
 
 #ifdef _MSC_VER
 #define strdup _strdup

diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c
index a99e48f..63fdaf3 100644
--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c

@@ -226,3 +226,21 @@
 
   return SAVE_STATUS(ctx, res);
 }
+
+vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
+    vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get,
+    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  vpx_codec_err_t res;
+
+  if (!ctx || !cb_get || !cb_release) {
+    res = VPX_CODEC_INVALID_PARAM;
+  } else if (!ctx->iface || !ctx->priv ||
+             !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+    res = VPX_CODEC_ERROR;
+  } else {
+    res = ctx->iface->dec.set_fb_fn(ctx->priv->alg_priv, cb_get, cb_release,
+                                    cb_priv);
+  }
+
+  return SAVE_STATUS(ctx, res);
+}

diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index d0ac1af..f7dde62 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h

@@ -297,9 +297,16 @@
   int alt_fb_idx;             /**< alt reference frame frame buffer index */
 } vpx_svc_parameters_t;
 
+/*!\brief  vp9 svc layer parameters
+ *
+ * This defines the spatial and temporal layer id numbers for svc encoding.
+ * This is used with the #VP9E_SET_SVC_LAYER_ID control to set the spatial and
+ * temporal layer id for the current frame.
+ *
+ */
 typedef struct vpx_svc_layer_id {
-  int spatial_layer_id;
-  int temporal_layer_id;
+  int spatial_layer_id;       /**< Spatial layer id number. */
+  int temporal_layer_id;      /**< Temporal layer id number. */
 } vpx_svc_layer_id_t;
 
 /*!\brief VP8 encoder control function parameter type

diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index 7356bae..ba18328 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h

@@ -30,6 +30,7 @@
 #endif
 
 #include "./vpx_codec.h"
+#include "./vpx_frame_buffer.h"
 
   /*!\brief Current ABI version number
    *
@@ -39,7 +40,7 @@
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_DECODER_ABI_VERSION (2 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_DECODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
 
   /*! \brief Decoder capabilities bitfield
    *
@@ -66,6 +67,8 @@
    */
 #define VPX_CODEC_CAP_FRAME_THREADING   0x200000 /**< Can support frame-based
                                                       multi-threading */
+#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 /**< Can support external
+                                                          frame buffers */
 
 #define VPX_CODEC_USE_POSTPROC   0x10000 /**< Postprocess decoded frame */
 #define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 /**< Conceal errors in decoded
@@ -326,6 +329,51 @@
 
   /*!@} - end defgroup cap_put_slice*/
 
+  /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
+   *
+   * The following section is required to be implemented for all decoders
+   * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
+   * Calling this function for codecs that don't advertise this capability
+   * will result in an error code being returned, usually VPX_CODEC_ERROR.
+   *
+   * \note
+   * Currently this only works with VP9.
+   * @{
+   */
+
+  /*!\brief Pass in external frame buffers for the decoder to use.
+   *
+   * Registers functions to be called when libvpx needs a frame buffer
+   * to decode the current frame and a function to be called when libvpx does
+   * not internally reference the frame buffer. This set function must
+   * be called before the first call to decode or libvpx will assume the
+   * default behavior of allocating frame buffers internally.
+   *
+   * \param[in] ctx          Pointer to this instance's context
+   * \param[in] cb_get       Pointer to the get callback function
+   * \param[in] cb_release   Pointer to the release callback function
+   * \param[in] cb_priv      Callback's private data
+   *
+   * \retval #VPX_CODEC_OK
+   *     External frame buffers will be used by libvpx.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     One or more of the callbacks were NULL.
+   * \retval #VPX_CODEC_ERROR
+   *     Decoder context not initialized, or algorithm not capable of
+   *     using external frame buffers.
+   *
+   * \note
+   * When decoding VP9, the application may be required to pass in at least
+   * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame
+   * buffers.
+   */
+  vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
+      vpx_codec_ctx_t *ctx,
+      vpx_get_frame_buffer_cb_fn_t cb_get,
+      vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+
+  /*!@} - end defgroup cap_external_frame_buffer */
+
   /*!@} - end defgroup decoder*/
 #ifdef __cplusplus
 }

diff --git a/vpx/vpx_frame_buffer.h b/vpx/vpx_frame_buffer.h
index b5489b4..e69df4b 100644
--- a/vpx/vpx_frame_buffer.h
+++ b/vpx/vpx_frame_buffer.h

@@ -11,6 +11,10 @@
 #ifndef VPX_VPX_FRAME_BUFFER_H_
 #define VPX_VPX_FRAME_BUFFER_H_
 
+/*!\file
+ * \brief Describes the decoder external frame buffer interface.
+ */
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -45,8 +49,9 @@
  * decoder needs a frame buffer to decode a compressed image into. This
  * function may be called more than once for every call to vpx_codec_decode.
  * The application may set fb->priv to some data which will be passed
- * back in the ximage and the release function call. On success the callback
- * must return 0. Any failure the callback must return a value less than 0.
+ * back in the ximage and the release function call. |fb| is guaranteed to
+ * not be NULL. On success the callback must return 0. Any failure the
+ * callback must return a value less than 0.
  *
  * \param[in] priv         Callback's private data
  * \param[in] new_size     Size in bytes needed by the buffer
@@ -58,8 +63,9 @@
 /*!\brief release frame buffer callback prototype
  *
  * This callback is invoked by the decoder when the frame buffer is not
- * referenced by any other buffers. On success the callback must return 0.
- * Any failure the callback must return a value less than 0.
+ * referenced by any other buffers. |fb| is guaranteed to not be NULL. On
+ * success the callback must return 0. Any failure the callback must return
+ * a value less than 0.
  *
  * \param[in] priv         Callback's private data
  * \param[in] fb           Pointer to vpx_codec_frame_buffer_t

diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index d27325c..8d0f4ec 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h

@@ -28,7 +28,7 @@
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_IMAGE_ABI_VERSION (1) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/
 
 
 #define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format */
@@ -139,6 +139,8 @@
     unsigned char *img_data;       /**< private */
     int      img_data_owner; /**< private */
     int      self_allocd;    /**< private */
+
+    void    *fb_priv; /**< Frame buffer data associated with the image. */
   } vpx_image_t; /**< alias for struct vpx_image */
 
   /**\brief Representation of a rectangle on a surface */

diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk
index e6cb52f..869a204 100644
--- a/vpx_ports/vpx_ports.mk
+++ b/vpx_ports/vpx_ports.mk

@@ -19,7 +19,6 @@
 PORTS_SRCS-$(BUILD_LIBVPX) += emms.asm
 PORTS_SRCS-$(BUILD_LIBVPX) += x86.h
 PORTS_SRCS-$(BUILD_LIBVPX) += x86_abi_support.asm
-PORTS_SRCS-$(BUILD_LIBVPX) += x86_cpuid.c
 endif
 
 PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c

diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index 603e2b6..bc99f89 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h

@@ -168,8 +168,6 @@
   return flags & mask;
 }
 
-vpx_cpu_t vpx_x86_vendor(void);
-
 #if ARCH_X86_64 && defined(_MSC_VER)
 unsigned __int64 __rdtsc(void);
 #pragma intrinsic(__rdtsc)

diff --git a/vpx_ports/x86_cpuid.c b/vpx_ports/x86_cpuid.c
deleted file mode 100644
index 02d382c..0000000
--- a/vpx_ports/x86_cpuid.c
+++ /dev/null

@@ -1,49 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <string.h>
-#include "x86.h"
-
-struct cpuid_vendors {
-  char vendor_string[12];
-  vpx_cpu_t vendor_id;
-};
-
-static struct cpuid_vendors cpuid_vendor_list[VPX_CPU_LAST] = {
-  { "AuthenticAMD", VPX_CPU_AMD           },
-  { "AMDisbetter!", VPX_CPU_AMD_OLD       },
-  { "CentaurHauls", VPX_CPU_CENTAUR       },
-  { "CyrixInstead", VPX_CPU_CYRIX         },
-  { "GenuineIntel", VPX_CPU_INTEL         },
-  { "NexGenDriven", VPX_CPU_NEXGEN        },
-  { "Geode by NSC", VPX_CPU_NSC           },
-  { "RiseRiseRise", VPX_CPU_RISE          },
-  { "SiS SiS SiS ", VPX_CPU_SIS           },
-  { "GenuineTMx86", VPX_CPU_TRANSMETA     },
-  { "TransmetaCPU", VPX_CPU_TRANSMETA_OLD },
-  { "UMC UMC UMC ", VPX_CPU_UMC           },
-  { "VIA VIA VIA ", VPX_CPU_VIA           },
-};
-
-vpx_cpu_t vpx_x86_vendor(void) {
-  unsigned int reg_eax;
-  unsigned int vs[3];
-  int i;
-
-  /* Get the Vendor String from the CPU */
-  cpuid(0, 0, reg_eax, vs[0], vs[2], vs[1]);
-
-  for (i = 0; i < VPX_CPU_LAST; i++) {
-    if (strncmp((const char *)vs, cpuid_vendor_list[i].vendor_string, 12) == 0)
-      return (cpuid_vendor_list[i].vendor_id);
-  }
-
-  return VPX_CPU_UNKNOWN;
-}

diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index ab0a30a..5e95d31 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c

@@ -60,7 +60,7 @@
     const int frame_size = yplane_size + 2 * uvplane_size;
 
     if (!ybf->buffer_alloc) {
-      ybf->buffer_alloc = vpx_memalign(32, frame_size);
+      ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
       ybf->buffer_alloc_sz = frame_size;
     }
 
@@ -180,12 +180,12 @@
       // removed if border is totally removed.
       vpx_memset(fb->data, 0, fb->size);
 
-      ybf->buffer_alloc = yv12_align_addr(fb->data, 32);
+      ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32);
     } else if (frame_size > ybf->buffer_alloc_sz) {
       // Allocation to hold larger frame, or first allocation.
       if (ybf->buffer_alloc)
         vpx_free(ybf->buffer_alloc);
-      ybf->buffer_alloc = vpx_memalign(32, frame_size);
+      ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
       if (!ybf->buffer_alloc)
         return -1;
 

diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index 525f3a0..cdde75c 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h

@@ -24,62 +24,60 @@
 #define VP9_ENC_BORDER_IN_PIXELS    160
 #define VP9_DEC_BORDER_IN_PIXELS    32
 
-  typedef struct yv12_buffer_config {
-    int   y_width;
-    int   y_height;
-    int   y_crop_width;
-    int   y_crop_height;
-    int   y_stride;
-    /*    int   yinternal_width; */
+typedef struct yv12_buffer_config {
+  int   y_width;
+  int   y_height;
+  int   y_crop_width;
+  int   y_crop_height;
+  int   y_stride;
 
-    int   uv_width;
-    int   uv_height;
-    int   uv_crop_width;
-    int   uv_crop_height;
-    int   uv_stride;
-    /*    int   uvinternal_width; */
+  int   uv_width;
+  int   uv_height;
+  int   uv_crop_width;
+  int   uv_crop_height;
+  int   uv_stride;
 
-    int   alpha_width;
-    int   alpha_height;
-    int   alpha_stride;
+  int   alpha_width;
+  int   alpha_height;
+  int   alpha_stride;
 
-    uint8_t *y_buffer;
-    uint8_t *u_buffer;
-    uint8_t *v_buffer;
-    uint8_t *alpha_buffer;
+  uint8_t *y_buffer;
+  uint8_t *u_buffer;
+  uint8_t *v_buffer;
+  uint8_t *alpha_buffer;
 
-    uint8_t *buffer_alloc;
-    int buffer_alloc_sz;
-    int border;
-    int frame_size;
+  uint8_t *buffer_alloc;
+  int buffer_alloc_sz;
+  int border;
+  int frame_size;
 
-    int corrupted;
-    int flags;
-  } YV12_BUFFER_CONFIG;
+  int corrupted;
+  int flags;
+} YV12_BUFFER_CONFIG;
 
-  int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                int width, int height, int border);
+int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                   int width, int height, int border);
-  int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
-                                    int width, int height, int border);
-  int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
-  int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                           int width, int height, int ss_x, int ss_y,
+                           int border);
+
+// Updates the yv12 buffer config with the frame buffer. If cb is not
+// NULL, then libvpx is using the frame buffer callbacks to handle memory.
+// If cb is not NULL, libvpx will call cb with minimum size in bytes needed
+// to decode the current frame. If cb is NULL, libvpx will allocate memory
+// internally to decode the current frame. Returns 0 on success. Returns < 0
+// on failure.
+int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                              int width, int height, int ss_x, int ss_y,
-                             int border);
-
-  // Updates the yv12 buffer config with the frame buffer. If cb is not
-  // NULL, then libvpx is using the frame buffer callbacks to handle memory.
-  // If cb is not NULL, libvpx will call cb with minimum size in bytes needed
-  // to decode the current frame. If cb is NULL, libvpx will allocate memory
-  // internally to decode the current frame. Returns 0 on success. Returns < 0
-  // on failure.
-  int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
-                               int width, int height, int ss_x, int ss_y,
-                               int border,
-                               vpx_codec_frame_buffer_t *fb,
-                               vpx_get_frame_buffer_cb_fn_t cb,
-                               void *cb_priv);
-  int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+                             int border,
+                             vpx_codec_frame_buffer_t *fb,
+                             vpx_get_frame_buffer_cb_fn_t cb,
+                             void *cb_priv);
+int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
 #ifdef __cplusplus
 }

diff --git a/vpxdec.c b/vpxdec.c
index e85c4fa..660f613 100644
--- a/vpxdec.c
+++ b/vpxdec.c

@@ -75,6 +75,8 @@
 static const arg_def_t scalearg = ARG_DEF("S", "scale", 0,
                                             "Scale output frames uniformly");
 
+static const arg_def_t fb_arg =
+    ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use");
 
 static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
                                         "Compute the MD5 sum of the decoded frame");
@@ -82,7 +84,7 @@
 static const arg_def_t *all_args[] = {
   &codecarg, &use_yv12, &use_i420, &flipuvarg, &noblitarg,
   &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
-  &threadsarg, &verbosearg, &scalearg,
+  &threadsarg, &verbosearg, &scalearg, &fb_arg,
   &md5arg,
   &error_concealment,
   NULL
@@ -302,6 +304,68 @@
           (float)frame_out * 1000000.0 / (float)dx_time);
 }
 
+struct ExternalFrameBuffer {
+  uint8_t* data;
+  size_t size;
+  int in_use;
+};
+
+struct ExternalFrameBufferList {
+  int num_external_frame_buffers;
+  struct ExternalFrameBuffer *ext_fb;
+};
+
+// Callback used by libvpx to request an external frame buffer. |cb_priv|
+// Application private data passed into the set function. |min_size| is the
+// minimum size in bytes needed to decode the next frame. |fb| pointer to the
+// frame buffer.
+int get_vp9_frame_buffer(void *cb_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb) {
+  int i;
+  struct ExternalFrameBufferList *const ext_fb_list =
+      (struct ExternalFrameBufferList *)cb_priv;
+  if (ext_fb_list == NULL)
+    return -1;
+
+  // Find a free frame buffer.
+  for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) {
+    if (!ext_fb_list->ext_fb[i].in_use)
+      break;
+  }
+
+  if (i == ext_fb_list->num_external_frame_buffers)
+    return -1;
+
+  if (ext_fb_list->ext_fb[i].size < min_size) {
+    free(ext_fb_list->ext_fb[i].data);
+    ext_fb_list->ext_fb[i].data = (uint8_t *)malloc(min_size);
+    if (!ext_fb_list->ext_fb[i].data)
+      return -1;
+
+    ext_fb_list->ext_fb[i].size = min_size;
+  }
+
+  fb->data = ext_fb_list->ext_fb[i].data;
+  fb->size = ext_fb_list->ext_fb[i].size;
+  ext_fb_list->ext_fb[i].in_use = 1;
+
+  // Set the frame buffer's private data to point at the external frame buffer.
+  fb->priv = &ext_fb_list->ext_fb[i];
+  return 0;
+}
+
+// Callback used by libvpx when there are no references to the frame buffer.
+// |cb_priv| user private data passed into the set function. |fb| pointer
+// to the frame buffer.
+int release_vp9_frame_buffer(void *cb_priv,
+                             vpx_codec_frame_buffer_t *fb) {
+  struct ExternalFrameBuffer *const ext_fb =
+      (struct ExternalFrameBuffer *)fb->priv;
+  (void)cb_priv;
+  ext_fb->in_use = 0;
+  return 0;
+}
+
 void generate_filename(const char *pattern, char *out, size_t q_len,
                        unsigned int d_w, unsigned int d_h,
                        unsigned int frame_in) {
@@ -418,6 +482,7 @@
 int main_loop(int argc, const char **argv_) {
   vpx_codec_ctx_t       decoder;
   char                  *fn = NULL;
+  int                    i;
   uint8_t               *buf = NULL;
   size_t                 bytes_in_buffer = 0, buffer_size = 0;
   FILE                  *infile;
@@ -447,6 +512,8 @@
   int                     do_scale = 0;
   vpx_image_t             *scaled_img = NULL;
   int                     frame_avail, got_data;
+  int                     num_external_frame_buffers = 0;
+  struct ExternalFrameBufferList ext_fb_list = {0};
 
   const char *outfile_pattern = NULL;
   char outfile_name[PATH_MAX] = {0};
@@ -505,6 +572,8 @@
       quiet = 0;
     else if (arg_match(&arg, &scalearg, argi))
       do_scale = 1;
+    else if (arg_match(&arg, &fb_arg, argi))
+      num_external_frame_buffers = arg_parse_uint(&arg);
 
 #if CONFIG_VP8_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
@@ -691,6 +760,19 @@
     arg_skip--;
   }
 
+  if (num_external_frame_buffers > 0) {
+    ext_fb_list.num_external_frame_buffers = num_external_frame_buffers;
+    ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
+        num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb));
+    if (vpx_codec_set_frame_buffer_functions(
+            &decoder, get_vp9_frame_buffer, release_vp9_frame_buffer,
+            &ext_fb_list)) {
+      fprintf(stderr, "Failed to configure external frame buffers: %s\n",
+              vpx_codec_error(&decoder));
+      return EXIT_FAILURE;
+    }
+  }
+
   frame_avail = 1;
   got_data = 0;
 
@@ -709,7 +791,8 @@
 
         vpx_usec_timer_start(&timer);
 
-        if (vpx_codec_decode(&decoder, buf, bytes_in_buffer, NULL, 0)) {
+        if (vpx_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer,
+                             NULL, 0)) {
           const char *detail = vpx_codec_error_detail(&decoder);
           warn("Failed to decode frame %d: %s",
                frame_in, vpx_codec_error(&decoder));
@@ -791,7 +874,7 @@
                                         vpx_input_ctx.height,
                                         &vpx_input_ctx.framerate, img->fmt);
             if (do_md5) {
-              MD5Update(&md5_ctx, (md5byte *)buf, len);
+              MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
             } else {
               fputs(buf, outfile);
             }
@@ -800,7 +883,7 @@
           // Y4M frame header
           len = y4m_write_frame_header(buf, sizeof(buf));
           if (do_md5) {
-            MD5Update(&md5_ctx, (md5byte *)buf, len);
+            MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
           } else {
             fputs(buf, outfile);
           }
@@ -863,6 +946,11 @@
 
   if (scaled_img) vpx_img_free(scaled_img);
 
+  for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) {
+    free(ext_fb_list.ext_fb[i].data);
+  }
+  free(ext_fb_list.ext_fb);
+
   fclose(infile);
   free(argv);
 

diff --git a/vpxenc.c b/vpxenc.c
index 73b3144..02856d9 100644
--- a/vpxenc.c
+++ b/vpxenc.c

@@ -717,17 +717,6 @@
       argj++;
   }
 
-  /* Validate global config */
-  if (global->passes == 0) {
-#if CONFIG_VP9_ENCODER
-    // Make default VP9 passes = 2 until there is a better quality 1-pass
-    // encoder
-    global->passes = strcmp(global->codec->name, "vp9") == 0 ? 2 : 1;
-#else
-    global->passes = 1;
-#endif
-  }
-
   if (global->pass) {
     /* DWIM: Assume the user meant passes=2 if pass=2 is specified */
     if (global->pass > global->passes) {
@@ -736,6 +725,23 @@
       global->passes = global->pass;
     }
   }
+  /* Validate global config */
+  if (global->passes == 0) {
+#if CONFIG_VP9_ENCODER
+    // Make default VP9 passes = 2 until there is a better quality 1-pass
+    // encoder
+    global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
+                      global->deadline != VPX_DL_REALTIME) ? 2 : 1;
+#else
+    global->passes = 1;
+#endif
+  }
+
+  if (global->deadline == VPX_DL_REALTIME &&
+      global->passes > 1) {
+    warn("Enforcing one-pass encoding in realtime mode\n");
+    global->passes = 1;
+  }
 }
 
 
@@ -826,6 +832,10 @@
 
     /* Allows removal of the application version from the EBML tags */
     stream->ebml.debug = global->debug;
+
+    /* Default lag_in_frames is 0 in realtime mode */
+    if (global->deadline == VPX_DL_REALTIME)
+      stream->config.cfg.g_lag_in_frames = 0;
   }
 
   /* Output files must be specified for each stream */
@@ -874,59 +884,63 @@
       continue;
     }
 
-    if (0);
-    else if (arg_match(&arg, &outputfile, argi))
+    if (0) {
+    } else if (arg_match(&arg, &outputfile, argi)) {
       config->out_fn = arg.val;
-    else if (arg_match(&arg, &fpf_name, argi))
+    } else if (arg_match(&arg, &fpf_name, argi)) {
       config->stats_fn = arg.val;
-    else if (arg_match(&arg, &use_ivf, argi))
+    } else if (arg_match(&arg, &use_ivf, argi)) {
       config->write_webm = 0;
-    else if (arg_match(&arg, &threads, argi))
+    } else if (arg_match(&arg, &threads, argi)) {
       config->cfg.g_threads = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &profile, argi))
+    } else if (arg_match(&arg, &profile, argi)) {
       config->cfg.g_profile = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &width, argi))
+    } else if (arg_match(&arg, &width, argi)) {
       config->cfg.g_w = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &height, argi))
+    } else if (arg_match(&arg, &height, argi)) {
       config->cfg.g_h = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &stereo_mode, argi))
+    } else if (arg_match(&arg, &stereo_mode, argi)) {
       config->stereo_fmt = arg_parse_enum_or_int(&arg);
-    else if (arg_match(&arg, &timebase, argi)) {
+    } else if (arg_match(&arg, &timebase, argi)) {
       config->cfg.g_timebase = arg_parse_rational(&arg);
       validate_positive_rational(arg.name, &config->cfg.g_timebase);
-    } else if (arg_match(&arg, &error_resilient, argi))
+    } else if (arg_match(&arg, &error_resilient, argi)) {
       config->cfg.g_error_resilient = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &lag_in_frames, argi))
+    } else if (arg_match(&arg, &lag_in_frames, argi)) {
       config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &dropframe_thresh, argi))
+      if (global->deadline == VPX_DL_REALTIME &&
+          config->cfg.g_lag_in_frames != 0) {
+        warn("non-zero %s option ignored in realtime mode.\n", arg.name);
+        config->cfg.g_lag_in_frames = 0;
+      }
+    } else if (arg_match(&arg, &dropframe_thresh, argi)) {
       config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &resize_allowed, argi))
+    } else if (arg_match(&arg, &resize_allowed, argi)) {
       config->cfg.rc_resize_allowed = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &resize_up_thresh, argi))
+    } else if (arg_match(&arg, &resize_up_thresh, argi)) {
       config->cfg.rc_resize_up_thresh = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &resize_down_thresh, argi))
+    } else if (arg_match(&arg, &resize_down_thresh, argi)) {
       config->cfg.rc_resize_down_thresh = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &end_usage, argi))
+    } else if (arg_match(&arg, &end_usage, argi)) {
       config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
-    else if (arg_match(&arg, &target_bitrate, argi))
+    } else if (arg_match(&arg, &target_bitrate, argi)) {
       config->cfg.rc_target_bitrate = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &min_quantizer, argi))
+    } else if (arg_match(&arg, &min_quantizer, argi)) {
       config->cfg.rc_min_quantizer = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &max_quantizer, argi))
+    } else if (arg_match(&arg, &max_quantizer, argi)) {
       config->cfg.rc_max_quantizer = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &undershoot_pct, argi))
+    } else if (arg_match(&arg, &undershoot_pct, argi)) {
       config->cfg.rc_undershoot_pct = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &overshoot_pct, argi))
+    } else if (arg_match(&arg, &overshoot_pct, argi)) {
       config->cfg.rc_overshoot_pct = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &buf_sz, argi))
+    } else if (arg_match(&arg, &buf_sz, argi)) {
       config->cfg.rc_buf_sz = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &buf_initial_sz, argi))
+    } else if (arg_match(&arg, &buf_initial_sz, argi)) {
       config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &buf_optimal_sz, argi))
+    } else if (arg_match(&arg, &buf_optimal_sz, argi)) {
       config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &bias_pct, argi)) {
-      config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
-
+    } else if (arg_match(&arg, &bias_pct, argi)) {
+        config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
       if (global->passes < 2)
         warn("option %s ignored in one-pass mode.\n", arg.name);
     } else if (arg_match(&arg, &minsection_pct, argi)) {
@@ -939,16 +953,15 @@
 
       if (global->passes < 2)
         warn("option %s ignored in one-pass mode.\n", arg.name);
-    } else if (arg_match(&arg, &kf_min_dist, argi))
+    } else if (arg_match(&arg, &kf_min_dist, argi)) {
       config->cfg.kf_min_dist = arg_parse_uint(&arg);
-    else if (arg_match(&arg, &kf_max_dist, argi)) {
+    } else if (arg_match(&arg, &kf_max_dist, argi)) {
       config->cfg.kf_max_dist = arg_parse_uint(&arg);
       config->have_kf_max_dist = 1;
-    } else if (arg_match(&arg, &kf_disabled, argi))
+    } else if (arg_match(&arg, &kf_disabled, argi)) {
       config->cfg.kf_mode = VPX_KF_DISABLED;
-    else {
+    } else {
       int i, match = 0;
-
       for (i = 0; ctrl_args[i]; i++) {
         if (arg_match(&arg, ctrl_args[i], argi)) {
           int j;
@@ -972,12 +985,10 @@
 
         }
       }
-
       if (!match)
         argj++;
     }
   }
-
   return eos_mark_found;
 }