Merge "vpxenc: Warn on lagged encoding with real time."
diff --git a/ivfdec.c b/ivfdec.c
index a37a44c..c7f4a89 100644
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -13,6 +13,25 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+static void fix_framerate(int *num, int *den) {
+  // Fix up, in place, the framerate *num / *den read from an IVF header.
+  // Some versions of vpxenc used 1/(2*fps) for the timebase, so the
+  // framerate can be guessed from the timebase alone in that case. Other
+  // files would need readahead to guess the timebase, like we do for webm.
+  if (*num < 1000) {
+    // Undo the encoder's factor of 2; an odd num is kept and den doubled.
+    if (*num & 1)
+      *den *= 2;
+    else
+      *num /= 2;
+  } else {
+    // num >= 1000: the FPS is not known for sure and there is no readahead
+    // code (yet?), so just default to 30fps.
+    *num = 30;
+    *den = 1;
+  }
+}
+
 int file_is_ivf(struct VpxInputContext *input_ctx) {
   char raw_hdr[32];
   int is_ivf = 0;
@@ -32,27 +51,8 @@
       input_ctx->height = mem_get_le16(raw_hdr + 14);
       input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16);
       input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20);
-
-      /* Some versions of vpxenc used 1/(2*fps) for the timebase, so
-       * we can guess the framerate using only the timebase in this
-       * case. Other files would require reading ahead to guess the
-       * timebase, like we do for webm.
-       */
-      if (input_ctx->framerate.numerator < 1000) {
-        /* Correct for the factor of 2 applied to the timebase in the
-         * encoder.
-         */
-        if (input_ctx->framerate.numerator & 1)
-          input_ctx->framerate.denominator <<= 1;
-        else
-          input_ctx->framerate.numerator >>= 1;
-      } else {
-        /* Don't know FPS for sure, and don't have readahead code
-         * (yet?), so just default to 30fps.
-         */
-        input_ctx->framerate.numerator = 30;
-        input_ctx->framerate.denominator = 1;
-      }
+      fix_framerate(&input_ctx->framerate.numerator,
+                    &input_ctx->framerate.denominator);
     }
   }
 
@@ -65,16 +65,10 @@
   return is_ivf;
 }
 
-int ivf_read_frame(struct VpxInputContext *input_ctx,
-                   uint8_t **buffer,
-                   size_t *bytes_read,
-                   size_t *buffer_size) {
+int ivf_read_frame(FILE *infile, uint8_t **buffer,
+                   size_t *bytes_read, size_t *buffer_size) {
   char raw_header[IVF_FRAME_HDR_SZ] = {0};
   size_t frame_size = 0;
-  FILE *infile = input_ctx->file;
-
-  if (input_ctx->file_type != FILE_TYPE_IVF)
-    return 0;
 
   if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
     if (!feof(infile))
diff --git a/ivfdec.h b/ivfdec.h
index 5da9acc..dd29cc6 100644
--- a/ivfdec.h
+++ b/ivfdec.h
@@ -18,10 +18,8 @@
 
 int file_is_ivf(struct VpxInputContext *input);
 
-int ivf_read_frame(struct VpxInputContext *input,
-                   uint8_t **buffer,
-                   size_t *bytes_read,
-                   size_t *buffer_size);
+int ivf_read_frame(FILE *infile, uint8_t **buffer,
+                   size_t *bytes_read, size_t *buffer_size);
 
 #ifdef __cplusplus
 }  /* extern "C" */
diff --git a/ivfenc.c b/ivfenc.c
index fa92566..0041ff0 100644
--- a/ivfenc.c
+++ b/ivfenc.c
@@ -20,9 +20,6 @@
                            int frame_cnt) {
   char header[32];
 
-  if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
-    return;
-
   header[0] = 'D';
   header[1] = 'K';
   header[2] = 'I';
@@ -44,9 +41,6 @@
   char header[12];
   vpx_codec_pts_t pts;
 
-  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
-    return;
-
   pts = pkt->data.frame.pts;
   mem_put_le32(header, (int)pkt->data.frame.sz);
   mem_put_le32(header + 4, pts & 0xFFFFFFFF);
diff --git a/tools_common.h b/tools_common.h
index 7500523..1d70ab5 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -63,10 +63,8 @@
 
 #define RAW_FRAME_HDR_SZ sizeof(uint32_t)
 
-#define VP8_FOURCC (0x30385056)
-#define VP9_FOURCC (0x30395056)
-#define VP8_FOURCC_MASK (0x00385056)
-#define VP9_FOURCC_MASK (0x00395056)
+#define VP8_FOURCC 0x30385056
+#define VP9_FOURCC 0x30395056
 
 enum VideoFileType {
   FILE_TYPE_RAW,
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index b457604..45d7984 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -43,8 +43,8 @@
 
 
   typedef enum {
-    USAGE_STREAM_FROM_SERVER    = 0x0,
-    USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
+    USAGE_LOCAL_FILE_PLAYBACK   = 0x0,
+    USAGE_STREAM_FROM_SERVER    = 0x1,
     USAGE_CONSTRAINED_QUALITY   = 0x2,
     USAGE_CONSTANT_QUALITY      = 0x3,
   } END_USAGE;
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 212a28a..a172ba6 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -13,13 +13,16 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
-#include "vpx_scale/yv12config.h"
-#include "vp9/common/vp9_postproc.h"
-#include "vp9/common/vp9_textblit.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_textblit.h"
 
 #define RGB_TO_YUV(t)                                            \
   ( (0.257*(float)(t >> 16))  + (0.504*(float)(t >> 8 & 0xff)) + \
@@ -127,9 +130,6 @@
   0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
 };
 
-
-/****************************************************************************
- */
 void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
                                      uint8_t *dst_ptr,
                                      int src_pixels_per_line,
@@ -371,7 +371,7 @@
   }
 }
 
-double vp9_gaussian(double sigma, double mu, double x) {
+static double gaussian(double sigma, double mu, double x) {
   return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
          (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
 }
@@ -396,7 +396,7 @@
     next = 0;
 
     for (i = -32; i < 32; i++) {
-      int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i));
+      int a = (int)(0.5 + 256 * gaussian(sigma, 0, i));
 
       if (a) {
         for (j = 0; j < a; j++) {
@@ -425,27 +425,6 @@
   state->last_noise = a;
 }
 
-/****************************************************************************
- *
- *  ROUTINE       : plane_add_noise_c
- *
- *  INPUTS        : unsigned char *Start  starting address of buffer to
- *                                        add gaussian noise to
- *                  unsigned int width    width of plane
- *                  unsigned int height   height of plane
- *                  int  pitch    distance between subsequent lines of frame
- *                  int  q        quantizer used to determine amount of noise
- *                                  to add
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void.
- *
- *  FUNCTION      : adds gaussian noise to a plane of pixels
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
 void vp9_plane_add_noise_c(uint8_t *start, char *noise,
                            char blackclamp[16],
                            char whiteclamp[16],
@@ -628,49 +607,40 @@
 
 int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
-  int q = cm->lf.filter_level * 10 / 6;
-  int flags = ppflags->post_proc_flag;
-  int deblock_level = ppflags->deblocking_level;
-  int noise_level = ppflags->noise_level;
+  const int q = MIN(63, cm->lf.filter_level * 10 / 6);
+  const int flags = ppflags->post_proc_flag;
+  YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
+  struct postproc_state *const ppstate = &cm->postproc_state;
 
   if (!cm->frame_to_show)
     return -1;
 
-  if (q > 63)
-    q = 63;
-
   if (!flags) {
     *dest = *cm->frame_to_show;
     return 0;
   }
 
-#if ARCH_X86||ARCH_X86_64
-  vpx_reset_mmx_state();
-#endif
+  vp9_clear_system_state();
 
   if (flags & VP9D_DEMACROBLOCK) {
-    deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer,
-                               q + (deblock_level - 5) * 10, 1, 0);
+    deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
+                               q + (ppflags->deblocking_level - 5) * 10, 1, 0);
   } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q);
+    vp9_deblock(cm->frame_to_show, ppbuf, q);
   } else {
-    vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer);
+    vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
   }
 
   if (flags & VP9D_ADDNOISE) {
-    if (cm->postproc_state.last_q != q
-        || cm->postproc_state.last_noise != noise_level) {
-      fillrd(&cm->postproc_state, 63 - q, noise_level);
+    const int noise_level = ppflags->noise_level;
+    if (ppstate->last_q != q ||
+        ppstate->last_noise != noise_level) {
+      fillrd(ppstate, 63 - q, noise_level);
     }
 
-    vp9_plane_add_noise(cm->post_proc_buffer.y_buffer,
-                        cm->postproc_state.noise,
-                        cm->postproc_state.blackclamp,
-                        cm->postproc_state.whiteclamp,
-                        cm->postproc_state.bothclamp,
-                        cm->post_proc_buffer.y_width,
-                        cm->post_proc_buffer.y_height,
-                        cm->post_proc_buffer.y_stride);
+    vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+                        ppstate->whiteclamp, ppstate->bothclamp,
+                        ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
   }
 
 #if 0 && CONFIG_POSTPROC_VISUALIZER
@@ -684,16 +654,14 @@
              cm->filter_level,
              flags,
              cm->mb_cols, cm->mb_rows);
-    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
-                  cm->post_proc_buffer.y_stride);
+    vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride);
   }
 
   if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
     int i, j;
     uint8_t *y_ptr;
-    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
-    int mb_rows = post->y_height >> 4;
-    int mb_cols = post->y_width  >> 4;
+    int mb_rows = ppbuf->y_height >> 4;
+    int mb_cols = ppbuf->y_width  >> 4;
     int mb_index = 0;
     MODE_INFO *mi = cm->mi;
 
@@ -719,9 +687,8 @@
   if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
     int i, j;
     uint8_t *y_ptr;
-    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
-    int mb_rows = post->y_height >> 4;
-    int mb_cols = post->y_width  >> 4;
+    int mb_rows = ppbuf->y_height >> 4;
+    int mb_cols = ppbuf->y_width  >> 4;
     int mb_index = 0;
     MODE_INFO *mi = cm->mi;
 
@@ -755,17 +722,15 @@
     snprintf(message, sizeof(message),
              "Bitrate: %10.2f framerate: %10.2f ",
              cm->bitrate, cm->framerate);
-    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
-                  cm->post_proc_buffer.y_stride);
+    vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride);
   }
 
   /* Draw motion vectors */
   if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
-    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    uint8_t *y_buffer = cm->post_proc_buffer.y_buffer;
-    int y_stride = cm->post_proc_buffer.y_stride;
+    int width  = ppbuf->y_width;
+    int height = ppbuf->y_height;
+    uint8_t *y_buffer = ppbuf->y_buffer;
+    int y_stride = ppbuf->y_stride;
     MODE_INFO *mi = cm->mi;
     int x0, y0;
 
@@ -904,13 +869,12 @@
   if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
       && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
     int y, x;
-    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
-    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
-    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
-    int y_stride = cm->post_proc_buffer.y_stride;
+    int width  = ppbuf->y_width;
+    int height = ppbuf->y_height;
+    uint8_t *y_ptr = ppbuf->y_buffer;
+    uint8_t *u_ptr = ppbuf->u_buffer;
+    uint8_t *v_ptr = ppbuf->v_buffer;
+    int y_stride = ppbuf->y_stride;
     MODE_INFO *mi = cm->mi;
 
     for (y = 0; y < height; y += 16) {
@@ -969,13 +933,12 @@
   if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
       ppflags->display_ref_frame_flag) {
     int y, x;
-    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
-    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
-    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
-    int y_stride = cm->post_proc_buffer.y_stride;
+    int width  = ppbuf->y_width;
+    int height = ppbuf->y_height;
+    uint8_t *y_ptr = ppbuf->y_buffer;
+    uint8_t *u_ptr = ppbuf->u_buffer;
+    uint8_t *v_ptr = ppbuf->v_buffer;
+    int y_stride = ppbuf->y_stride;
     MODE_INFO *mi = cm->mi;
 
     for (y = 0; y < height; y += 16) {
@@ -1002,7 +965,7 @@
   }
 #endif
 
-  *dest = cm->post_proc_buffer;
+  *dest = *ppbuf;
 
   /* handle problem with extending borders */
   dest->y_width = cm->width;
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index c63beae..b8a456f 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -13,6 +13,7 @@
 #define VP9_COMMON_VP9_POSTPROC_H_
 
 #include "vpx_ports/mem.h"
+#include "vp9/common/vp9_ppflags.h"
 
 struct postproc_state {
   int last_q;
@@ -23,8 +24,7 @@
   DECLARE_ALIGNED(16, char, bothclamp[16]);
 };
 
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_ppflags.h"
+struct VP9Common;
 
 int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h
index 6148206..7a790c5 100644
--- a/vp9/common/vp9_prob.h
+++ b/vp9/common/vp9_prob.h
@@ -39,17 +39,12 @@
 
 typedef const vp9_tree_index vp9_tree[];
 
-/* Convert array of token occurrence counts into a table of probabilities
-   for the associated binary encoding tree.  Also writes count of branches
-   taken for each node on the tree; this facilitiates decisions as to
-   probability updates. */
-
 static INLINE vp9_prob clip_prob(int p) {
   return (p > 255) ? 255u : (p < 1) ? 1u : p;
 }
 
 // int64 is not needed for normal frame level calculations.
-// However when outputing entropy stats accumulated over many frames
+// However when outputting entropy stats accumulated over many frames
 // or even clips we can overflow int math.
 #ifdef ENTROPY_STATS
 static INLINE vp9_prob get_prob(int num, int den) {
@@ -65,7 +60,7 @@
   return get_prob(n0, n0 + n1);
 }
 
-/* this function assumes prob1 and prob2 are already within [1,255] range */
+/* This function assumes prob1 and prob2 are already within [1,255] range. */
 static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
   return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
 }
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f954236..f4f7582 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -23,19 +23,20 @@
   const short *filter
 );
 
-#if HAVE_SSSE3
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#if (HAVE_SSSE3)
 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
 
 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
@@ -45,6 +46,90 @@
   /* Ensure the filter can be compressed to int16_t. */
   if (x_step_q4 == 16 && filter_x[3] != 128) {
     while (w >= 16) {
+      vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride,
+                                    dst, dst_stride,
+                                    h, filter_x);
+      src += 16;
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {
+      vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_x);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_x);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {
+    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4,
+                          w, h);
+  }
+}
+
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  if (y_step_q4 == 16 && filter_y[3] != 128) {  // presumably unscaled, non-copy
+    while (w >= 16) {  // 16-column strips first
+      vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride,
+                                    dst, dst_stride,
+                                    h, filter_y);
+      src += 16;
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {  // then 8-column strips
+      vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {  // then 4-column strips
+      vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {  // leftover columns, or every column if the SIMD path was skipped
+    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                         filter_x, x_step_q4, filter_y, y_step_q4,
+                         w, h);
+  }
+}
+
+#else
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  /* Ensure the filter can be compressed to int16_t. */
+  if (x_step_q4 == 16 && filter_x[3] != 128) {
+    while (w >= 16) {
       vp9_filter_block1d16_h8_ssse3(src, src_stride,
                                     dst, dst_stride,
                                     h, filter_x);
@@ -113,6 +198,7 @@
                          w, h);
   }
 }
+#endif
 
 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000..303fced
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,591 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+
+// byte-shuffle masks (for _mm_shuffle_epi8) only for the 4_h8 convolution
+DECLARE_ALIGNED(16, const unsigned char,
+filt1_4_h8[16])= {0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6};
+
+DECLARE_ALIGNED(16, const unsigned char,
+filt2_4_h8[16])= {4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10};
+
+// byte-shuffle masks (for _mm_shuffle_epi8) for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, const unsigned char,
+filt1_global[16])= {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
+
+DECLARE_ALIGNED(16, const unsigned char,
+filt2_global[16])= {2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
+
+DECLARE_ALIGNED(16, const unsigned char,
+filt3_global[16])= {4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
+
+DECLARE_ALIGNED(16, const unsigned char,
+filt4_global[16])= {6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
+
+
+
+void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned char *output_ptr,
+                                         unsigned int output_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+    __m128i addFilterReg64, filtersReg, srcReg, minReg;
+    unsigned int i;
+
+    // create a register with 64 in every 16-bit lane (rounding term for >> 7)
+    addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+    filtersReg = _mm_loadu_si128((__m128i *)filter);
+    // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+    // both 64-bit halves of the register then hold the same 8 taps.
+    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+    // duplicate only the first 16 bits in the filter into the first lane
+    firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+    // duplicate only the third 16 bits in the filter into the first lane
+    secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+    // duplicate only the second 16 bits in the filter into the second lane
+    firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+    // duplicate only the fourth 16 bits in the filter into the second lane
+    secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+    // load the byte-shuffle masks (held in thirdFilters / forthFilters)
+    thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
+    forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+    for (i = 0; i < output_height; i++) {
+        srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+        // gather the adjacent source byte pairs selected by the masks
+        srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
+        srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
+
+        // multiply 2 adjacent elements with the filter and add the result
+        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+        // extract the upper 64 bits of each register
+        srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
+        srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
+
+        minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+        // add and saturate all the results together
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+        srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+        // arithmetic shift right by 7 bits in each 16-bit lane
+        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+        // pack each 16-bit lane to 8 bits with unsigned saturation
+        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+        src_ptr+=src_pixels_per_line;
+
+        // save only 4 bytes
+        *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+        output_ptr+=output_pitch;
+    }
+}
+
+
+void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned char *output_ptr,
+                                         unsigned int output_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+    __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+    __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+    __m128i addFilterReg64, filtersReg, minReg;
+    unsigned int i;
+
+    // create a register with 64 in every 16-bit lane (rounding term for >> 7)
+    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+    filtersReg = _mm_loadu_si128((__m128i *)filter);
+    // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+    // both 64-bit halves of the register then hold the same 8 taps.
+    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+    // duplicate only the first 16 bits (first and second byte)
+    // across 128 bit register
+    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+    // duplicate only the second 16 bits (third and fourth byte)
+    // across 128 bit register
+    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+    // duplicate only the third 16 bits (fifth and sixth byte)
+    // across 128 bit register
+    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+    // duplicate only the fourth 16 bits (seventh and eighth byte)
+    // across 128 bit register
+    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+    filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+    filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+    filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+    filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+    for (i = 0; i < output_height; i++) {
+        srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+        // gather the adjacent source byte pairs selected by the masks
+        srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
+        srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+
+        // multiply 2 adjacent elements with the filter and add the result
+        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+        // gather the adjacent source byte pairs selected by the masks
+        srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
+        srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+
+        // multiply 2 adjacent elements with the filter and add the result
+        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+        srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+        // add and saturate all the results together
+        minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+
+        srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
+
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+        // arithmetic shift right by 7 bits in each 16-bit lane
+        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+        // pack each 16-bit lane to 8 bits with unsigned saturation
+        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+        src_ptr+=src_pixels_per_line;
+
+        // save only 8 bytes
+        _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+        output_ptr+=output_pitch;
+    }
+}
+
+void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
+                                          unsigned int src_pixels_per_line,
+                                          unsigned char *output_ptr,
+                                          unsigned int output_pitch,
+                                          unsigned int output_height,
+                                          int16_t *filter) {
+    __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
+    __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+    __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
+    unsigned int i;
+
+    // create a register with 64 in every 16-bit lane (rounding term for >> 7)
+    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+    filtersReg = _mm_loadu_si128((__m128i *)filter);
+    // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+    // both 64-bit halves of the register then hold the same 8 taps.
+    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+    // duplicate only the first 16 bits (first and second byte)
+    // across 128 bit register
+    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+    // duplicate only the second 16 bits (third and fourth byte)
+    // across 128 bit register
+    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+    // duplicate only the third 16 bits (fifth and sixth byte)
+    // across 128 bit register
+    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+    // duplicate only the fourth 16 bits (seventh and eighth byte)
+    // across 128 bit register
+    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+    filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+    filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+    filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+    filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+    for (i = 0; i < output_height; i++) {
+        srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+
+        // gather the adjacent source byte pairs selected by the masks
+        srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
+        srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
+
+        // multiply 2 adjacent elements with the filter and add the result
+        srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
+        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+        // add and saturate the results together
+        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+        // gather the adjacent source byte pairs selected by the masks
+        srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
+        srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
+
+        // multiply 2 adjacent elements with the filter and add the result
+        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+        // add and saturate the results together
+        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+        _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+        // read the 16 bytes starting 8 bytes after the previous load
+        // (the two loads overlap by 8 bytes)
+        srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+
+        // add and saturate the results together
+        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+        _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+        // gather the adjacent source byte pairs selected by the masks
+        srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
+        srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
+
+        // multiply 2 adjacent elements with the filter and add the result
+        srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
+        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+        // add and saturate the results together
+        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+        // gather the adjacent source byte pairs selected by the masks
+        srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
+        srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
+
+        // multiply 2 adjacent elements with the filter and add the result
+        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+        // add and saturate the results together
+        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+        _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+        _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+
+        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
+
+        // arithmetic shift right by 7 bits in each 16-bit lane
+        srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+        srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+        // pack each 16-bit lane to 8 bits: the first lane contains the
+        // first convolve result and the second lane contains the second
+        // convolve result
+        srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+        src_ptr+=src_pixels_per_line;
+
+        // save 16 bytes (aligned store: output_ptr must be 16-byte aligned)
+        _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+
+        output_ptr+=output_pitch;
+    }
+}
+
+
+// Vertical 8-tap filter over a 4-byte-wide column, one output row at a time.
+void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pitch,
+                                         unsigned char *output_ptr,
+                                         unsigned int out_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+    __m128i addFilterReg64, filtersReg, firstFilters, secondFilters;
+    __m128i minReg, srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+    unsigned int i;
+
+    // rounding term: 64 (half of 1 << 7) in every 16 bit element
+    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+    filtersReg = _mm_loadu_si128((__m128i *)filter);
+    // saturate the eight 16 bit taps to signed 8 bit; both 64 bit halves
+    // of the register then hold the same eight tap bytes.
+    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+    // low half: taps 0,1 repeated in every 16 bit element
+    firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+    // high half: taps 2,3 repeated in every 16 bit element
+    firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+    // low half: taps 4,5 repeated in every 16 bit element
+    secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+    // high half: taps 6,7 repeated in every 16 bit element
+    secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+    for (i = 0; i < output_height; i++) {
+        // load 4 bytes of source row 0
+        srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0]));
+        // load 4 bytes of source row 1 (src_pitch bytes further on)
+        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0]));
+
+        // interleave rows 0 and 1 byte by byte
+        srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+
+        // load 4 bytes each of source rows 2 and 3
+        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0]));
+        srcRegFilt3 =  _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0]));
+
+        // interleave rows 2 and 3 byte by byte
+        srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+
+        srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0]));
+        srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0]));
+
+        // interleave rows 4/5; put rows 0/1 in the low and 2/3 in the high half
+        srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+        srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2);
+
+        srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0]));
+        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0]));
+
+        // interleave rows 6/7; put rows 4/5 in the low and 6/7 in the high half
+        srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2);
+        srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4);
+
+        // multiply each unsigned byte pair by its signed tap pair and sum
+        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+
+        // move the taps 2,3 partial sums down into the low half
+        srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
+
+        // saturating adds: taps 0,1 + 6,7, then min and max of 2,3 and 4,5
+        minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+        _mm_srli_si128(srcRegFilt3, 8));
+        srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+        // arithmetic shift right by 7 in each 16 bit element
+        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+        // pack each 16 bit element to 8 bit with unsigned saturation
+        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+        src_ptr+=src_pitch;
+
+        // store only the 4 result bytes of this output row
+        *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+        output_ptr+=out_pitch;
+    }
+}
+// Vertical 8-tap filter over an 8-byte-wide column, one output row at a time.
+void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
+                                         unsigned int src_pitch,
+                                         unsigned char *output_ptr,
+                                         unsigned int out_pitch,
+                                         unsigned int output_height,
+                                         int16_t *filter) {
+    __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+    unsigned int i;
+
+    // rounding term: 64 (half of 1 << 7) in every 16 bit element
+    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+    filtersReg = _mm_loadu_si128((__m128i *)filter);
+    // saturate the eight 16 bit taps to signed 8 bit; both 64 bit halves
+    // of the register then hold the same eight tap bytes.
+    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+    // taps 0,1 repeated in every 16 bit element
+    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+    // taps 2,3 repeated in every 16 bit element
+    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+    // taps 4,5 repeated in every 16 bit element
+    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+    // taps 6,7 repeated in every 16 bit element
+    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+    for (i = 0; i < output_height; i++) {
+        // load 8 bytes of source row 0
+        srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+        // load 8 bytes each of rows 1, 2 and 3 (src_pitch bytes apart)
+        srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
+        srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
+        srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+
+        // interleave rows 0/1 and rows 2/3 byte by byte
+        srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+        srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+
+        // load 8 bytes each of rows 4, 5, 6 and 7
+        srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
+        srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
+        srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
+        srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+
+        // interleave rows 4/5 and rows 6/7 byte by byte
+        srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
+        srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+
+        // multiply each unsigned byte pair by its signed tap pair and sum
+        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+        srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+        // saturating adds: taps 0,1 + 6,7, then min and max of 2,3 and 4,5
+        minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+        srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+        // arithmetic shift right by 7 in each 16 bit element
+        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+        // pack each 16 bit element to 8 bit with unsigned saturation
+        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+        src_ptr+=src_pitch;
+
+        // store only the 8 result bytes of this output row
+        _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+        output_ptr+=out_pitch;
+    }
+}
+
+// Vertical 8-tap filter over a 16-byte-wide column, one output row at a time.
+void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
+                                          unsigned int src_pitch,
+                                          unsigned char *output_ptr,
+                                          unsigned int out_pitch,
+                                          unsigned int output_height,
+                                          int16_t *filter) {
+    __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+    __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+    unsigned int i;
+
+    // rounding term: 64 (half of 1 << 7) in every 16 bit element
+    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+    filtersReg = _mm_loadu_si128((__m128i *)filter);
+    // saturate the eight 16 bit taps to signed 8 bit; both 64 bit halves
+    // of the register then hold the same eight tap bytes.
+    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+    // taps 0,1 repeated in every 16 bit element
+    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+    // taps 2,3 repeated in every 16 bit element
+    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+    // taps 4,5 repeated in every 16 bit element
+    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+    // taps 6,7 repeated in every 16 bit element
+    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+    // each iteration reads 8 source rows and writes one output row
+    for (i = 0; i < output_height; i++) {
+        // load 16 bytes of source row 0
+        srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
+        // load 16 bytes each of rows 1, 6 and 7 (src_pitch bytes per row)
+        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
+        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
+        srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+
+        // interleave rows 0/1 and 6/7: low 8 bytes first, then high 8 bytes
+        srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
+        srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
+        srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
+        srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+
+        // multiply each unsigned byte pair by its signed tap pair and sum
+        srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
+        srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
+        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+
+
+        // saturating add of the taps 0,1 and taps 6,7 partial sums
+        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+        // load 16 bytes each of rows 2 and 3
+        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
+        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
+
+        // interleave rows 2/3: low 8 bytes and high 8 bytes
+        srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+        srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+        // apply taps 2,3 to both halves
+        srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+        srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
+
+        // load 16 bytes each of rows 4 and 5
+        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
+        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
+
+        // interleave rows 4/5: low 8 bytes and high 8 bytes
+        srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
+        srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+
+        // apply taps 4,5 to both halves
+        srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
+        srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
+
+
+        // saturating add of the smaller of the taps 2,3 / 4,5 partial sums
+        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+        _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+        _mm_min_epi16(srcRegFilt6, srcRegFilt8));
+
+
+        // then the larger of the two, then the rounding term
+        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+        _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+        _mm_max_epi16(srcRegFilt6, srcRegFilt8));
+        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
+        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+        // arithmetic shift right by 7 in each 16 bit element
+        srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
+        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+        // pack each 16 bit element to 8 bit with unsigned saturation: the
+        // low 8 bytes come from srcRegFilt5 (columns 0-7) and the high 8
+        // bytes from srcRegFilt1 (columns 8-15)
+        srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
+
+        src_ptr+=src_pitch;
+
+        // aligned store: output_ptr must be 16 byte aligned on every row
+        _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+
+        output_ptr+=out_pitch;
+    }
+}
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 69c569d..ec4dc14 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -39,11 +39,7 @@
 #endif
 
 #ifdef ENTROPY_STATS
-int intra_mode_stats[INTRA_MODES]
-                    [INTRA_MODES]
-                    [INTRA_MODES];
 vp9_coeff_stats tree_update_hist[TX_SIZES][PLANE_TYPES];
-
 extern unsigned int active_section;
 #endif
 
@@ -414,9 +410,6 @@
         const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i);
         const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, i);
         const int bm = m->bmi[i].as_mode;
-#ifdef ENTROPY_STATS
-        ++intra_mode_stats[A][L][bm];
-#endif
         write_intra_mode(bc, bm, vp9_kf_y_mode_prob[A][L]);
       }
     }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 4c66c20..b0fae65 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2543,6 +2543,7 @@
   }
 
   if (!is_inter_block(mbmi)) {
+    mbmi->skip_coeff = 1;
     vp9_encode_intra_block_y(x, MAX(bsize, BLOCK_8X8));
     vp9_encode_intra_block_uv(x, MAX(bsize, BLOCK_8X8));
     if (output_enabled)
@@ -2561,6 +2562,7 @@
   if (!is_inter_block(mbmi)) {
     vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   } else if (!x->skip) {
+    mbmi->skip_coeff = 1;
     vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
     vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   } else {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 21bc588..4bef675 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -451,6 +451,9 @@
     ctx->tl[plane][j] = p->eobs[block] > 0;
   }
 
+  if (p->eobs[block])
+    *(args->skip_coeff) = 0;
+
   if (x->skip_encode || p->eobs[block] == 0)
     return;
 
@@ -474,7 +477,6 @@
       assert(0 && "Invalid transform size");
   }
 }
-
 static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size, void *arg) {
   struct encode_b_args *const args = arg;
@@ -499,7 +501,8 @@
 void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
 
   vp9_subtract_sby(x, bsize);
   if (x->optimize)
@@ -511,7 +514,8 @@
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
 
   if (!x->skip_recode)
     vp9_subtract_sb(x, bsize);
@@ -655,12 +659,15 @@
     default:
       assert(0);
   }
+  if (*eob)
+    *(args->skip_coeff) = 0;
 }
 
 void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
 
   foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra,
                                      &arg);
@@ -668,7 +675,8 @@
 void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
   foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg);
 }
 
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index cb872a7..207d573 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -24,6 +24,7 @@
 struct encode_b_args {
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
+  unsigned char *skip_coeff;  // set to 0 once a block has a nonzero eob
 };
 
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 9dfb442..538599d 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -49,9 +49,6 @@
 
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
-#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
-#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
-
 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
   YV12_BUFFER_CONFIG temp = *a;
   *a = *b;
@@ -269,20 +266,15 @@
 // harder frames.
 static double calculate_modified_err(VP9_COMP *cpi,
                                      FIRSTPASS_STATS *this_frame) {
-  const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
+  struct twopass_rc *const twopass = &cpi->twopass;
+  const FIRSTPASS_STATS *const stats = &twopass->total_stats;
   const double av_err = stats->ssim_weighted_pred_err / stats->count;
-  const double this_err = this_frame->ssim_weighted_pred_err;
-  double modified_error;
+  double modified_error = av_err * pow(this_frame->ssim_weighted_pred_err /
+                                           DOUBLE_DIVIDE_CHECK(av_err),
+                                       cpi->oxcf.two_pass_vbrbias / 100.0);
 
-  modified_error =  av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err),
-                                 this_err > av_err ? POW1 : POW2);
-
-  if (modified_error < cpi->twopass.modified_error_min)
-    modified_error = cpi->twopass.modified_error_min;
-  else if (modified_error > cpi->twopass.modified_error_max)
-    modified_error = cpi->twopass.modified_error_max;
-
-  return modified_error;
+  return fclamp(modified_error,
+                twopass->modified_error_min, twopass->modified_error_max);
 }
 
 static const double weight_table[256] = {
@@ -353,13 +345,14 @@
 // This function returns the maximum target rate per frame.
 static int frame_max_bits(VP9_COMP *cpi) {
   int64_t max_bits =
-     ((int64_t)cpi->rc.av_per_frame_bandwidth *
-      (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+    ((int64_t)cpi->rc.av_per_frame_bandwidth *
+     (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
 
   if (max_bits < 0)
-    return 0;
-  if (max_bits >= INT_MAX)
-    return INT_MAX;
+    max_bits = 0;
+  else if (max_bits > cpi->rc.max_frame_bandwidth)
+    max_bits = cpi->rc.max_frame_bandwidth;
+
   return (int)max_bits;
 }
 
@@ -953,13 +946,13 @@
   int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
+  RATE_CONTROL *const rc = &cpi->rc;
 
-  double section_err = fpstats->coded_error / fpstats->count;
-  double err_per_mb = section_err / num_mbs;
-  double err_correction_factor;
+  const double section_err = fpstats->coded_error / fpstats->count;
+  const double err_per_mb = section_err / num_mbs;
 
   if (section_target_bandwitdh <= 0)
-    return cpi->rc.worst_quality;          // Highest value allowed
+    return rc->worst_quality;          // Highest value allowed
 
   target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
                               ? (512 * section_target_bandwitdh) / num_mbs
@@ -967,15 +960,11 @@
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
-  for (q = cpi->rc.best_quality; q < cpi->rc.worst_quality; q++) {
-    int bits_per_mb_at_this_q;
-
-    err_correction_factor = calc_correction_factor(err_per_mb,
-                                                   ERR_DIVISOR, 0.5, 0.90, q);
-
-    bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
-                                               err_correction_factor);
-
+  for (q = rc->best_quality; q < rc->worst_quality; q++) {
+    const double err_correction_factor = calc_correction_factor(err_per_mb,
+                                             ERR_DIVISOR, 0.5, 0.90, q);
+    const int bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
+                                                         err_correction_factor);
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
@@ -1169,8 +1158,7 @@
       if (EOF == input_stats(&cpi->twopass, &tmp_next_frame))
         break;
 
-      zz_inter =
-        (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
+      zz_inter = (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
       if (zz_inter < 0.999)
         break;
     }
@@ -1540,6 +1528,7 @@
   int b_boost = 0;
   int flash_detected;
   int active_max_gf_interval;
+  RATE_CONTROL *const rc = &cpi->rc;
 
   cpi->twopass.gf_group_bits = 0;
 
@@ -1556,7 +1545,7 @@
 
   // If this is a key frame or the overlay from a previous arf then
   // The error score / cost of this frame has already been accounted for.
-  if (cpi->common.frame_type == KEY_FRAME || cpi->rc.source_alt_ref_active)
+  if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
     gf_group_err -= gf_first_frame_err;
 
   // Motion breakout threshold for loop below depends on image size.
@@ -1570,14 +1559,14 @@
   // interval to spread the cost of the GF.
   //
   active_max_gf_interval =
-    12 + ((int)vp9_convert_qindex_to_q(cpi->rc.last_q[INTER_FRAME]) >> 5);
+    12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME]) >> 5);
 
-  if (active_max_gf_interval > cpi->rc.max_gf_interval)
-    active_max_gf_interval = cpi->rc.max_gf_interval;
+  if (active_max_gf_interval > rc->max_gf_interval)
+    active_max_gf_interval = rc->max_gf_interval;
 
   i = 0;
   while ((i < cpi->twopass.static_scene_max_gf_interval) &&
-         (i < cpi->rc.frames_to_key)) {
+         (i < rc->frames_to_key)) {
     i++;    // Increment the loop counter
 
     // Accumulate error score of frames in this gf group
@@ -1620,8 +1609,7 @@
     }
 
     // Calculate a boost number for this frame
-    boost_score +=
-      (decay_accumulator *
+    boost_score += (decay_accumulator *
        calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
 
     // Break out conditions.
@@ -1649,14 +1637,14 @@
   cpi->twopass.gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
 
   // Don't allow a gf too near the next kf
-  if ((cpi->rc.frames_to_key - i) < MIN_GF_INTERVAL) {
-    while (i < (cpi->rc.frames_to_key + !cpi->rc.next_key_frame_forced)) {
+  if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
+    while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
       i++;
 
       if (EOF == input_stats(&cpi->twopass, this_frame))
         break;
 
-      if (i < cpi->rc.frames_to_key) {
+      if (i < rc->frames_to_key) {
         mod_frame_err = calculate_modified_err(cpi, this_frame);
         gf_group_err += mod_frame_err;
       }
@@ -1676,18 +1664,18 @@
 #endif
 
   // Set the interval until the next gf.
-  if (cpi->common.frame_type == KEY_FRAME || cpi->rc.source_alt_ref_active)
-    cpi->rc.baseline_gf_interval = i - 1;
+  if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
+    rc->baseline_gf_interval = i - 1;
   else
-    cpi->rc.baseline_gf_interval = i;
+    rc->baseline_gf_interval = i;
 
   // Should we use the alternate reference frame
   if (allow_alt_ref &&
       (i < cpi->oxcf.lag_in_frames) &&
       (i >= MIN_GF_INTERVAL) &&
       // for real scene cuts (not forced kfs) dont allow arf very near kf.
-      (cpi->rc.next_key_frame_forced ||
-        (i <= (cpi->rc.frames_to_key - MIN_GF_INTERVAL))) &&
+      (rc->next_key_frame_forced ||
+        (i <= (rc->frames_to_key - MIN_GF_INTERVAL))) &&
       ((next_frame.pcnt_inter > 0.75) ||
        (next_frame.pcnt_second_ref > 0.5)) &&
       ((mv_in_out_accumulator / (double)i > -0.2) ||
@@ -1695,25 +1683,25 @@
       (boost_score > 100)) {
 
     // Alternative boost calculation for alt ref
-    cpi->rc.gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
-                                    &b_boost);
-    cpi->rc.source_alt_ref_pending = 1;
+    rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+                                   &b_boost);
+    rc->source_alt_ref_pending = 1;
 
 #if CONFIG_MULTIPLE_ARF
     // Set the ARF schedule.
     if (cpi->multi_arf_enabled) {
-      schedule_frames(cpi, 0, -(cpi->rc.baseline_gf_interval - 1), 2, 1, 0);
+      schedule_frames(cpi, 0, -(rc->baseline_gf_interval - 1), 2, 1, 0);
     }
 #endif
   } else {
-    cpi->rc.gfu_boost = (int)boost_score;
-    cpi->rc.source_alt_ref_pending = 0;
+    rc->gfu_boost = (int)boost_score;
+    rc->source_alt_ref_pending = 0;
 #if CONFIG_MULTIPLE_ARF
     // Set the GF schedule.
     if (cpi->multi_arf_enabled) {
-      schedule_frames(cpi, 0, cpi->rc.baseline_gf_interval - 1, 2, 0, 0);
+      schedule_frames(cpi, 0, rc->baseline_gf_interval - 1, 2, 0, 0);
       assert(cpi->new_frame_coding_order_period ==
-             cpi->rc.baseline_gf_interval);
+             rc->baseline_gf_interval);
     }
 #endif
   }
@@ -1773,32 +1761,28 @@
 
   // Clip cpi->twopass.gf_group_bits based on user supplied data rate
   // variability limit (cpi->oxcf.two_pass_vbrmax_section)
-  if (cpi->twopass.gf_group_bits >
-      (int64_t)max_bits * cpi->rc.baseline_gf_interval)
-    cpi->twopass.gf_group_bits =
-        (int64_t)max_bits * cpi->rc.baseline_gf_interval;
+  if (cpi->twopass.gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
+    cpi->twopass.gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
 
   // Reset the file position
   reset_fpf_position(&cpi->twopass, start_pos);
 
   // Assign  bits to the arf or gf.
-  for (i = 0;
-      i <= (cpi->rc.source_alt_ref_pending &&
-            cpi->common.frame_type != KEY_FRAME);
-      ++i) {
+  for (i = 0; i <= (rc->source_alt_ref_pending &&
+                    cpi->common.frame_type != KEY_FRAME); ++i) {
     int allocation_chunks;
-    int q = cpi->rc.last_q[INTER_FRAME];
+    int q = rc->last_q[INTER_FRAME];
     int gf_bits;
 
-    int boost = (cpi->rc.gfu_boost * gfboost_qadjust(q)) / 100;
+    int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100;
 
     // Set max and minimum boost and hence minimum allocation
-    boost = clamp(boost, 125, (cpi->rc.baseline_gf_interval + 1) * 200);
+    boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
 
-    if (cpi->rc.source_alt_ref_pending && i == 0)
-      allocation_chunks = ((cpi->rc.baseline_gf_interval + 1) * 100) + boost;
+    if (rc->source_alt_ref_pending && i == 0)
+      allocation_chunks = ((rc->baseline_gf_interval + 1) * 100) + boost;
     else
-      allocation_chunks = (cpi->rc.baseline_gf_interval * 100) + (boost - 100);
+      allocation_chunks = (rc->baseline_gf_interval * 100) + (boost - 100);
 
     // Prevent overflow
     if (boost > 1023) {
@@ -1815,11 +1799,10 @@
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
     // based on the error score of the frame itself
-    if (cpi->rc.baseline_gf_interval < 1 ||
-        mod_frame_err < gf_group_err / (double)cpi->rc.baseline_gf_interval) {
-      double alt_gf_grp_bits =
-        (double)cpi->twopass.kf_group_bits  *
-        (mod_frame_err * (double)cpi->rc.baseline_gf_interval) /
+    if (rc->baseline_gf_interval < 1 ||
+        mod_frame_err < gf_group_err / (double)rc->baseline_gf_interval) {
+      double alt_gf_grp_bits = (double)cpi->twopass.kf_group_bits  *
+        (mod_frame_err * (double)rc->baseline_gf_interval) /
         DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
 
       int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
@@ -1846,10 +1829,11 @@
     if (i == 0) {
       cpi->twopass.gf_bits = gf_bits;
     }
-    if (i == 1 || (!cpi->rc.source_alt_ref_pending
-        && (cpi->common.frame_type != KEY_FRAME))) {
+    if (i == 1 ||
+        (!rc->source_alt_ref_pending &&
+         (cpi->common.frame_type != KEY_FRAME))) {
       // Per frame bit target for this frame
-      cpi->rc.per_frame_bandwidth = gf_bits;
+      rc->per_frame_bandwidth = gf_bits;
     }
   }
 
@@ -1867,7 +1851,7 @@
     // the remaining bits amoung the other frames/
     // For normal GFs remove the score for the GF itself unless this is
     // also a key frame in which case it has already been accounted for.
-    if (cpi->rc.source_alt_ref_pending) {
+    if (rc->source_alt_ref_pending) {
       cpi->twopass.gf_group_error_left = (int64_t)gf_group_err - mod_frame_err;
     } else if (cpi->common.frame_type != KEY_FRAME) {
       cpi->twopass.gf_group_error_left = (int64_t)(gf_group_err
@@ -1884,9 +1868,8 @@
     // This condition could fail if there are two kfs very close together
     // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
     // calculation of alt_extra_bits.
-    if (cpi->rc.baseline_gf_interval >= 3) {
-      const int boost = cpi->rc.source_alt_ref_pending ?
-          b_boost : cpi->rc.gfu_boost;
+    if (rc->baseline_gf_interval >= 3) {
+      const int boost = rc->source_alt_ref_pending ? b_boost : rc->gfu_boost;
 
       if (boost >= 150) {
         int alt_extra_bits;
@@ -1905,7 +1888,7 @@
     zero_stats(&sectionstats);
     reset_fpf_position(&cpi->twopass, start_pos);
 
-    for (i = 0; i < cpi->rc.baseline_gf_interval; i++) {
+    for (i = 0; i < rc->baseline_gf_interval; i++) {
       input_stats(&cpi->twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
@@ -2028,6 +2011,7 @@
 
   FIRSTPASS_STATS this_frame;
   FIRSTPASS_STATS this_frame_copy;
+  RATE_CONTROL *rc = &cpi->rc;
 
   double this_frame_intra_error;
   double this_frame_coded_error;
@@ -2042,7 +2026,7 @@
   vp9_clear_system_state();
 
   if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
-    cpi->rc.active_worst_quality = cpi->oxcf.cq_level;
+    rc->active_worst_quality = cpi->oxcf.cq_level;
   } else if (cpi->common.current_video_frame == 0) {
     // Special case code for first frame.
     int section_target_bandwidth =
@@ -2051,9 +2035,9 @@
     tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
                            section_target_bandwidth);
 
-    cpi->rc.active_worst_quality = tmp_q;
-    cpi->rc.ni_av_qi = tmp_q;
-    cpi->rc.avg_q = vp9_convert_qindex_to_q(tmp_q);
+    rc->active_worst_quality = tmp_q;
+    rc->ni_av_qi = tmp_q;
+    rc->avg_q = vp9_convert_qindex_to_q(tmp_q);
 
     // Limit the maxq value returned subsequently.
     // This increases the risk of overspend or underspend if the initial
@@ -2070,7 +2054,7 @@
   this_frame_coded_error = this_frame.coded_error;
 
   // keyframe and section processing !
-  if (cpi->rc.frames_to_key == 0) {
+  if (rc->frames_to_key == 0) {
     // Define next KF group and assign bits to it
     this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
@@ -2079,7 +2063,7 @@
   }
 
   // Is this a GF / ARF (Note that a KF is always also a GF)
-  if (cpi->rc.frames_till_gf_update_due == 0) {
+  if (rc->frames_till_gf_update_due == 0) {
     // Define next gf group and assign bits to it
     this_frame_copy = this_frame;
 
@@ -2102,7 +2086,7 @@
         cpi->enable_encode_breakout = 2;
     }
 
-    cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     cpi->refresh_golden_frame = 1;
   } else {
     // Otherwise this is an ordinary frame
@@ -2123,8 +2107,8 @@
   }
 
   // Set nominal per second bandwidth for this frame
-  cpi->target_bandwidth = (int)(cpi->rc.per_frame_bandwidth
-                                * cpi->output_framerate);
+  cpi->target_bandwidth = (int)(rc->per_frame_bandwidth *
+                                   cpi->output_framerate);
   if (cpi->target_bandwidth < 0)
     cpi->target_bandwidth = 0;
 
@@ -2179,10 +2163,9 @@
 
       // Cumulative effect of decay in prediction quality
       if (local_next_frame.pcnt_inter > 0.85)
-        decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+        decay_accumulator *= local_next_frame.pcnt_inter;
       else
-        decay_accumulator =
-            decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
 
       // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
 
@@ -2241,6 +2224,8 @@
   double kf_group_coded_err = 0.0;
   double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
 
+  RATE_CONTROL *const rc = &cpi->rc;
+
   vp9_zero(next_frame);
 
   vp9_clear_system_state();  // __asm emms;
@@ -2249,15 +2234,15 @@
   cpi->common.frame_type = KEY_FRAME;
 
   // is this a forced key frame by interval
-  cpi->rc.this_key_frame_forced = cpi->rc.next_key_frame_forced;
+  rc->this_key_frame_forced = rc->next_key_frame_forced;
 
   // Clear the alt ref active flag as this can never be active on a key frame
-  cpi->rc.source_alt_ref_active = 0;
+  rc->source_alt_ref_active = 0;
 
   // Kf is always a gf so clear frames till next gf counter
-  cpi->rc.frames_till_gf_update_due = 0;
+  rc->frames_till_gf_update_due = 0;
 
-  cpi->rc.frames_to_key = 1;
+  rc->frames_to_key = 1;
 
   // Take a copy of the initial frame details
   first_frame = *this_frame;
@@ -2309,14 +2294,14 @@
         break;
 
       // Step on to the next frame
-      cpi->rc.frames_to_key++;
+      rc->frames_to_key++;
 
       // If we don't have a real key frame within the next two
       // forcekeyframeevery intervals then break out of the loop.
-      if (cpi->rc.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
+      if (rc->frames_to_key >= 2 * (int)cpi->key_frame_frequency)
         break;
     } else {
-      cpi->rc.frames_to_key++;
+      rc->frames_to_key++;
     }
     i++;
   }
@@ -2325,11 +2310,11 @@
   // We already breakout of the loop above at 2x max.
   // This code centers the extra kf if the actual natural
   // interval is between 1x and 2x
-  if (cpi->oxcf.auto_key
-      && cpi->rc.frames_to_key > (int)cpi->key_frame_frequency) {
+  if (cpi->oxcf.auto_key &&
+      rc->frames_to_key > (int)cpi->key_frame_frequency) {
     FIRSTPASS_STATS tmp_frame;
 
-    cpi->rc.frames_to_key /= 2;
+    rc->frames_to_key /= 2;
 
     // Copy first frame details
     tmp_frame = first_frame;
@@ -2342,7 +2327,7 @@
     kf_group_coded_err = 0;
 
     // Rescan to get the correct error data for the forced kf group
-    for (i = 0; i < cpi->rc.frames_to_key; i++) {
+    for (i = 0; i < rc->frames_to_key; i++) {
       // Accumulate kf group errors
       kf_group_err += calculate_modified_err(cpi, &tmp_frame);
       kf_group_intra_err += tmp_frame.intra_error;
@@ -2351,11 +2336,11 @@
       // Load a the next frame's stats
       input_stats(&cpi->twopass, &tmp_frame);
     }
-    cpi->rc.next_key_frame_forced = 1;
+    rc->next_key_frame_forced = 1;
   } else if (cpi->twopass.stats_in == cpi->twopass.stats_in_end) {
-    cpi->rc.next_key_frame_forced = 1;
+    rc->next_key_frame_forced = 1;
   } else {
-    cpi->rc.next_key_frame_forced = 0;
+    rc->next_key_frame_forced = 0;
   }
 
   // Special case for the last key frame of the file
@@ -2386,7 +2371,7 @@
                                             cpi->twopass.modified_error_left));
 
     // Clip based on maximum per frame rate defined by the user.
-    max_grp_bits = (int64_t)max_bits * (int64_t)cpi->rc.frames_to_key;
+    max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
     if (cpi->twopass.kf_group_bits > max_grp_bits)
       cpi->twopass.kf_group_bits = max_grp_bits;
   } else {
@@ -2402,7 +2387,7 @@
   loop_decay_rate = 1.00;       // Starting decay rate
 
   // Scan through the kf group collating various stats.
-  for (i = 0; i < cpi->rc.frames_to_key; i++) {
+  for (i = 0; i < rc->frames_to_key; i++) {
     double r;
 
     if (EOF == input_stats(&cpi->twopass, &next_frame))
@@ -2416,7 +2401,7 @@
     }
 
     // For the first few frames collect data to decide kf boost.
-    if (i <= (cpi->rc.max_gf_interval * 2)) {
+    if (i <= (rc->max_gf_interval * 2)) {
       if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
         r = (IIKFACTOR2 * next_frame.intra_error /
              DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
@@ -2445,16 +2430,15 @@
     zero_stats(&sectionstats);
     reset_fpf_position(&cpi->twopass, start_position);
 
-    for (i = 0; i < cpi->rc.frames_to_key; i++) {
+    for (i = 0; i < rc->frames_to_key; i++) {
       input_stats(&cpi->twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
 
     avg_stats(&sectionstats);
 
-    cpi->twopass.section_intra_rating = (int)
-      (sectionstats.intra_error
-      / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+    cpi->twopass.section_intra_rating = (int) (sectionstats.intra_error /
+        DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
   }
 
   // Reset the first pass file position
@@ -2466,15 +2450,15 @@
     int allocation_chunks;
     int alt_kf_bits;
 
-    if (kf_boost < (cpi->rc.frames_to_key * 3))
-      kf_boost = (cpi->rc.frames_to_key * 3);
+    if (kf_boost < (rc->frames_to_key * 3))
+      kf_boost = (rc->frames_to_key * 3);
 
     if (kf_boost < 300)  // Min KF boost
       kf_boost = 300;
 
     // Make a note of baseline boost and the zero motion
     // accumulator value for use elsewhere.
-    cpi->rc.kf_boost = kf_boost;
+    rc->kf_boost = kf_boost;
     cpi->twopass.kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
 
     // We do three calculations for kf size.
@@ -2491,10 +2475,10 @@
     // care of by kf_boost.
     if (zero_motion_accumulator >= 0.99) {
       allocation_chunks =
-        ((cpi->rc.frames_to_key - 1) * 10) + kf_boost;
+        ((rc->frames_to_key - 1) * 10) + kf_boost;
     } else {
       allocation_chunks =
-        ((cpi->rc.frames_to_key - 1) * 100) + kf_boost;
+        ((rc->frames_to_key - 1) * 100) + kf_boost;
     }
 
     // Prevent overflow
@@ -2504,22 +2488,21 @@
       allocation_chunks /= divisor;
     }
 
-    cpi->twopass.kf_group_bits =
-        (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+    cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0
+           : cpi->twopass.kf_group_bits;
 
     // Calculate the number of bits to be spent on the key frame
-    cpi->twopass.kf_bits =
-        (int)((double)kf_boost *
+    cpi->twopass.kf_bits = (int)((double)kf_boost *
               ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
 
     // If the key frame is actually easier than the average for the
     // kf group (which does sometimes happen... eg a blank intro frame)
     // Then use an alternate calculation based on the kf error score
     // which should give a smaller key frame.
-    if (kf_mod_err < kf_group_err / cpi->rc.frames_to_key) {
+    if (kf_mod_err < kf_group_err / rc->frames_to_key) {
       double  alt_kf_grp_bits =
         ((double)cpi->twopass.bits_left *
-         (kf_mod_err * (double)cpi->rc.frames_to_key) /
+         (kf_mod_err * (double)rc->frames_to_key) /
          DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
 
       alt_kf_bits = (int)((double)kf_boost *
@@ -2532,8 +2515,7 @@
     // Else if it is much harder than other frames in the group make sure
     // it at least receives an allocation in keeping with its relative
     // error score
-      alt_kf_bits =
-        (int)((double)cpi->twopass.bits_left *
+      alt_kf_bits = (int)((double)cpi->twopass.bits_left *
               (kf_mod_err /
                DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
 
@@ -2545,7 +2527,7 @@
     cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
 
     // Peer frame bit target for this frame
-    cpi->rc.per_frame_bandwidth = cpi->twopass.kf_bits;
+    rc->per_frame_bandwidth = cpi->twopass.kf_bits;
     // Convert to a per second bitrate
     cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
                                   cpi->output_framerate);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index abdcf2f..2c7c86e 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -59,6 +59,11 @@
 #define DISABLE_COMPOUND_SPLIT    0x18
 #define LAST_AND_INTRA_SPLIT_ONLY 0x1E
 
+// Max rate target for 1080P and below encodes under normal circumstances:
+// (1920 * 1080 / (16 * 16)) MBs * MAX_MB_RATE bits per MB = 2025000 bits.
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
 #if CONFIG_INTERNAL_STATS
 extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
                             YV12_BUFFER_CONFIG *dest, int lumamask,
@@ -88,12 +93,6 @@
 #endif
 
 
-#ifdef ENTROPY_STATS
-extern int intra_mode_stats[INTRA_MODES]
-                           [INTRA_MODES]
-                           [INTRA_MODES];
-#endif
-
 #ifdef MODE_STATS
 extern void init_tx_count_stats();
 extern void write_tx_count_stats();
@@ -1093,6 +1092,9 @@
 };
 
 void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
+  VP9_COMMON *const cm = &cpi->common;
+  int64_t vbr_max_bits;
+
   if (framerate < 0.1)
     framerate = 30;
 
@@ -1109,6 +1111,19 @@
   cpi->rc.min_frame_bandwidth = MAX(cpi->rc.min_frame_bandwidth,
                                     FRAME_OVERHEAD_BITS);
 
+  // A maximum bitrate for a frame is defined.
+  // The baseline for this aligns with HW implementations that
+  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+  // per 16x16 MB (averaged over a frame). However this limit is extended if
+  // a very high rate is given on the command line or the rate cannot
+  // be achieved because of a user specified max q (e.g. when the user
+  // specifies lossless encode).
+  //
+  vbr_max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth *
+                  (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+  cpi->rc.max_frame_bandwidth =
+    MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
   // Set Maximum gf/arf interval
   cpi->rc.max_gf_interval = 16;
 
@@ -1957,41 +1972,6 @@
     }
 #endif
 
-#ifdef ENTROPY_STATS
-    {
-      int i, j, k;
-      FILE *fmode = fopen("vp9_modecontext.c", "w");
-
-      fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n");
-      fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
-      fprintf(fmode, "[INTRA_MODES][INTRA_MODES]"
-                     "[INTRA_MODES] =\n{\n");
-
-      for (i = 0; i < INTRA_MODES; i++) {
-        fprintf(fmode, "    { // Above Mode :  %d\n", i);
-
-        for (j = 0; j < INTRA_MODES; j++) {
-          fprintf(fmode, "        {");
-
-          for (k = 0; k < INTRA_MODES; k++) {
-            if (!intra_mode_stats[i][j][k])
-              fprintf(fmode, " %5d, ", 1);
-            else
-              fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
-          }
-
-          fprintf(fmode, "}, // left_mode %d\n", j);
-        }
-
-        fprintf(fmode, "    },\n");
-      }
-
-      fprintf(fmode, "};\n");
-      fclose(fmode);
-    }
-#endif
-
-
 #if defined(SECTIONBITS_OUTPUT)
 
     if (0) {
@@ -2443,10 +2423,14 @@
   int force_recode = 0;
   VP9_COMMON *cm = &cpi->common;
 
-  // Is frame recode allowed at all
-  // Yes if either recode mode 1 is selected or mode two is selected
-  // and the frame is a key frame. golden frame or alt_ref_frame
-  if ((cpi->sf.recode_loop == 1) ||
+  // Special case trap if maximum allowed frame size exceeded.
+  if (cpi->rc.projected_frame_size > cpi->rc.max_frame_bandwidth) {
+    force_recode = 1;
+
+  // Is frame recode allowed.
+  // Yes if either recode mode 1 is selected or mode 2 is selected
+  // and the frame is a key frame, golden frame or alt_ref_frame
+  } else if ((cpi->sf.recode_loop == 1) ||
       ((cpi->sf.recode_loop == 2) &&
        ((cm->frame_type == KEY_FRAME) ||
         cpi->refresh_golden_frame ||
@@ -2624,7 +2608,8 @@
         "%6d %6d %5d %5d %5d %10d %10.3f"
         "%10.3f %8d %10d %10d %10d\n",
         cpi->common.current_video_frame, cpi->rc.this_frame_target,
-        cpi->rc.projected_frame_size, 0,
+        cpi->rc.projected_frame_size,
+        cpi->rc.projected_frame_size / cpi->common.MBs,
         (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
         (int)cpi->rc.total_target_vs_actual,
         (int)(cpi->oxcf.starting_buffer_level - cpi->rc.bits_off_target),
@@ -2734,8 +2719,9 @@
     if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
       loop = 0;
     } else {
-      // Special case handling for forced key frames
-      if ((cm->frame_type == KEY_FRAME) && cpi->rc.this_key_frame_forced) {
+      if ((cm->frame_type == KEY_FRAME) &&
+           cpi->rc.this_key_frame_forced &&
+           (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth)) {
         int last_q = *q;
         int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
 
@@ -2774,7 +2760,7 @@
         loop = *q != last_q;
       } else if (recode_loop_test(
           cpi, frame_over_shoot_limit, frame_under_shoot_limit,
-          *q, top_index, bottom_index)) {
+          *q, MAX(q_high, top_index), bottom_index)) {
         // Is the projected frame size out of range and are we allowed
         // to attempt to recode.
         int last_q = *q;
@@ -2785,6 +2771,10 @@
 
         // Frame is too large
         if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) {
+          // Special case if the projected size is > the max allowed.
+          if (cpi->rc.projected_frame_size >= cpi->rc.max_frame_bandwidth)
+            q_high = cpi->rc.worst_quality;
+
           // Raise Qlow as to at least the current value
           q_low = *q < q_high ? *q + 1 : q_high;
 
@@ -2798,12 +2788,12 @@
             vp9_rc_update_rate_correction_factors(cpi, 0);
 
             *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
-                                   bottom_index, top_index);
+                                   bottom_index, MAX(q_high, top_index));
 
             while (*q < q_low && retries < 10) {
               vp9_rc_update_rate_correction_factors(cpi, 0);
               *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
-                                     bottom_index, top_index);
+                                     bottom_index, MAX(q_high, top_index));
               retries++;
             }
           }
@@ -2849,7 +2839,9 @@
       }
     }
 
-    if (cpi->rc.is_src_frame_alt_ref)
+    // Special case for overlay frame.
+    if (cpi->rc.is_src_frame_alt_ref &&
+        (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth))
       loop = 0;
 
     if (loop) {
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index b1969f3..a665bf8 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -442,9 +442,10 @@
   unsigned int source_alt_ref_active;
   unsigned int is_src_frame_alt_ref;
 
-  int per_frame_bandwidth;  // Current section per frame bandwidth target
-  int av_per_frame_bandwidth;  // Average frame size target for clip
-  int min_frame_bandwidth;  // Minimum allocation used for any frame
+  int per_frame_bandwidth;        // Current section per frame bandwidth target
+  int av_per_frame_bandwidth;     // Average frame size target for clip
+  int min_frame_bandwidth;        // Minimum allocation used for any frame
+  int max_frame_bandwidth;        // Maximum burst rate allowed for a frame.
 
   int ni_av_qi;
   int ni_tot_qi;
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 58078ad..7a5282d 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -32,44 +32,6 @@
              stride * (lines_to_copy + 16));
 }
 
-static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
-                                YV12_BUFFER_CONFIG *dest, int Fraction) {
-  int i, j;
-  int Total = 0;
-  int srcoffset, dstoffset;
-  uint8_t *src = source->y_buffer;
-  uint8_t *dst = dest->y_buffer;
-
-  int linestocopy = (source->y_height >> (Fraction + 4));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-
-  srcoffset = source->y_stride   * (dest->y_height >> 5) * 16;
-  dstoffset = dest->y_stride     * (dest->y_height >> 5) * 16;
-
-  src += srcoffset;
-  dst += dstoffset;
-
-  // Loop through the raw Y plane and reconstruction data summing the square
-  // differences.
-  for (i = 0; i < linestocopy; i += 16) {
-    for (j = 0; j < source->y_width; j += 16) {
-      unsigned int sse;
-      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
-                            &sse);
-    }
-
-    src += 16 * source->y_stride;
-    dst += 16 * dest->y_stride;
-  }
-
-  return Total;
-}
-
 // Enforce a minimum filter level based upon baseline Q
 static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
   int min_filter_level;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index aefef53..72ab00f 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -258,25 +258,27 @@
 // Update the buffer level: leaky bucket model.
 void vp9_update_buffer_level(VP9_COMP *const cpi, int encoded_frame_size) {
   VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
   // Non-viewable frames are a special case and are treated as pure overhead.
   if (!cm->show_frame) {
-    cpi->rc.bits_off_target -= encoded_frame_size;
+    rc->bits_off_target -= encoded_frame_size;
   } else {
-    cpi->rc.bits_off_target += cpi->rc.av_per_frame_bandwidth -
-        encoded_frame_size;
+    rc->bits_off_target += rc->av_per_frame_bandwidth - encoded_frame_size;
   }
   // Clip the buffer level to the maximum specified buffer size.
-  if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size) {
-    cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
+  if (rc->bits_off_target > cpi->oxcf.maximum_buffer_size) {
+    rc->bits_off_target = cpi->oxcf.maximum_buffer_size;
   }
-  cpi->rc.buffer_level = cpi->rc.bits_off_target;
+  rc->buffer_level = rc->bits_off_target;
 }
 
 int vp9_drop_frame(VP9_COMP *const cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
   if (!cpi->oxcf.drop_frames_water_mark) {
     return 0;
   } else {
-    if (cpi->rc.buffer_level < 0) {
+    if (rc->buffer_level < 0) {
       // Always drop if buffer is below 0.
       return 1;
     } else {
@@ -284,23 +286,23 @@
       // (starting with the next frame) until it increases back over drop_mark.
       int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
           cpi->oxcf.optimal_buffer_level / 100);
-      if ((cpi->rc.buffer_level > drop_mark) &&
-          (cpi->rc.decimation_factor > 0)) {
-        --cpi->rc.decimation_factor;
-      } else if (cpi->rc.buffer_level <= drop_mark &&
-          cpi->rc.decimation_factor == 0) {
-        cpi->rc.decimation_factor = 1;
+      if ((rc->buffer_level > drop_mark) &&
+          (rc->decimation_factor > 0)) {
+        --rc->decimation_factor;
+      } else if (rc->buffer_level <= drop_mark &&
+          rc->decimation_factor == 0) {
+        rc->decimation_factor = 1;
       }
-      if (cpi->rc.decimation_factor > 0) {
-        if (cpi->rc.decimation_count > 0) {
-          --cpi->rc.decimation_count;
+      if (rc->decimation_factor > 0) {
+        if (rc->decimation_count > 0) {
+          --rc->decimation_count;
           return 1;
         } else {
-          cpi->rc.decimation_count = cpi->rc.decimation_factor;
+          rc->decimation_count = rc->decimation_factor;
           return 0;
         }
       } else {
-        cpi->rc.decimation_count = 0;
+        rc->decimation_count = 0;
         return 0;
       }
     }
@@ -314,63 +316,65 @@
   // If buffer is below the optimal level, let the active_worst_quality go from
   // ambient Q (at buffer = optimal level) to worst_quality level
   // (at buffer = critical level).
-  int active_worst_quality = cpi->rc.active_worst_quality;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
+  int active_worst_quality = rc->active_worst_quality;
   // Maximum limit for down adjustment, ~20%.
   int max_adjustment_down = active_worst_quality / 5;
   // Buffer level below which we push active_worst to worst_quality.
-  int critical_level = cpi->oxcf.optimal_buffer_level >> 2;
+  int critical_level = oxcf->optimal_buffer_level >> 2;
   int adjustment = 0;
   int buff_lvl_step = 0;
-  if (cpi->rc.buffer_level > cpi->oxcf.optimal_buffer_level) {
+  if (rc->buffer_level > oxcf->optimal_buffer_level) {
     // Adjust down.
     if (max_adjustment_down) {
-      buff_lvl_step = (int)((cpi->oxcf.maximum_buffer_size -
-          cpi->oxcf.optimal_buffer_level) / max_adjustment_down);
-      if (buff_lvl_step) {
-        adjustment = (int)((cpi->rc.buffer_level -
-            cpi->oxcf.optimal_buffer_level) / buff_lvl_step);
-      }
+      buff_lvl_step = (int)((oxcf->maximum_buffer_size -
+          oxcf->optimal_buffer_level) / max_adjustment_down);
+      if (buff_lvl_step)
+        adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
+                            buff_lvl_step);
       active_worst_quality -= adjustment;
     }
-  } else if (cpi->rc.buffer_level > critical_level) {
+  } else if (rc->buffer_level > critical_level) {
     // Adjust up from ambient Q.
     if (critical_level) {
-      buff_lvl_step = (cpi->oxcf.optimal_buffer_level - critical_level);
+      buff_lvl_step = (oxcf->optimal_buffer_level - critical_level);
       if (buff_lvl_step) {
-        adjustment =
-            (cpi->rc.worst_quality - cpi->rc.avg_frame_qindex[INTER_FRAME]) *
-            (cpi->oxcf.optimal_buffer_level - cpi->rc.buffer_level) /
-            buff_lvl_step;
+        adjustment = (rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
+                         (oxcf->optimal_buffer_level - rc->buffer_level) /
+                             buff_lvl_step;
       }
-      active_worst_quality = cpi->rc.avg_frame_qindex[INTER_FRAME] + adjustment;
+      active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
     }
   } else {
     // Set to worst_quality if buffer is below critical level.
-    active_worst_quality = cpi->rc.worst_quality;
+    active_worst_quality = rc->worst_quality;
   }
   return active_worst_quality;
 }
 
 // Adjust target frame size with respect to the buffering constraints:
 static int target_size_from_buffer_level(const VP9_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
   int this_frame_target = cpi->rc.this_frame_target;
   int percent_low = 0;
   int percent_high = 0;
-  int one_percent_bits = (int)(1 + cpi->oxcf.optimal_buffer_level / 100);
-  if (cpi->rc.buffer_level < cpi->oxcf.optimal_buffer_level) {
-    percent_low = (int)((cpi->oxcf.optimal_buffer_level - cpi->rc.buffer_level)
-        / one_percent_bits);
-    if (percent_low > cpi->oxcf.under_shoot_pct) {
-      percent_low = cpi->oxcf.under_shoot_pct;
-    }
+  int one_percent_bits = (int)(1 + oxcf->optimal_buffer_level / 100);
+  if (rc->buffer_level < oxcf->optimal_buffer_level) {
+    percent_low = (int)((oxcf->optimal_buffer_level - rc->buffer_level) /
+                      one_percent_bits);
+    if (percent_low > oxcf->under_shoot_pct)
+      percent_low = oxcf->under_shoot_pct;
+
     // Lower the target bandwidth for this frame.
     this_frame_target -= (this_frame_target * percent_low) / 200;
-  } else  if (cpi->rc.buffer_level > cpi->oxcf.optimal_buffer_level) {
-    percent_high = (int)((cpi->rc.buffer_level - cpi->oxcf.optimal_buffer_level)
-        / one_percent_bits);
-    if (percent_high > cpi->oxcf.over_shoot_pct) {
-      percent_high = cpi->oxcf.over_shoot_pct;
-    }
+  } else  if (rc->buffer_level > oxcf->optimal_buffer_level) {
+    percent_high = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
+                     one_percent_bits);
+    if (percent_high > oxcf->over_shoot_pct)
+      percent_high = oxcf->over_shoot_pct;
+
     // Increase the target bandwidth for this frame.
     this_frame_target += (this_frame_target * percent_high) / 200;
   }
@@ -378,25 +382,27 @@
 }
 
 static void calc_pframe_target_size(VP9_COMP *const cpi) {
-  int min_frame_target = MAX(cpi->rc.min_frame_bandwidth,
-                             cpi->rc.av_per_frame_bandwidth >> 5);
+  RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
+  int min_frame_target = MAX(rc->min_frame_bandwidth,
+                             rc->av_per_frame_bandwidth >> 5);
   if (cpi->refresh_alt_ref_frame) {
     // Special alt reference frame case
     // Per frame bit target for the alt ref frame
-    cpi->rc.per_frame_bandwidth = cpi->twopass.gf_bits;
-    cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
+    rc->per_frame_bandwidth = cpi->twopass.gf_bits;
+    rc->this_frame_target = rc->per_frame_bandwidth;
   } else {
     // Normal frames (gf and inter).
-    cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
+    rc->this_frame_target = rc->per_frame_bandwidth;
     // Set target frame size based on buffer level, for 1 pass CBR.
-    if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+    if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
       // Need to decide how low min_frame_target should be for 1-pass CBR.
       // For now, use: cpi->rc.av_per_frame_bandwidth / 16:
-      min_frame_target = MAX(cpi->rc.av_per_frame_bandwidth >> 4,
+      min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
                              FRAME_OVERHEAD_BITS);
-      cpi->rc.this_frame_target = target_size_from_buffer_level(cpi);
+      rc->this_frame_target = target_size_from_buffer_level(cpi);
       // Adjust qp-max based on buffer level.
-      cpi->rc.active_worst_quality =
+      rc->active_worst_quality =
           adjust_active_worst_quality_from_buffer_level(cpi);
     }
   }
@@ -407,25 +413,24 @@
   // not capable of recovering all the extra bits we have spent in the KF or GF,
   // then the remainder will have to be recovered over a longer time span via
   // other buffer / rate control mechanisms.
-  if (cpi->rc.this_frame_target < min_frame_target) {
-    cpi->rc.this_frame_target = min_frame_target;
-  }
+  if (rc->this_frame_target < min_frame_target)
+    rc->this_frame_target = min_frame_target;
 
   // Adjust target frame size for Golden Frames:
   if (cpi->refresh_golden_frame) {
     // If we are using alternate ref instead of gf then do not apply the boost
     // It will instead be applied to the altref update
     // Jims modified boost
-    if (!cpi->rc.source_alt_ref_active) {
+    if (!rc->source_alt_ref_active) {
       // The spend on the GF is defined in the two pass code
       // for two pass encodes
-      cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
+      rc->this_frame_target = rc->per_frame_bandwidth;
     } else {
       // If there is an active ARF at this location use the minimum
       // bits on this frame even if it is a constructed arf.
       // The active maximum quantizer insures that an appropriate
       // number of bits will be spent if needed for constructed ARFs.
-      cpi->rc.this_frame_target = 0;
+      rc->this_frame_target = 0;
     }
   }
 }
@@ -576,36 +581,34 @@
 }
 
 int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi,
-                                      int *bottom_index,
-                                      int *top_index) {
+                                      int *bottom_index, int *top_index) {
   const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
   int active_best_quality;
-  int active_worst_quality = cpi->rc.active_worst_quality;
+  int active_worst_quality = rc->active_worst_quality;
   int q;
 
   if (frame_is_intra_only(cm)) {
-    active_best_quality = cpi->rc.best_quality;
+    active_best_quality = rc->best_quality;
 #if !CONFIG_MULTIPLE_ARF
     // Handle the special case for key frames forced when we have75 reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
-    if (cpi->rc.this_key_frame_forced) {
-      int delta_qindex;
-      int qindex = cpi->rc.last_boosted_qindex;
+    if (rc->this_key_frame_forced) {
+      int qindex = rc->last_boosted_qindex;
       double last_boosted_q = vp9_convert_qindex_to_q(qindex);
-
-      delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
-                                        (last_boosted_q * 0.75));
-      active_best_quality = MAX(qindex + delta_qindex,
-                                cpi->rc.best_quality);
-    } else if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
+      int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
+                                            (last_boosted_q * 0.75));
+      active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+    } else if (!(cpi->pass == 0 && cm->current_video_frame == 0)) {
       // not first frame of one pass
       double q_adj_factor = 1.0;
       double q_val;
 
       // Baseline value derived from cpi->active_worst_quality and kf boost
       active_best_quality = get_active_quality(active_worst_quality,
-                                               cpi->rc.kf_boost,
+                                               rc->kf_boost,
                                                kf_low, kf_high,
                                                kf_low_motion_minq,
                                                kf_high_motion_minq);
@@ -631,29 +634,29 @@
     active_best_quality = active_worst_quality
         + vp9_compute_qdelta(cpi, current_q, current_q * 0.3);
 #endif
-  } else if (!cpi->rc.is_src_frame_alt_ref &&
+  } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
 
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
-    if (cpi->rc.frames_since_key > 1 &&
-        cpi->rc.avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
-      q = cpi->rc.avg_frame_qindex[INTER_FRAME];
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
     } else {
       q = active_worst_quality;
     }
     // For constrained quality dont allow Q less than the cq level
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+    if (oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) {
       if (q < cpi->cq_target_quality)
         q = cpi->cq_target_quality;
-      if (cpi->rc.frames_since_key > 1) {
-        active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
+      if (rc->frames_since_key > 1) {
+        active_best_quality = get_active_quality(q, rc->gfu_boost,
                                                  gf_low, gf_high,
                                                  afq_low_motion_minq,
                                                  afq_high_motion_minq);
       } else {
-        active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
+        active_best_quality = get_active_quality(q, rc->gfu_boost,
                                                  gf_low, gf_high,
                                                  gf_low_motion_minq,
                                                  gf_high_motion_minq);
@@ -661,46 +664,46 @@
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
 
-    } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+    } else if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) {
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cpi->cq_target_quality;
       } else {
-        if (cpi->rc.frames_since_key > 1) {
+        if (rc->frames_since_key > 1) {
           active_best_quality = get_active_quality(
-              q, cpi->rc.gfu_boost, gf_low, gf_high,
+              q, rc->gfu_boost, gf_low, gf_high,
               afq_low_motion_minq, afq_high_motion_minq);
         } else {
           active_best_quality = get_active_quality(
-              q, cpi->rc.gfu_boost, gf_low, gf_high,
+              q, rc->gfu_boost, gf_low, gf_high,
               gf_low_motion_minq, gf_high_motion_minq);
         }
       }
     } else {
       active_best_quality = get_active_quality(
-          q, cpi->rc.gfu_boost, gf_low, gf_high,
+          q, rc->gfu_boost, gf_low, gf_high,
           gf_low_motion_minq, gf_high_motion_minq);
     }
   } else {
-    if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+    if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) {
       active_best_quality = cpi->cq_target_quality;
     } else {
       if (cpi->pass == 0 &&
-          cpi->rc.avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+          rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
         // 1-pass: for now, use the average Q for the active_best, if its lower
         // than active_worst.
-        active_best_quality = inter_minq[cpi->rc.avg_frame_qindex[INTER_FRAME]];
+        active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]];
       else
         active_best_quality = inter_minq[active_worst_quality];
 
       // For the constrained quality mode we don't want
       // q to fall below the cq level.
-      if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+      if ((oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) &&
           (active_best_quality < cpi->cq_target_quality)) {
         // If we are strongly undershooting the target rate in the last
         // frames then use the user passed in cq value not the auto
         // cq value.
-        if (cpi->rc.rolling_actual_bits < cpi->rc.min_frame_bandwidth)
-          active_best_quality = cpi->oxcf.cq_level;
+        if (rc->rolling_actual_bits < rc->min_frame_bandwidth)
+          active_best_quality = oxcf->cq_level;
         else
           active_best_quality = cpi->cq_target_quality;
       }
@@ -708,14 +711,14 @@
   }
 
   // Clip the active best and worst quality values to limits
-  if (active_worst_quality > cpi->rc.worst_quality)
-    active_worst_quality = cpi->rc.worst_quality;
+  if (active_worst_quality > rc->worst_quality)
+    active_worst_quality = rc->worst_quality;
 
-  if (active_best_quality < cpi->rc.best_quality)
-    active_best_quality = cpi->rc.best_quality;
+  if (active_best_quality < rc->best_quality)
+    active_best_quality = rc->best_quality;
 
-  if (active_best_quality > cpi->rc.worst_quality)
-    active_best_quality = cpi->rc.worst_quality;
+  if (active_best_quality > rc->worst_quality)
+    active_best_quality = rc->worst_quality;
 
   if (active_worst_quality < active_best_quality)
     active_worst_quality = active_best_quality;
@@ -725,29 +728,34 @@
 
 #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
   // Limit Q range for the adaptive loop.
-  if (cm->frame_type == KEY_FRAME && !cpi->rc.this_key_frame_forced) {
-    if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
+  if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) {
+    if (!(cpi->pass == 0 && cm->current_video_frame == 0)) {
       *top_index =
           (active_worst_quality + active_best_quality * 3) / 4;
     }
-  } else if (!cpi->rc.is_src_frame_alt_ref &&
-             (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
+  } else if (!rc->is_src_frame_alt_ref &&
+             (oxcf->end_usage != USAGE_STREAM_FROM_SERVER) &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     *top_index =
       (active_worst_quality + active_best_quality) / 2;
   }
 #endif
 
-  if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+  if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) {
     q = active_best_quality;
   // Special case code to try and match quality with forced key frames
-  } else if ((cm->frame_type == KEY_FRAME) && cpi->rc.this_key_frame_forced) {
-    q = cpi->rc.last_boosted_qindex;
+  } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+    q = rc->last_boosted_qindex;
   } else {
-    q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+    q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                           active_best_quality, active_worst_quality);
-    if (q > *top_index)
-      q = *top_index;
+    if (q > *top_index) {
+      // Targeting the max allowed rate: raise *top_index to q, don't clamp q.
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        *top_index = q;
+      else
+        q = *top_index;
+    }
   }
 #if CONFIG_MULTIPLE_ARF
   // Force the quantizer determined by the coding order pattern.
@@ -766,12 +774,11 @@
     printf("frame:%d q:%d\n", cm->current_video_frame, q);
   }
 #endif
-  assert(*top_index <= cpi->rc.worst_quality &&
-         *top_index >= cpi->rc.best_quality);
-  assert(*bottom_index <= cpi->rc.worst_quality &&
-         *bottom_index >= cpi->rc.best_quality);
-  assert(q <= cpi->rc.worst_quality &&
-         q >= cpi->rc.best_quality);
+  assert(*top_index <= rc->worst_quality &&
+         *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
   return q;
 }
 
@@ -810,6 +817,11 @@
     *frame_under_shoot_limit -= 200;
     if (*frame_under_shoot_limit < 0)
       *frame_under_shoot_limit = 0;
+
+    // Clip to maximum allowed rate for a frame.
+    if (*frame_over_shoot_limit > cpi->rc.max_frame_bandwidth) {
+      *frame_over_shoot_limit = cpi->rc.max_frame_bandwidth;
+    }
   }
 }
 
@@ -822,6 +834,10 @@
   else
     calc_pframe_target_size(cpi);
 
+  // Clip the frame target to the maximum allowed value.
+  if (cpi->rc.this_frame_target > cpi->rc.max_frame_bandwidth)
+    cpi->rc.this_frame_target = cpi->rc.max_frame_bandwidth;
+
   // Target rate per SB64 (including partial SB64s.
   cpi->rc.sb64_target_rate = ((int64_t)cpi->rc.this_frame_target * 64 * 64) /
                              (cpi->common.width * cpi->common.height);
@@ -843,24 +859,26 @@
 }
 
 static void update_golden_frame_stats(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
   // Update the Golden frame usage counts.
   if (cpi->refresh_golden_frame) {
     // this frame refreshes means next frames don't unless specified by user
-    cpi->rc.frames_since_golden = 0;
+    rc->frames_since_golden = 0;
 
-    if (!cpi->rc.source_alt_ref_pending)
-      cpi->rc.source_alt_ref_active = 0;
+    if (!rc->source_alt_ref_pending)
+      rc->source_alt_ref_active = 0;
 
     // Decrement count down till next gf
-    if (cpi->rc.frames_till_gf_update_due > 0)
-      cpi->rc.frames_till_gf_update_due--;
+    if (rc->frames_till_gf_update_due > 0)
+      rc->frames_till_gf_update_due--;
 
   } else if (!cpi->refresh_alt_ref_frame) {
     // Decrement count down till next gf
-    if (cpi->rc.frames_till_gf_update_due > 0)
-      cpi->rc.frames_till_gf_update_due--;
+    if (rc->frames_till_gf_update_due > 0)
+      rc->frames_till_gf_update_due--;
 
-    cpi->rc.frames_since_golden++;
+    rc->frames_since_golden++;
   }
 }
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index b46e808..5ba8915 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -134,27 +134,27 @@
   return base + raster_block_offset(plane_bsize, raster_block, stride);
 }
 
-static void fill_mode_costs(VP9_COMP *c) {
-  VP9_COMMON *const cm = &c->common;
+static void fill_mode_costs(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  FRAME_CONTEXT *const fc = &cm->fc;
   int i, j;
 
   for (i = 0; i < INTRA_MODES; i++)
     for (j = 0; j < INTRA_MODES; j++)
-      vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
+      vp9_cost_tokens((int *)x->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
                       vp9_intra_mode_tree);
 
   // TODO(rbultje) separate tables for superblock costing?
-  vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
-                  vp9_intra_mode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
-                  cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
-                  vp9_kf_uv_mode_prob[INTRA_MODES - 1],
-                  vp9_intra_mode_tree);
+  vp9_cost_tokens(x->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
+  vp9_cost_tokens(x->intra_uv_mode_cost[1],
+                  fc->uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
+  vp9_cost_tokens(x->intra_uv_mode_cost[0],
+                  vp9_kf_uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
 
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
-                    cm->fc.switchable_interp_prob[i],
+    vp9_cost_tokens((int *)x->switchable_interp_costs[i],
+                    fc->switchable_interp_prob[i],
                     vp9_switchable_interp_tree);
 }
 
@@ -198,9 +198,9 @@
   // This is to make it easier to resolve the impact of experimental changes
   // to the quantizer tables.
   for (i = 0; i < QINDEX_RANGE; i++) {
-    sad_per_bit16lut[i] =
-      (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
+    const double q = vp9_convert_qindex_to_q(i);
+    sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107);
+    sad_per_bit4lut[i] = (int)(0.063 * q + 2.742);
   }
 }
 
@@ -234,36 +234,30 @@
 static void set_block_thresholds(VP9_COMP *cpi) {
   int i, bsize, segment_id;
   VP9_COMMON *cm = &cpi->common;
+  SPEED_FEATURES *sf = &cpi->sf;
 
   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
-    int q;
-    int segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-    segment_qindex = clamp(segment_qindex + cm->y_dc_delta_q, 0, MAXQ);
-    q = compute_rd_thresh_factor(segment_qindex);
+    const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id,
+                                            cm->base_qindex) + cm->y_dc_delta_q,
+                             0, MAXQ);
+    const int q = compute_rd_thresh_factor(qindex);
 
     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
       // Threshold here seem unecessarily harsh but fine given actual
       // range of values used for cpi->sf.thresh_mult[]
-      int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+      const int t = q * rd_thresh_block_size_factor[bsize];
+      const int thresh_max = INT_MAX / t;
 
-      for (i = 0; i < MAX_MODES; ++i) {
-        if (cpi->sf.thresh_mult[i] < thresh_max) {
-          cpi->rd_threshes[segment_id][bsize][i] =
-              cpi->sf.thresh_mult[i] * q *
-              rd_thresh_block_size_factor[bsize] / 4;
-        } else {
-          cpi->rd_threshes[segment_id][bsize][i] = INT_MAX;
-        }
-      }
+      for (i = 0; i < MAX_MODES; ++i)
+        cpi->rd_threshes[segment_id][bsize][i] =
+            sf->thresh_mult[i] < thresh_max ? sf->thresh_mult[i] * t / 4
+                                            : INT_MAX;
 
       for (i = 0; i < MAX_REFS; ++i) {
-        if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) {
-          cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
-              cpi->sf.thresh_mult_sub8x8[i] * q *
-              rd_thresh_block_size_factor[bsize] / 4;
-        } else {
-          cpi->rd_thresh_sub8x8[segment_id][bsize][i] = INT_MAX;
-        }
+        cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
+            sf->thresh_mult_sub8x8[i] < thresh_max
+                ? sf->thresh_mult_sub8x8[i] * t / 4
+                : INT_MAX;
       }
     }
   }
@@ -271,6 +265,7 @@
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
+  MACROBLOCK *x = &cpi->mb;
   int qindex, i;
 
   vp9_clear_system_state();  // __asm emms;
@@ -284,35 +279,33 @@
   cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
   cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
 
-  cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
-  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+  x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
+  x->errorperbit += (x->errorperbit == 0);  // bump a zero quotient up to 1
 
   vp9_set_speed_features(cpi);
 
-  cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
-                              cm->frame_type != KEY_FRAME) ?
-                              0 : 1;
+  x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+                         cm->frame_type != KEY_FRAME) ? 0 : 1;
 
   set_block_thresholds(cpi);
 
-  fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
+  fill_token_costs(x->token_costs, cm->fc.coef_probs);
 
   for (i = 0; i < PARTITION_CONTEXTS; i++)
-    vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i),
+    vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
                     vp9_partition_tree);
 
-  /*rough estimate for costing*/
   fill_mode_costs(cpi);
 
   if (!frame_is_intra_only(cm)) {
-    vp9_build_nmv_cost_table(
-        cpi->mb.nmvjointcost,
-        cm->allow_high_precision_mv ? cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
-        &cm->fc.nmvc,
-        cm->allow_high_precision_mv, 1, 1);
+    vp9_build_nmv_cost_table(x->nmvjointcost,
+                             cm->allow_high_precision_mv ? x->nmvcost_hp
+                                                         : x->nmvcost,
+                             &cm->fc.nmvc,
+                             cm->allow_high_precision_mv, 1, 1);
 
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-      vp9_cost_tokens((int *)cpi->mb.inter_mode_cost[i],
+      vp9_cost_tokens((int *)x->inter_mode_cost[i],
                       cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
   }
 }
@@ -464,8 +457,8 @@
   BLOCK_SIZE bs;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
-  const int width = 4 << num_4x4_blocks_wide_lookup[bsize];
-  const int height = 4 << num_4x4_blocks_high_lookup[bsize];
+  const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int height = 4 * num_4x4_blocks_high_lookup[bsize];
   int rate_sum = 0;
   int64_t dist_sum = 0;
   const int t = 4 << tx_size;
@@ -640,7 +633,9 @@
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct encode_b_args encode_args = {x, NULL};
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct encode_b_args encode_args = {x, NULL, &mbmi->skip_coeff};
+
   int64_t rd1, rd2, rd;
 
   if (args->skip)
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 970a27a..b04e3fe 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -175,6 +175,18 @@
   set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, aoff, loff);
 }
 
+static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree,
+                             int16_t extra, uint8_t token,
+                             uint8_t skip_eob_node,
+                             unsigned int *counts) {
+  (*t)->token = token;
+  (*t)->extra = extra;
+  (*t)->context_tree = context_tree;
+  (*t)->skip_eob_node = skip_eob_node;
+  (*t)++;
+  ++counts[token];
+}
+
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
@@ -186,9 +198,9 @@
   struct macroblockd_plane *pd = &xd->plane[plane];
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   int pt; /* near block/prev token context index */
-  int c = 0, rc = 0;
+  int c = 0;
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
-  const int eob = p->eobs[block];
+  int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
   const int segment_id = mbmi->segment_id;
@@ -197,51 +209,53 @@
   vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
   vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
   const int ref = is_inter_block(mbmi);
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const uint8_t *const band = get_band_translate(tx_size);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
 
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
 
-  assert((!type && !plane) || (type && plane));
-
   pt = get_entropy_context(tx_size, pd->above_context + aoff,
-                                    pd->left_context + loff);
+                           pd->left_context + loff);
   so = get_scan(xd, tx_size, type, block);
   scan = so->scan;
   nb = so->neighbors;
-
   c = 0;
-  do {
-    const int band = band_translate[c];
-    int token;
+  while (c < eob) {
     int v = 0;
-    rc = scan[c];
-    if (c)
+    int skip_eob = 0;
+    v = qcoeff_ptr[scan[c]];
+
+    while (!v) {
+      add_token(&t, coef_probs[type][ref][band[c]][pt], 0, ZERO_TOKEN, skip_eob,
+                counts[type][ref][band[c]][pt]);
+
+      cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt] +=
+          !skip_eob;
+
+      skip_eob = 1;
+      token_cache[scan[c]] = 0;
+      ++c;
       pt = get_coef_context(nb, token_cache, c);
-    if (c < eob) {
-      v = qcoeff_ptr[rc];
-      assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
-
-      t->extra = vp9_dct_value_tokens_ptr[v].extra;
-      token    = vp9_dct_value_tokens_ptr[v].token;
-    } else {
-      token = EOB_TOKEN;
+      v = qcoeff_ptr[scan[c]];
     }
+    add_token(&t, coef_probs[type][ref][band[c]][pt],
+              vp9_dct_value_tokens_ptr[v].extra,
+              vp9_dct_value_tokens_ptr[v].token, skip_eob,
+              counts[type][ref][band[c]][pt]);
 
-    t->token = token;
-    t->context_tree = coef_probs[type][ref][band][pt];
-    t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0);
+    cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt] += !skip_eob;
 
-    assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
-
-    ++counts[type][ref][band][pt][token];
-    if (!t->skip_eob_node)
-      ++cpi->common.counts.eob_branch[tx_size][type][ref][band][pt];
-
-    token_cache[rc] = vp9_pt_energy_class[token];
-    ++t;
-  } while (c < eob && ++c < seg_eob);
+    token_cache[scan[c]] =
+        vp9_pt_energy_class[vp9_dct_value_tokens_ptr[v].token];
+    ++c;
+    pt = get_coef_context(nb, token_cache, c);
+  }
+  if (c < seg_eob) {
+    add_token(&t, coef_probs[type][ref][band[c]][pt], 0, EOB_TOKEN, 0,
+              counts[type][ref][band[c]][pt]);
+    ++cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt];
+  }
 
   *tp = t;
 
@@ -285,8 +299,6 @@
   const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
                                               SEG_LVL_SKIP);
   struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache};
-
-  mbmi->skip_coeff = sb_is_skippable(&cpi->mb, bsize);
   if (mbmi->skip_coeff) {
     if (!dry_run)
       cm->counts.mbskip[ctx][1] += skip_inc;
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index b1c029c..478b45a 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -74,6 +74,9 @@
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
+ifeq ($(ARCH_X86_64),yes)
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
+endif
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
diff --git a/vpxdec.c b/vpxdec.c
index 97ac4bb..4204979 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -40,13 +40,12 @@
   char const *name;
   const vpx_codec_iface_t *(*iface)(void);
   uint32_t fourcc;
-  uint32_t fourcc_mask;
 } ifaces[] = {
 #if CONFIG_VP8_DECODER
-  {"vp8",  vpx_codec_vp8_dx,   VP8_FOURCC_MASK, 0x00FFFFFF},
+  {"vp8",  vpx_codec_vp8_dx,   VP8_FOURCC},
 #endif
 #if CONFIG_VP9_DECODER
-  {"vp9",  vpx_codec_vp9_dx,   VP9_FOURCC_MASK, 0x00FFFFFF},
+  {"vp9",  vpx_codec_vp9_dx,   VP9_FOURCC},
 #endif
 };
 
@@ -167,11 +166,10 @@
   exit(EXIT_FAILURE);
 }
 
-static int raw_read_frame(struct VpxInputContext *input_ctx, uint8_t **buffer,
+static int raw_read_frame(FILE *infile, uint8_t **buffer,
                           size_t *bytes_read, size_t *buffer_size) {
   char raw_hdr[RAW_FRAME_HDR_SZ];
   size_t frame_size = 0;
-  FILE *infile = input_ctx->file;
 
   if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) {
     if (!feof(infile))
@@ -221,10 +219,10 @@
       return webm_read_frame(input->webm_ctx,
                              buf, bytes_in_buffer, buffer_size);
     case FILE_TYPE_RAW:
-      return raw_read_frame(input->vpx_input_ctx,
+      return raw_read_frame(input->vpx_input_ctx->file,
                             buf, bytes_in_buffer, buffer_size);
     case FILE_TYPE_IVF:
-      return ivf_read_frame(input->vpx_input_ctx,
+      return ivf_read_frame(input->vpx_input_ctx->file,
                             buf, bytes_in_buffer, buffer_size);
     default:
       return 1;
@@ -671,7 +669,7 @@
 
   /* Try to determine the codec from the fourcc. */
   for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
-    if ((vpx_input_ctx.fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc) {
+    if (vpx_input_ctx.fourcc == ifaces[i].fourcc) {
       vpx_codec_iface_t *vpx_iface = ifaces[i].iface();
 
       if (iface && iface != vpx_iface)
diff --git a/vpxenc.c b/vpxenc.c
index 4c933ce..396e43d 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -362,12 +362,6 @@
                                                "Motion detection threshold");
 static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
                                           "CPU Used (-16..16)");
-static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,
-                                     "Number of token partitions to use, log2");
-static const arg_def_t tile_cols = ARG_DEF(NULL, "tile-columns", 1,
-                                         "Number of tile columns to use, log2");
-static const arg_def_t tile_rows = ARG_DEF(NULL, "tile-rows", 1,
-                                           "Number of tile rows to use, log2");
 static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
                                              "Enable automatic alt reference frames");
 static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
@@ -387,16 +381,10 @@
                                           "Constant/Constrained Quality level");
 static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1,
                                                     "Max I-frame bitrate (pct)");
-static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");
-#if CONFIG_VP9_ENCODER
-static const arg_def_t frame_parallel_decoding  = ARG_DEF(
-    NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
-static const arg_def_t aq_mode  = ARG_DEF(
-    NULL, "aq-mode", 1,
-    "Adaptive q mode (0: off (by default), 1: variance 2: complexity)");
-#endif
 
 #if CONFIG_VP8_ENCODER
+static const arg_def_t token_parts =
+    ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2");
 static const arg_def_t *vp8_args[] = {
   &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
   &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
@@ -414,6 +402,17 @@
 #endif
 
 #if CONFIG_VP9_ENCODER
+static const arg_def_t tile_cols =
+    ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
+static const arg_def_t tile_rows =
+    ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");
+static const arg_def_t frame_parallel_decoding = ARG_DEF(
+    NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
+static const arg_def_t aq_mode = ARG_DEF(
+    NULL, "aq-mode", 1,
+    "Adaptive q mode (0: off (by default), 1: variance 2: complexity)");
+
 static const arg_def_t *vp9_args[] = {
   &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
   &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
@@ -1393,6 +1392,10 @@
 static void open_output_file(struct stream_state *stream,
                              struct VpxEncoderConfig *global) {
   const char *fn = stream->config.out_fn;
+  const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg;
+
+  if (cfg->g_pass == VPX_RC_FIRST_PASS)
+    return;
 
   stream->file = strcmp(fn, "-") ? fopen(fn, "wb") : set_binary_mode(stdout);
 
@@ -1404,18 +1407,23 @@
 
   if (stream->config.write_webm) {
     stream->ebml.stream = stream->file;
-    write_webm_file_header(&stream->ebml, &stream->config.cfg,
+    write_webm_file_header(&stream->ebml, cfg,
                            &global->framerate,
                            stream->config.stereo_fmt,
                            global->codec->fourcc);
-  } else
-    ivf_write_file_header(stream->file, &stream->config.cfg,
-                          global->codec->fourcc, 0);
+  } else {
+    ivf_write_file_header(stream->file, cfg, global->codec->fourcc, 0);
+  }
 }
 
 
 static void close_output_file(struct stream_state *stream,
-                              unsigned int         fourcc) {
+                              unsigned int fourcc) {
+  const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg;
+
+  if (cfg->g_pass == VPX_RC_FIRST_PASS)
+    return;
+
   if (stream->config.write_webm) {
     write_webm_file_footer(&stream->ebml, stream->hash);
     free(stream->ebml.cue_list);
diff --git a/webmdec.c b/webmdec.c
index 0c75d7a..fdcf3a5 100644
--- a/webmdec.c
+++ b/webmdec.c
@@ -82,9 +82,9 @@
 
   codec_id = nestegg_track_codec_id(webm_ctx->nestegg_ctx, i);
   if (codec_id == NESTEGG_CODEC_VP8) {
-    vpx_ctx->fourcc = VP8_FOURCC_MASK;
+    vpx_ctx->fourcc = VP8_FOURCC;
   } else if (codec_id == NESTEGG_CODEC_VP9) {
-    vpx_ctx->fourcc = VP9_FOURCC_MASK;
+    vpx_ctx->fourcc = VP9_FOURCC;
   } else {
     fatal("Not VPx video, quitting.\n");
   }