Merge "Change the vp8 END_USAGE typedef"
diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c
index bfa931e..2ceb17a 100644
--- a/examples/decode_to_md5.c
+++ b/examples/decode_to_md5.c
@@ -37,12 +37,24 @@
 #include "./vpx_config.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
-#define interface (vpx_codec_vp8_dx())
 #include "md5_utils.h"
 
+#define VP8_FOURCC 0x30385056
+#define VP9_FOURCC 0x30395056
+
 #define IVF_FILE_HDR_SZ  (32)
 #define IVF_FRAME_HDR_SZ (12)
 
+static vpx_codec_iface_t *get_codec_interface(unsigned int fourcc) {
+  switch (fourcc) {
+    case VP8_FOURCC:
+      return vpx_codec_vp8_dx();
+    case VP9_FOURCC:
+      return vpx_codec_vp9_dx();
+  }
+  return NULL;
+}
+
 static unsigned int mem_get_le32(const unsigned char *mem) {
   return (mem[3] << 24) | (mem[2] << 16) | (mem[1] << 8) | (mem[0]);
 }
@@ -52,7 +64,7 @@
 
   va_start(ap, fmt);
   vprintf(fmt, ap);
-  if(fmt[strlen(fmt)-1] != '\n')
+  if (fmt[strlen(fmt) - 1] != '\n')
     printf("\n");
   exit(EXIT_FAILURE);
 }
@@ -66,81 +78,93 @@
   exit(EXIT_FAILURE);
 }
 
+static void get_image_md5(const vpx_image_t *img, unsigned char digest[16]) {
+  int plane, y;
+  MD5Context md5;
 
-int main(int argc, char **argv) {
-  FILE            *infile, *outfile;
-  vpx_codec_ctx_t  codec;
-  int              flags = 0, frame_cnt = 0;
-  unsigned char    file_hdr[IVF_FILE_HDR_SZ];
-  unsigned char    frame_hdr[IVF_FRAME_HDR_SZ];
-  unsigned char    frame[256*1024];
-  vpx_codec_err_t  res;
+  MD5Init(&md5);
 
-  (void)res;
-  /* Open files */
-  if(argc!=3)
-    die("Usage: %s <infile> <outfile>\n", argv[0]);
-  if(!(infile = fopen(argv[1], "rb")))
-    die("Failed to open %s for reading", argv[1]);
-  if(!(outfile = fopen(argv[2], "wb")))
-    die("Failed to open %s for writing", argv[2]);
+  for (plane = 0; plane < 3; ++plane) {
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;
+    const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;
 
-  /* Read file header */
-  if(!(fread(file_hdr, 1, IVF_FILE_HDR_SZ, infile) == IVF_FILE_HDR_SZ
-       && file_hdr[0]=='D' && file_hdr[1]=='K' && file_hdr[2]=='I'
-       && file_hdr[3]=='F'))
-    die("%s is not an IVF file.", argv[1]);
-
-  printf("Using %s\n",vpx_codec_iface_name(interface));
-  /* Initialize codec */
-  if(vpx_codec_dec_init(&codec, interface, NULL, flags))
-    die_codec(&codec, "Failed to initialize decoder");
-
-  /* Read each frame */
-  while(fread(frame_hdr, 1, IVF_FRAME_HDR_SZ, infile) == IVF_FRAME_HDR_SZ) {
-    int               frame_sz = mem_get_le32(frame_hdr);
-    vpx_codec_iter_t  iter = NULL;
-    vpx_image_t      *img;
-
-    frame_cnt++;
-    if(frame_sz > sizeof(frame))
-      die("Frame %d data too big for example code buffer", frame_sz);
-    if(fread(frame, 1, frame_sz, infile) != frame_sz)
-      die("Frame %d failed to read complete frame", frame_cnt);
-
-    /* Decode the frame */
-    if(vpx_codec_decode(&codec, frame, frame_sz, NULL, 0))
-      die_codec(&codec, "Failed to decode frame");
-
-    /* Write decoded data to disk */
-    while((img = vpx_codec_get_frame(&codec, &iter))) {
-      unsigned int plane, y;
-
-      unsigned char  md5_sum[16];
-      MD5Context     md5;
-      int            i;
-
-      MD5Init(&md5);
-
-      for(plane=0; plane < 3; plane++) {
-        unsigned char *buf =img->planes[plane];
-
-        for (y=0; y < (plane ? (img->d_h + 1) >> 1 : img->d_h); y++) {
-          MD5Update(&md5, buf, (plane ? (img->d_w + 1) >> 1 : img->d_w));
-          buf += img->stride[plane];
-        }
-      }
-
-      MD5Final(md5_sum, &md5);
-      for (i = 0; i < 16; i++)
-        fprintf(outfile, "%02x",md5_sum[i]);
-      fprintf(outfile, "  img-%dx%d-%04d.i420\n", img->d_w, img->d_h,
-              frame_cnt);
+    for (y = 0; y < h; ++y) {
+      MD5Update(&md5, buf, w);
+      buf += stride;
     }
   }
 
-  printf("Processed %d frames.\n",frame_cnt);
-  if(vpx_codec_destroy(&codec))
+  MD5Final(digest, &md5);
+}
+
+static void print_md5(FILE *stream, unsigned char digest[16]) {
+  int i;
+
+  for (i = 0; i < 16; ++i)
+    fprintf(stream, "%02x", digest[i]);
+}
+
+int main(int argc, char **argv) {
+  FILE *infile, *outfile;
+  vpx_codec_ctx_t codec;
+  vpx_codec_iface_t *iface;
+  int flags = 0, frame_cnt = 0;
+  unsigned char file_hdr[IVF_FILE_HDR_SZ];
+  unsigned char frame_hdr[IVF_FRAME_HDR_SZ];
+  unsigned char frame[256 * 1024];
+
+  if (argc != 3)
+    die("Usage: %s <infile> <outfile>\n", argv[0]);
+
+  if (!(infile = fopen(argv[1], "rb")))
+    die("Failed to open %s for reading", argv[1]);
+
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing", argv[2]);
+
+  if (!(fread(file_hdr, 1, IVF_FILE_HDR_SZ, infile) == IVF_FILE_HDR_SZ &&
+     file_hdr[0] == 'D' && file_hdr[1] == 'K' &&
+     file_hdr[2] == 'I' && file_hdr[3] == 'F'))
+    die("%s is not an IVF file.", argv[1]);
+
+  iface = get_codec_interface(mem_get_le32(file_hdr + 8));
+  if (!iface)
+    die("Unknown FOURCC code.");
+
+
+  printf("Using %s\n", vpx_codec_iface_name(iface));
+
+  if (vpx_codec_dec_init(&codec, iface, NULL, flags))
+    die_codec(&codec, "Failed to initialize decoder");
+
+  while (fread(frame_hdr, 1, IVF_FRAME_HDR_SZ, infile) == IVF_FRAME_HDR_SZ) {
+    const int frame_size = mem_get_le32(frame_hdr);
+    vpx_codec_iter_t iter = NULL;
+    vpx_image_t *img;
+
+    if (frame_size > sizeof(frame))
+      die("Frame %d data too big for example code buffer", frame_size);
+
+    if (fread(frame, 1, frame_size, infile) != frame_size)
+      die("Failed to read complete frame");
+
+    if (vpx_codec_decode(&codec, frame, frame_size, NULL, 0))
+      die_codec(&codec, "Failed to decode frame");
+
+    while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {
+      unsigned char digest[16];
+
+      get_image_md5(img, digest);
+      print_md5(outfile, digest);
+      fprintf(outfile, "  img-%dx%d-%04d.i420\n",
+              img->d_w, img->d_h, ++frame_cnt);
+    }
+  }
+
+  printf("Processed %d frames.\n", frame_cnt);
+  if (vpx_codec_destroy(&codec))
     die_codec(&codec, "Failed to destroy codec");
 
   fclose(outfile);
diff --git a/ivfdec.c b/ivfdec.c
index a37a44c..c7f4a89 100644
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -13,6 +13,25 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+static void fix_framerate(int *num, int *den) {
+  // Some versions of vpxenc used 1/(2*fps) for the timebase, so
+  // we can guess the framerate using only the timebase in this
+  // case. Other files would require reading ahead to guess the
+  // timebase, like we do for webm.
+  if (*num < 1000) {
+    // Correct for the factor of 2 applied to the timebase in the encoder.
+    if (*num & 1)
+      *den *= 2;
+    else
+      *num /= 2;
+  } else {
+    // Don't know FPS for sure, and don't have readahead code
+    // (yet?), so just default to 30fps.
+    *num = 30;
+    *den = 1;
+  }
+}
+
 int file_is_ivf(struct VpxInputContext *input_ctx) {
   char raw_hdr[32];
   int is_ivf = 0;
@@ -32,27 +51,8 @@
       input_ctx->height = mem_get_le16(raw_hdr + 14);
       input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16);
       input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20);
-
-      /* Some versions of vpxenc used 1/(2*fps) for the timebase, so
-       * we can guess the framerate using only the timebase in this
-       * case. Other files would require reading ahead to guess the
-       * timebase, like we do for webm.
-       */
-      if (input_ctx->framerate.numerator < 1000) {
-        /* Correct for the factor of 2 applied to the timebase in the
-         * encoder.
-         */
-        if (input_ctx->framerate.numerator & 1)
-          input_ctx->framerate.denominator <<= 1;
-        else
-          input_ctx->framerate.numerator >>= 1;
-      } else {
-        /* Don't know FPS for sure, and don't have readahead code
-         * (yet?), so just default to 30fps.
-         */
-        input_ctx->framerate.numerator = 30;
-        input_ctx->framerate.denominator = 1;
-      }
+      fix_framerate(&input_ctx->framerate.numerator,
+                    &input_ctx->framerate.denominator);
     }
   }
 
@@ -65,16 +65,10 @@
   return is_ivf;
 }
 
-int ivf_read_frame(struct VpxInputContext *input_ctx,
-                   uint8_t **buffer,
-                   size_t *bytes_read,
-                   size_t *buffer_size) {
+int ivf_read_frame(FILE *infile, uint8_t **buffer,
+                   size_t *bytes_read, size_t *buffer_size) {
   char raw_header[IVF_FRAME_HDR_SZ] = {0};
   size_t frame_size = 0;
-  FILE *infile = input_ctx->file;
-
-  if (input_ctx->file_type != FILE_TYPE_IVF)
-    return 0;
 
   if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
     if (!feof(infile))
diff --git a/ivfdec.h b/ivfdec.h
index 5da9acc..dd29cc6 100644
--- a/ivfdec.h
+++ b/ivfdec.h
@@ -18,10 +18,8 @@
 
 int file_is_ivf(struct VpxInputContext *input);
 
-int ivf_read_frame(struct VpxInputContext *input,
-                   uint8_t **buffer,
-                   size_t *bytes_read,
-                   size_t *buffer_size);
+int ivf_read_frame(FILE *infile, uint8_t **buffer,
+                   size_t *bytes_read, size_t *buffer_size);
 
 #ifdef __cplusplus
 }  /* extern "C" */
diff --git a/ivfenc.c b/ivfenc.c
index fa92566..0041ff0 100644
--- a/ivfenc.c
+++ b/ivfenc.c
@@ -20,9 +20,6 @@
                            int frame_cnt) {
   char header[32];
 
-  if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
-    return;
-
   header[0] = 'D';
   header[1] = 'K';
   header[2] = 'I';
@@ -44,9 +41,6 @@
   char header[12];
   vpx_codec_pts_t pts;
 
-  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
-    return;
-
   pts = pkt->data.frame.pts;
   mem_put_le32(header, (int)pkt->data.frame.sz);
   mem_put_le32(header + 4, pts & 0xFFFFFFFF);
diff --git a/test/test.mk b/test/test.mk
index 5a1d39d..6d4f969 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -24,6 +24,8 @@
 LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += y4m_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../y4minput.h ../y4minput.c
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h
new file mode 100644
index 0000000..bd86c2c
--- /dev/null
+++ b/test/y4m_video_source.h
@@ -0,0 +1,107 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_Y4M_VIDEO_SOURCE_H_
+#define TEST_Y4M_VIDEO_SOURCE_H_
+#include <string>
+
+#include "test/video_source.h"
+extern "C" {
+#include "./y4minput.h"
+}
+
+namespace libvpx_test {
+
+// This class extends VideoSource to allow parsing of y4m files
+// so that we can do actual file encodes.
+class Y4mVideoSource : public VideoSource {
+ public:
+  Y4mVideoSource(const std::string &file_name,
+                  unsigned int start, int limit)
+      : file_name_(file_name),
+        input_file_(NULL),
+        img_(new vpx_image_t()),
+        start_(start),
+        limit_(limit),
+        frame_(0),
+        framerate_numerator_(0),
+        framerate_denominator_(0),
+        y4m_() {
+  }
+
+  virtual ~Y4mVideoSource() {
+    vpx_img_free(img_.get());
+    y4m_input_close(&y4m_);
+    if (input_file_)
+      fclose(input_file_);
+  }
+
+  virtual void Begin() {
+    if (input_file_)
+      fclose(input_file_);
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
+        << file_name_;
+
+    y4m_input_open(&y4m_, input_file_, NULL, 0, 0);
+    framerate_numerator_ = y4m_.fps_n;
+    framerate_denominator_ = y4m_.fps_d;
+
+    frame_ = 0;
+    for (unsigned int i = 0; i < start_; i++) {
+        Next();
+    }
+
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual vpx_image_t *img() const {
+    return (frame_ < limit_) ? img_.get() : NULL;
+  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    // Read a frame from input_file.
+    y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
+  }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  testing::internal::scoped_ptr<vpx_image_t> img_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+  y4m_input y4m_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/tools_common.h b/tools_common.h
index 7500523..1d70ab5 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -63,10 +63,8 @@
 
 #define RAW_FRAME_HDR_SZ sizeof(uint32_t)
 
-#define VP8_FOURCC (0x30385056)
-#define VP9_FOURCC (0x30395056)
-#define VP8_FOURCC_MASK (0x00385056)
-#define VP9_FOURCC_MASK (0x00395056)
+#define VP8_FOURCC 0x30385056
+#define VP9_FOURCC 0x30395056
 
 enum VideoFileType {
   FILE_TYPE_RAW,
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index f106bc7..98619bb 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -16,6 +16,7 @@
     EXPORT  |vp9_h_predictor_8x8_neon|
     EXPORT  |vp9_h_predictor_16x16_neon|
     EXPORT  |vp9_h_predictor_32x32_neon|
+    EXPORT  |vp9_tm_predictor_4x4_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -283,4 +284,52 @@
     bx                  lr
     ENDP                ; |vp9_h_predictor_32x32_neon|
 
+;void vp9_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vp9_tm_predictor_4x4_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    ldrb                r12, [r12]
+    vdup.u8             d0, r12
+
+    ; Load above 4 pixels
+    vld1.32             {d2[0]}, [r2]
+
+    ; Compute above - ytop_left
+    vsubl.u8            q3, d2, d0
+
+    ; Load left row by row and compute left + (above - ytop_left)
+    ; 1st row and 2nd row
+    ldrb                r12, [r3], #1
+    ldrb                r2, [r3], #1
+    vdup.u16            q1, r12
+    vdup.u16            q2, r2
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqshrun.s16         d0, q1, #0
+    vqshrun.s16         d1, q2, #0
+    vst1.32             {d0[0]}, [r0], r1
+    vst1.32             {d1[0]}, [r0], r1
+
+    ; 3rd row and 4th row
+    ldrb                r12, [r3], #1
+    ldrb                r2, [r3], #1
+    vdup.u16            q1, r12
+    vdup.u16            q2, r2
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqshrun.s16         d0, q1, #0
+    vqshrun.s16         d1, q2, #0
+    vst1.32             {d0[0]}, [r0], r1
+    vst1.32             {d1[0]}, [r0], r1
+
+    bx                  lr
+    ENDP                ; |vp9_tm_predictor_4x4_neon|
+
     END
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 21e2b16..2d5df4a 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -242,6 +242,9 @@
   /* pointer to current frame */
   const YV12_BUFFER_CONFIG *cur_buf;
 
+  /* mc buffer */
+  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+
   int lossless;
   /* Inverse transform function pointers. */
   void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index b457604..45d7984 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -43,8 +43,8 @@
 
 
   typedef enum {
-    USAGE_STREAM_FROM_SERVER    = 0x0,
-    USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
+    USAGE_LOCAL_FILE_PLAYBACK   = 0x0,
+    USAGE_STREAM_FROM_SERVER    = 0x1,
     USAGE_CONSTRAINED_QUALITY   = 0x2,
     USAGE_CONSTANT_QUALITY      = 0x3,
   } END_USAGE;
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 212a28a..a172ba6 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -13,13 +13,16 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
-#include "vpx_scale/yv12config.h"
-#include "vp9/common/vp9_postproc.h"
-#include "vp9/common/vp9_textblit.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_textblit.h"
 
 #define RGB_TO_YUV(t)                                            \
   ( (0.257*(float)(t >> 16))  + (0.504*(float)(t >> 8 & 0xff)) + \
@@ -127,9 +130,6 @@
   0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
 };
 
-
-/****************************************************************************
- */
 void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
                                      uint8_t *dst_ptr,
                                      int src_pixels_per_line,
@@ -371,7 +371,7 @@
   }
 }
 
-double vp9_gaussian(double sigma, double mu, double x) {
+static double gaussian(double sigma, double mu, double x) {
   return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
          (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
 }
@@ -396,7 +396,7 @@
     next = 0;
 
     for (i = -32; i < 32; i++) {
-      int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i));
+      int a = (int)(0.5 + 256 * gaussian(sigma, 0, i));
 
       if (a) {
         for (j = 0; j < a; j++) {
@@ -425,27 +425,6 @@
   state->last_noise = a;
 }
 
-/****************************************************************************
- *
- *  ROUTINE       : plane_add_noise_c
- *
- *  INPUTS        : unsigned char *Start  starting address of buffer to
- *                                        add gaussian noise to
- *                  unsigned int width    width of plane
- *                  unsigned int height   height of plane
- *                  int  pitch    distance between subsequent lines of frame
- *                  int  q        quantizer used to determine amount of noise
- *                                  to add
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void.
- *
- *  FUNCTION      : adds gaussian noise to a plane of pixels
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
 void vp9_plane_add_noise_c(uint8_t *start, char *noise,
                            char blackclamp[16],
                            char whiteclamp[16],
@@ -628,49 +607,40 @@
 
 int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
-  int q = cm->lf.filter_level * 10 / 6;
-  int flags = ppflags->post_proc_flag;
-  int deblock_level = ppflags->deblocking_level;
-  int noise_level = ppflags->noise_level;
+  const int q = MIN(63, cm->lf.filter_level * 10 / 6);
+  const int flags = ppflags->post_proc_flag;
+  YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
+  struct postproc_state *const ppstate = &cm->postproc_state;
 
   if (!cm->frame_to_show)
     return -1;
 
-  if (q > 63)
-    q = 63;
-
   if (!flags) {
     *dest = *cm->frame_to_show;
     return 0;
   }
 
-#if ARCH_X86||ARCH_X86_64
-  vpx_reset_mmx_state();
-#endif
+  vp9_clear_system_state();
 
   if (flags & VP9D_DEMACROBLOCK) {
-    deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer,
-                               q + (deblock_level - 5) * 10, 1, 0);
+    deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
+                               q + (ppflags->deblocking_level - 5) * 10, 1, 0);
   } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q);
+    vp9_deblock(cm->frame_to_show, ppbuf, q);
   } else {
-    vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer);
+    vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
   }
 
   if (flags & VP9D_ADDNOISE) {
-    if (cm->postproc_state.last_q != q
-        || cm->postproc_state.last_noise != noise_level) {
-      fillrd(&cm->postproc_state, 63 - q, noise_level);
+    const int noise_level = ppflags->noise_level;
+    if (ppstate->last_q != q ||
+        ppstate->last_noise != noise_level) {
+      fillrd(ppstate, 63 - q, noise_level);
     }
 
-    vp9_plane_add_noise(cm->post_proc_buffer.y_buffer,
-                        cm->postproc_state.noise,
-                        cm->postproc_state.blackclamp,
-                        cm->postproc_state.whiteclamp,
-                        cm->postproc_state.bothclamp,
-                        cm->post_proc_buffer.y_width,
-                        cm->post_proc_buffer.y_height,
-                        cm->post_proc_buffer.y_stride);
+    vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+                        ppstate->whiteclamp, ppstate->bothclamp,
+                        ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
   }
 
 #if 0 && CONFIG_POSTPROC_VISUALIZER
@@ -684,16 +654,14 @@
              cm->filter_level,
              flags,
              cm->mb_cols, cm->mb_rows);
-    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
-                  cm->post_proc_buffer.y_stride);
+    vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride);
   }
 
   if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
     int i, j;
     uint8_t *y_ptr;
-    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
-    int mb_rows = post->y_height >> 4;
-    int mb_cols = post->y_width  >> 4;
+    int mb_rows = ppbuf->y_height >> 4;
+    int mb_cols = ppbuf->y_width  >> 4;
     int mb_index = 0;
     MODE_INFO *mi = cm->mi;
 
@@ -719,9 +687,8 @@
   if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
     int i, j;
     uint8_t *y_ptr;
-    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
-    int mb_rows = post->y_height >> 4;
-    int mb_cols = post->y_width  >> 4;
+    int mb_rows = ppbuf->y_height >> 4;
+    int mb_cols = ppbuf->y_width  >> 4;
     int mb_index = 0;
     MODE_INFO *mi = cm->mi;
 
@@ -755,17 +722,15 @@
     snprintf(message, sizeof(message),
              "Bitrate: %10.2f framerate: %10.2f ",
              cm->bitrate, cm->framerate);
-    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
-                  cm->post_proc_buffer.y_stride);
+    vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride);
   }
 
   /* Draw motion vectors */
   if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
-    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    uint8_t *y_buffer = cm->post_proc_buffer.y_buffer;
-    int y_stride = cm->post_proc_buffer.y_stride;
+    int width  = ppbuf->y_width;
+    int height = ppbuf->y_height;
+    uint8_t *y_buffer = ppbuf->y_buffer;
+    int y_stride = ppbuf->y_stride;
     MODE_INFO *mi = cm->mi;
     int x0, y0;
 
@@ -904,13 +869,12 @@
   if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
       && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
     int y, x;
-    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
-    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
-    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
-    int y_stride = cm->post_proc_buffer.y_stride;
+    int width  = ppbuf->y_width;
+    int height = ppbuf->y_height;
+    uint8_t *y_ptr = ppbuf->y_buffer;
+    uint8_t *u_ptr = ppbuf->u_buffer;
+    uint8_t *v_ptr = ppbuf->v_buffer;
+    int y_stride = ppbuf->y_stride;
     MODE_INFO *mi = cm->mi;
 
     for (y = 0; y < height; y += 16) {
@@ -969,13 +933,12 @@
   if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
       ppflags->display_ref_frame_flag) {
     int y, x;
-    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
-    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
-    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
-    int y_stride = cm->post_proc_buffer.y_stride;
+    int width  = ppbuf->y_width;
+    int height = ppbuf->y_height;
+    uint8_t *y_ptr = ppbuf->y_buffer;
+    uint8_t *u_ptr = ppbuf->u_buffer;
+    uint8_t *v_ptr = ppbuf->v_buffer;
+    int y_stride = ppbuf->y_stride;
     MODE_INFO *mi = cm->mi;
 
     for (y = 0; y < height; y += 16) {
@@ -1002,7 +965,7 @@
   }
 #endif
 
-  *dest = cm->post_proc_buffer;
+  *dest = *ppbuf;
 
   /* handle problem with extending borders */
   dest->y_width = cm->width;
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index c63beae..b8a456f 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -13,6 +13,7 @@
 #define VP9_COMMON_VP9_POSTPROC_H_
 
 #include "vpx_ports/mem.h"
+#include "vp9/common/vp9_ppflags.h"
 
 struct postproc_state {
   int last_q;
@@ -23,8 +24,7 @@
   DECLARE_ALIGNED(16, char, bothclamp[16]);
 };
 
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_ppflags.h"
+struct VP9Common;
 
 int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h
index 6148206..7a790c5 100644
--- a/vp9/common/vp9_prob.h
+++ b/vp9/common/vp9_prob.h
@@ -39,17 +39,12 @@
 
 typedef const vp9_tree_index vp9_tree[];
 
-/* Convert array of token occurrence counts into a table of probabilities
-   for the associated binary encoding tree.  Also writes count of branches
-   taken for each node on the tree; this facilitiates decisions as to
-   probability updates. */
-
 static INLINE vp9_prob clip_prob(int p) {
   return (p > 255) ? 255u : (p < 1) ? 1u : p;
 }
 
 // int64 is not needed for normal frame level calculations.
-// However when outputing entropy stats accumulated over many frames
+// However when outputting entropy stats accumulated over many frames
 // or even clips we can overflow int math.
 #ifdef ENTROPY_STATS
 static INLINE vp9_prob get_prob(int num, int den) {
@@ -65,7 +60,7 @@
   return get_prob(n0, n0 + n1);
 }
 
-/* this function assumes prob1 and prob2 are already within [1,255] range */
+/* This function assumes prob1 and prob2 are already within [1,255] range. */
 static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
   return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
 }
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 397f446..b5a9248 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -20,15 +20,16 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
-static void build_mc_border(const uint8_t *src, uint8_t *dst, int stride,
-                             int x, int y, int b_w, int b_h, int w, int h) {
+static void build_mc_border(const uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            int x, int y, int b_w, int b_h, int w, int h) {
   // Get a pointer to the start of the real data for this row.
-  const uint8_t *ref_row = src - x - y * stride;
+  const uint8_t *ref_row = src - x - y * src_stride;
 
   if (y >= h)
-    ref_row += (h - 1) * stride;
+    ref_row += (h - 1) * src_stride;
   else if (y > 0)
-    ref_row += y * stride;
+    ref_row += y * src_stride;
 
   do {
     int right = 0, copy;
@@ -49,16 +50,16 @@
       memset(dst, ref_row[0], left);
 
     if (copy)
-      memmove(dst + left, ref_row + x + left, copy);
+      memcpy(dst + left, ref_row + x + left, copy);
 
     if (right)
       memset(dst + left + copy, ref_row[w - 1], right);
 
-    dst += stride;
+    dst += dst_stride;
     ++y;
 
     if (y > 0 && y < h)
-      ref_row += stride;
+      ref_row += src_stride;
   } while (--b_h);
 }
 
@@ -281,7 +282,7 @@
 
     MV32 scaled_mv;
     int xs, ys, x0, y0, x0_16, y0_16, x1, y1, frame_width,
-        frame_height, subpel_x, subpel_y;
+        frame_height, subpel_x, subpel_y, buf_stride;
     uint8_t *ref_frame, *buf_ptr;
     const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
 
@@ -308,7 +309,7 @@
       scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
       xs = sf->x_step_q4;
       ys = sf->y_step_q4;
-      // Get block position in the scaled reference frame.
+      // Map the top left corner of the block into the reference frame.
       x0 = sf->scale_value_x(x0, sf);
       y0 = sf->scale_value_y(y0, sf);
       x0_16 = sf->scale_value_x(x0_16, sf);
@@ -321,7 +322,7 @@
     subpel_x = scaled_mv.col & SUBPEL_MASK;
     subpel_y = scaled_mv.row & SUBPEL_MASK;
 
-    // Get reference block top left coordinate.
+    // Calculate the top left corner of the best matching block in the reference frame.
     x0 += scaled_mv.col >> SUBPEL_BITS;
     y0 += scaled_mv.row >> SUBPEL_BITS;
     x0_16 += scaled_mv.col;
@@ -329,24 +330,28 @@
 
     // Get reference block bottom right coordinate.
     x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
-    y1 = ((y0_16 + (h - 1) * xs) >> SUBPEL_BITS) + 1;
+    y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
 
     // Get reference block pointer.
     buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+    buf_stride = pre_buf->stride;
 
-    // Do border extension if there is motion or
+    // Do border extension if there is motion or the
     // width/height is not a multiple of 8 pixels.
     if (scaled_mv.col || scaled_mv.row ||
         (frame_width & 0x7) || (frame_height & 0x7)) {
+      int x_pad = 0, y_pad = 0;
 
-      if (subpel_x) {
+      if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
         x0 -= VP9_INTERP_EXTEND - 1;
         x1 += VP9_INTERP_EXTEND;
+        x_pad = 1;
       }
 
-      if (subpel_y) {
+      if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
         y0 -= VP9_INTERP_EXTEND - 1;
         y1 += VP9_INTERP_EXTEND;
+        y_pad = 1;
       }
 
       // Skip border extension if block is inside the frame.
@@ -354,12 +359,14 @@
           y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
         uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
         // Extend the border.
-        build_mc_border(buf_ptr1, buf_ptr1, pre_buf->stride, x0, y0, x1 - x0,
-                        y1 - y0, frame_width, frame_height);
+        build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0,
+                        x0, y0, x1 - x0, y1 - y0, frame_width, frame_height);
+        buf_stride = x1 - x0;
+        buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
       }
     }
 
-    inter_predictor(buf_ptr, pre_buf->stride, dst, dst_buf->stride, subpel_x,
+    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
                     subpel_y, sf, w, h, ref, &xd->subpix, xs, ys);
   }
 }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index a79b45c..c2468c1 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -57,7 +57,7 @@
 specialize vp9_v_predictor_4x4 $sse_x86inc neon
 
 prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_4x4 $sse_x86inc dspr2
+specialize vp9_tm_predictor_4x4 $sse_x86inc neon dspr2
 
 prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_predictor_4x4 $sse_x86inc dspr2
@@ -756,9 +756,5 @@
 prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"
 specialize vp9_temporal_filter_apply sse2
 
-prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction"
-specialize vp9_yv12_copy_partial_frame
-
-
 fi
 # end encoder functions
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index c813781..2eb99ea 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -357,9 +357,9 @@
 }
 
 static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
-                             int_mv mv[2], int_mv best_mv[2],
-                             int_mv nearest_mv[2], int_mv near_mv[2],
-                             int is_compound, int allow_hp, vp9_reader *r) {
+                            int_mv mv[2], int_mv ref_mv[2],
+                            int_mv nearest_mv[2], int_mv near_mv[2],
+                            int is_compound, int allow_hp, vp9_reader *r) {
   int i;
   int ret = 1;
 
@@ -367,10 +367,10 @@
     case NEWMV: {
       nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
                                             NULL : &cm->counts.mv;
-      read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv,
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv,
               &cm->fc.nmvc, mv_counts, allow_hp);
       if (is_compound)
-        read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv,
+        read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv,
                 &cm->fc.nmvc, mv_counts, allow_hp);
       for (i = 0; i < 1 + is_compound; ++i) {
         ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW;
@@ -380,17 +380,20 @@
     }
     case NEARESTMV: {
       mv[0].as_int = nearest_mv[0].as_int;
-      if (is_compound) mv[1].as_int = nearest_mv[1].as_int;
+      if (is_compound)
+        mv[1].as_int = nearest_mv[1].as_int;
       break;
     }
     case NEARMV: {
       mv[0].as_int = near_mv[0].as_int;
-      if (is_compound) mv[1].as_int = near_mv[1].as_int;
+      if (is_compound)
+        mv[1].as_int = near_mv[1].as_int;
       break;
     }
     case ZEROMV: {
       mv[0].as_int = 0;
-      if (is_compound) mv[1].as_int = 0;
+      if (is_compound)
+        mv[1].as_int = 0;
       break;
     }
     default: {
@@ -423,7 +426,7 @@
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
 
-  int_mv nearest[2], nearmv[2], best[2];
+  int_mv nearestmv[2], nearmv[2];
   int inter_mode_ctx, ref, is_compound;
 
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
@@ -452,8 +455,7 @@
   if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       vp9_find_best_ref_mvs(xd, allow_hp, mbmi->ref_mvs[mbmi->ref_frame[ref]],
-                            &nearest[ref], &nearmv[ref]);
-      best[ref].as_int = nearest[ref].as_int;
+                            &nearestmv[ref], &nearmv[ref]);
     }
   }
 
@@ -466,6 +468,7 @@
     const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];  // 1 or 2
     int idx, idy;
     int b_mode;
+    int_mv nearest_sub8x8[2], near_sub8x8[2];
     for (idy = 0; idy < 2; idy += num_4x4_h) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         int_mv block[2];
@@ -475,9 +478,11 @@
         if (b_mode == NEARESTMV || b_mode == NEARMV)
           for (ref = 0; ref < 1 + is_compound; ++ref)
             vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col,
-                                          &nearest[ref], &nearmv[ref]);
+                                          &nearest_sub8x8[ref],
+                                          &near_sub8x8[ref]);
 
-        if (!assign_mv(cm, b_mode, block, best, nearest, nearmv,
+        if (!assign_mv(cm, b_mode, block, nearestmv,
+                       nearest_sub8x8, near_sub8x8,
                        is_compound, allow_hp, r)) {
           xd->corrupted |= 1;
           break;
@@ -499,9 +504,8 @@
     mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   } else {
-    xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv,
-                                best, nearest, nearmv,
-                                is_compound, allow_hp, r);
+    xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv,
+                                nearestmv, nearmv, is_compound, allow_hp, r);
   }
 }
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 69c569d..ec4dc14 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -39,11 +39,7 @@
 #endif
 
 #ifdef ENTROPY_STATS
-int intra_mode_stats[INTRA_MODES]
-                    [INTRA_MODES]
-                    [INTRA_MODES];
 vp9_coeff_stats tree_update_hist[TX_SIZES][PLANE_TYPES];
-
 extern unsigned int active_section;
 #endif
 
@@ -414,9 +410,6 @@
         const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i);
         const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, i);
         const int bm = m->bmi[i].as_mode;
-#ifdef ENTROPY_STATS
-        ++intra_mode_stats[A][L][bm];
-#endif
         write_intra_mode(bc, bm, vp9_kf_y_mode_prob[A][L]);
       }
     }
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index c011948..c1b9581 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -116,6 +116,7 @@
   unsigned int source_variance;
   unsigned int pred_sse[MAX_REF_FRAMES];
   int pred_mv_sad[MAX_REF_FRAMES];
+  int mode_sad[MAX_REF_FRAMES][INTER_MODES + 1];
 
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 6894f55..a66b9fb 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -27,19 +27,19 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_vaq.h"
 
-
 #define DBG_PRNT_SEGMAP 0
 
 
@@ -78,21 +78,19 @@
 
 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
 
-/* activity_avg must be positive, or flat regions could get a zero weight
- *  (infinite lambda), which confounds analysis.
- * This also avoids the need for divide by zero checks in
- *  vp9_activity_masking().
- */
+// activity_avg must be positive, or flat regions could get a zero weight
+//  (infinite lambda), which confounds analysis.
+// This also avoids the need for divide by zero checks in
+//  vp9_activity_masking().
 #define ACTIVITY_AVG_MIN (64)
 
-/* Motion vector component magnitude threshold for defining fast motion. */
+// Motion vector component magnitude threshold for defining fast motion.
 #define FAST_MOTION_MV_THRESH (24)
 
-/* This is used as a reference when computing the source variance for the
- *  purposes of activity masking.
- * Eventually this should be replaced by custom no-reference routines,
- *  which will be faster.
- */
+// This is used as a reference when computing the source variance for the
+//  purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+//  which will be faster.
 static const uint8_t VP9_VAR_OFFS[64] = {
   128, 128, 128, 128, 128, 128, 128, 128,
   128, 128, 128, 128, 128, 128, 128, 128,
@@ -114,7 +112,6 @@
 
 // Original activity measure from Tim T's code.
 static unsigned int tt_activity_measure(MACROBLOCK *x) {
-  unsigned int act;
   unsigned int sse;
   /* TODO: This could also be done over smaller areas (8x8), but that would
    *  require extensive changes elsewhere, as lambda is assumed to be fixed
@@ -123,13 +120,12 @@
    *  lambda using a non-linear combination (e.g., the smallest, or second
    *  smallest, etc.).
    */
-  act = vp9_variance16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                          VP9_VAR_OFFS, 0, &sse);
-  act <<= 4;
-
-  /* If the region is flat, lower the activity some more. */
-  if (act < 8 << 12)
-    act = act < 5 << 12 ? act : 5 << 12;
+  unsigned int act = vp9_variance16x16(x->plane[0].src.buf,
+                                       x->plane[0].src.stride,
+                                       VP9_VAR_OFFS, 0, &sse) << 4;
+  // If the region is flat, lower the activity some more.
+  if (act < (8 << 12))
+    act = MIN(act, 5 << 12);
 
   return act;
 }
@@ -146,7 +142,7 @@
   unsigned int mb_activity;
 
   if (ALT_ACT_MEASURE) {
-    int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+    const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
     // Or use and alternative.
     mb_activity = alt_activity_measure(x, use_dc_pred);
@@ -155,10 +151,7 @@
     mb_activity = tt_activity_measure(x);
   }
 
-  if (mb_activity < ACTIVITY_AVG_MIN)
-    mb_activity = ACTIVITY_AVG_MIN;
-
-  return mb_activity;
+  return MAX(mb_activity, ACTIVITY_AVG_MIN);
 }
 
 // Calculate an "average" mb activity value for the frame
@@ -340,13 +333,11 @@
   x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
   x->errorperbit += (x->errorperbit == 0);
 #else
-  int64_t a;
-  int64_t b;
-  int64_t act = *(x->mb_activity_ptr);
+  const int64_t act = *(x->mb_activity_ptr);
 
   // Apply the masking to the RD multiplier.
-  a = act + (2 * cpi->activity_avg);
-  b = (2 * act) + cpi->activity_avg;
+  const int64_t a = act + (2 * cpi->activity_avg);
+  const int64_t b = (2 * act) + cpi->activity_avg;
 
   x->rdmult = (unsigned int) (((int64_t) x->rdmult * b + (a >> 1)) / a);
   x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
@@ -415,7 +406,7 @@
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
   MODE_INFO *mi_addr = xd->mi_8x8[0];
 
-  int mb_mode_index = ctx->best_mode_index;
+  const int mb_mode_index = ctx->best_mode_index;
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
@@ -506,8 +497,8 @@
   } else {
     // Note how often each mode chosen as best
     cpi->mode_chosen_counts[mb_mode_index]++;
-    if (is_inter_block(mbmi)
-        && (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
+    if (is_inter_block(mbmi) &&
+        (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
       int_mv best_mv[2];
       const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
       const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
@@ -560,7 +551,6 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
-  const int dst_fb_idx = cm->new_fb_idx;
   const int idx_str = xd->mode_info_stride * mi_row + mi_col;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
@@ -587,7 +577,7 @@
   mbmi = &xd->mi_8x8[0]->mbmi;
 
   // Set up destination pointers
-  setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col);
+  setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
 
   // Set up limit values for MV components
   // mv beyond the range do not produce new/different prediction block
@@ -611,15 +601,15 @@
   /* segment ID */
   if (seg->enabled) {
     if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
-      uint8_t *map = seg->update_map ? cpi->segmentation_map
-          : cm->last_frame_seg_map;
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
       mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
     vp9_mb_init_quantizer(cpi, x);
 
-    if (seg->enabled && cpi->seg0_cnt > 0
-        && !vp9_segfeature_active(seg, 0, SEG_LVL_REF_FRAME)
-        && vp9_segfeature_active(seg, 1, SEG_LVL_REF_FRAME)) {
+    if (seg->enabled && cpi->seg0_cnt > 0 &&
+        !vp9_segfeature_active(seg, 0, SEG_LVL_REF_FRAME) &&
+        vp9_segfeature_active(seg, 1, SEG_LVL_REF_FRAME)) {
       cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
     } else {
       const int y = mb_row & ~3;
@@ -640,11 +630,11 @@
   }
 }
 
-static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                          int mi_row, int mi_col,
-                          int *totalrate, int64_t *totaldist,
-                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                          int64_t best_rd) {
+static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+                             int mi_row, int mi_col,
+                             int *totalrate, int64_t *totaldist,
+                             BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                             int64_t best_rd) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -688,13 +678,8 @@
   x->source_variance = get_sby_perpixel_variance(cpi, x, bsize);
 
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-    int energy;
-    if (bsize <= BLOCK_16X16) {
-      energy = x->mb_energy;
-    } else {
-      energy = vp9_block_energy(cpi, x, bsize);
-    }
-
+    const int energy = bsize <= BLOCK_16X16 ? x->mb_energy
+                                            : vp9_block_energy(cpi, x, bsize);
     xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy);
     rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
     vp9_mb_init_quantizer(cpi, x);
@@ -958,7 +943,7 @@
 static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
                                       int rows_left, int cols_left,
                                       int *bh, int *bw) {
-  if ((rows_left <= 0) || (cols_left <= 0)) {
+  if (rows_left <= 0 || cols_left <= 0) {
     return MIN(bsize, BLOCK_8X8);
   } else {
     for (; bsize > 0; --bsize) {
@@ -985,7 +970,7 @@
   int row8x8_remaining = tile->mi_row_end - mi_row;
   int col8x8_remaining = tile->mi_col_end - mi_col;
   int block_row, block_col;
-  MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col;
+  MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
   int bh = num_8x8_blocks_high_lookup[bsize];
   int bw = num_8x8_blocks_wide_lookup[bsize];
 
@@ -1024,12 +1009,10 @@
 
   for (block_row = 0; block_row < 8; ++block_row) {
     for (block_col = 0; block_col < 8; ++block_col) {
-      MODE_INFO *prev_mi = prev_mi_8x8[block_row * mis + block_col];
-      BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
-      ptrdiff_t offset;
-
+      MODE_INFO *const prev_mi = prev_mi_8x8[block_row * mis + block_col];
+      const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
       if (prev_mi) {
-        offset = prev_mi - cm->prev_mi;
+        const ptrdiff_t offset = prev_mi - cm->prev_mi;
         mi_8x8[block_row * mis + block_col] = cm->mi + offset;
         mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type;
       }
@@ -1037,14 +1020,14 @@
   }
 }
 
-static int sb_has_motion(VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) {
+static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) {
   const int mis = cm->mode_info_stride;
   int block_row, block_col;
 
   if (cm->prev_mi) {
     for (block_row = 0; block_row < 8; ++block_row) {
       for (block_col = 0; block_col < 8; ++block_col) {
-        MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col];
+        const MODE_INFO *prev_mi = prev_mi_8x8[block_row * mis + block_col];
         if (prev_mi) {
           if (abs(prev_mi->mbmi.mv[0].as_mv.row) >= 8 ||
               abs(prev_mi->mbmi.mv[0].as_mv.col) >= 8)
@@ -1056,6 +1039,132 @@
   return 0;
 }
 
+// TODO(jingning) This currently serves as a test framework for non-RD mode
+// decision. To be continued on optimizing the partition type decisions.
+//
+// Picks modes for the block at (mi_row, mi_col) following the partitioning
+// implied by mi_8x8[0]->mbmi.sb_type, recursing into the quadrants for
+// PARTITION_SPLIT. The summed rate/distortion is stored in *rate / *dist
+// (left untouched when the block starts outside the frame). If do_recon is
+// non-zero the block is then encoded with encode_sb().
+static void pick_partition_type(VP9_COMP *cpi, const TileInfo *const tile,
+                                MODE_INFO **mi_8x8, TOKENEXTRA **tp,
+                                int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                int *rate, int64_t *dist, int do_recon) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  const int mi_stride = cm->mode_info_stride;
+  const int num_8x8_subsize = (num_8x8_blocks_wide_lookup[bsize] >> 1);
+  PARTITION_TYPE partition = PARTITION_NONE;
+  BLOCK_SIZE subsize;
+  BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
+  int sub_rate[4] = {0};
+  int64_t sub_dist[4] = {0};
+  int mi_offset;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  partition = partition_lookup[b_width_log2(bsize)][bs_type];
+  subsize = get_subsize(bsize, partition);
+
+  if (bsize < BLOCK_8X8) {
+    // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
+    // there is nothing to be done.
+    if (x->ab_index != 0) {
+      *rate = 0;
+      *dist = 0;
+      return;
+    }
+  } else {
+    *(get_sb_partitioning(x, bsize)) = subsize;
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, rate, dist,
+                       bsize, get_block_context(x, bsize), INT64_MAX);
+      break;
+    case PARTITION_HORZ:
+      *get_sb_index(x, subsize) = 0;
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
+                       subsize, get_block_context(x, subsize), INT64_MAX);
+      if (bsize >= BLOCK_8X8 && mi_row + num_8x8_subsize < cm->mi_rows) {
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *get_sb_index(x, subsize) = 1;
+        rd_pick_sb_modes(cpi, tile, mi_row + num_8x8_subsize, mi_col,
+                         &sub_rate[1], &sub_dist[1], subsize,
+                         get_block_context(x, subsize), INT64_MAX);
+      }
+      *rate = sub_rate[0] + sub_rate[1];
+      *dist = sub_dist[0] + sub_dist[1];
+      break;
+    case PARTITION_VERT:
+      *get_sb_index(x, subsize) = 0;
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
+                       subsize, get_block_context(x, subsize), INT64_MAX);
+      if (bsize >= BLOCK_8X8 && mi_col + num_8x8_subsize < cm->mi_cols) {
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *get_sb_index(x, subsize) = 1;
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + num_8x8_subsize,
+                         &sub_rate[1], &sub_dist[1], subsize,
+                         get_block_context(x, subsize), INT64_MAX);
+      }
+      *rate = sub_rate[0] + sub_rate[1];
+      *dist = sub_dist[0] + sub_dist[1];
+      break;
+    case PARTITION_SPLIT:
+      *get_sb_index(x, subsize) = 0;
+      pick_partition_type(cpi, tile, mi_8x8, tp, mi_row, mi_col, subsize,
+                          &sub_rate[0], &sub_dist[0], 0);
+
+      if ((mi_col + num_8x8_subsize) < cm->mi_cols) {
+        *get_sb_index(x, subsize) = 1;
+        pick_partition_type(cpi, tile, mi_8x8 + num_8x8_subsize, tp,
+                            mi_row, mi_col + num_8x8_subsize, subsize,
+                            &sub_rate[1], &sub_dist[1], 0);
+      }
+
+      if ((mi_row + num_8x8_subsize) < cm->mi_rows) {
+        *get_sb_index(x, subsize) = 2;
+        pick_partition_type(cpi, tile, mi_8x8 + num_8x8_subsize * mi_stride, tp,
+                            mi_row + num_8x8_subsize, mi_col, subsize,
+                            &sub_rate[2], &sub_dist[2], 0);
+      }
+
+      if ((mi_col + num_8x8_subsize) < cm->mi_cols &&
+          (mi_row + num_8x8_subsize) < cm->mi_rows) {
+        *get_sb_index(x, subsize) = 3;
+        mi_offset = num_8x8_subsize * mi_stride + num_8x8_subsize;
+        pick_partition_type(cpi, tile, mi_8x8 + mi_offset, tp,
+                            mi_row + num_8x8_subsize, mi_col + num_8x8_subsize,
+                            subsize, &sub_rate[3], &sub_dist[3], 0);
+      }
+
+      // Assign rather than accumulate so the outputs need no pre-seeding.
+      *rate = sub_rate[0] + sub_rate[1] + sub_rate[2] + sub_rate[3];
+      *dist = sub_dist[0] + sub_dist[1] + sub_dist[2] + sub_dist[3];
+      break;
+    default:
+      assert(0);
+  }
+
+  if (do_recon) {
+    const int output_enabled = (bsize == BLOCK_64X64);
+
+    // Check the projected output rate for this SB against its target
+    // and, if necessary, apply a Q delta using segmentation to get
+    // closer to the target.
+    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+      select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, *rate);
+    }
+
+    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
+  }
+}
+
 static void rd_use_partition(VP9_COMP *cpi,
                              const TileInfo *const tile,
                              MODE_INFO **mi_8x8,
@@ -1065,12 +1174,12 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   const int mis = cm->mode_info_stride;
-  int bsl = b_width_log2(bsize);
+  const int bsl = b_width_log2(bsize);
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
-  int ms = num_4x4_blocks_wide / 2;
-  int mh = num_4x4_blocks_high / 2;
-  int bss = (1 << bsl) / 4;
+  const int ms = num_4x4_blocks_wide / 2;
+  const int mh = num_4x4_blocks_high / 2;
+  const int bss = (1 << bsl) / 4;
   int i, pl;
   PARTITION_TYPE partition = PARTITION_NONE;
   BLOCK_SIZE subsize;
@@ -1092,7 +1201,6 @@
     return;
 
   partition = partition_lookup[bsl][bs_type];
-
   subsize = get_subsize(bsize, partition);
 
   if (bsize < BLOCK_8X8) {
@@ -1136,8 +1244,8 @@
         mi_row + (ms >> 1) < cm->mi_rows &&
         mi_col + (ms >> 1) < cm->mi_cols) {
       *(get_sb_partitioning(x, bsize)) = bsize;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
-                    get_block_context(x, bsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
+                       get_block_context(x, bsize), INT64_MAX);
 
       pl = partition_plane_context(cpi->above_seg_context,
                                    cpi->left_seg_context,
@@ -1152,13 +1260,15 @@
 
   switch (partition) {
     case PARTITION_NONE:
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    bsize, get_block_context(x, bsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, bsize,
+                       get_block_context(x, bsize), INT64_MAX);
       break;
     case PARTITION_HORZ:
       *get_sb_index(x, subsize) = 0;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    subsize, get_block_context(x, subsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, subsize,
+                       get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
         int rt = 0;
@@ -1166,8 +1276,8 @@
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *get_sb_index(x, subsize) = 1;
-        pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
-                      get_block_context(x, subsize), INT64_MAX);
+        rd_pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt,
+                         subsize, get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT_MAX) {
           last_part_rate = INT_MAX;
           last_part_dist = INT_MAX;
@@ -1180,8 +1290,9 @@
       break;
     case PARTITION_VERT:
       *get_sb_index(x, subsize) = 0;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    subsize, get_block_context(x, subsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, subsize,
+                       get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
         int rt = 0;
@@ -1189,8 +1300,8 @@
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *get_sb_index(x, subsize) = 1;
-        pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
-                      get_block_context(x, subsize), INT64_MAX);
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt,
+                         subsize, get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT_MAX) {
           last_part_rate = INT_MAX;
           last_part_dist = INT_MAX;
@@ -1264,9 +1375,9 @@
 
       save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
-      pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
-                    split_subsize, get_block_context(x, split_subsize),
-                    INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
+                       split_subsize, get_block_context(x, split_subsize),
+                       INT64_MAX);
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
@@ -1630,8 +1741,8 @@
 
   // PARTITION_NONE
   if (partition_none_allowed) {
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
-                  get_block_context(x, bsize), best_rd);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
+                     get_block_context(x, bsize), best_rd);
     if (this_rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
         pl = partition_plane_context(cpi->above_seg_context,
@@ -1741,8 +1852,8 @@
         partition_none_allowed)
       get_block_context(x, subsize)->pred_filter_type =
           get_block_context(x, bsize)->mic.mbmi.interp_filter;
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                  get_block_context(x, subsize), best_rd);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+                     get_block_context(x, subsize), best_rd);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
 
     if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
@@ -1756,9 +1867,9 @@
           partition_none_allowed)
         get_block_context(x, subsize)->pred_filter_type =
             get_block_context(x, bsize)->mic.mbmi.interp_filter;
-      pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
-                    &this_dist, subsize, get_block_context(x, subsize),
-                    best_rd - sum_rd);
+      rd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
+                       &this_dist, subsize, get_block_context(x, subsize),
+                       best_rd - sum_rd);
       if (this_rate == INT_MAX) {
         sum_rd = INT64_MAX;
       } else {
@@ -1794,8 +1905,8 @@
         partition_none_allowed)
       get_block_context(x, subsize)->pred_filter_type =
           get_block_context(x, bsize)->mic.mbmi.interp_filter;
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                  get_block_context(x, subsize), best_rd);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+                     get_block_context(x, subsize), best_rd);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
     if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
       update_state(cpi, get_block_context(x, subsize), subsize, 0);
@@ -1808,9 +1919,9 @@
           partition_none_allowed)
         get_block_context(x, subsize)->pred_filter_type =
             get_block_context(x, bsize)->mic.mbmi.interp_filter;
-      pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
-                    &this_dist, subsize, get_block_context(x, subsize),
-                    best_rd - sum_rd);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
+                       &this_dist, subsize, get_block_context(x, subsize),
+                       best_rd - sum_rd);
       if (this_rate == INT_MAX) {
         sum_rd = INT64_MAX;
       } else {
@@ -1881,8 +1992,8 @@
   if ((mi_row + (ms >> 1) < cm->mi_rows) &&
       (mi_col + (ms >> 1) < cm->mi_cols)) {
     cpi->set_ref_frame_mask = 1;
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
-                  get_block_context(x, BLOCK_64X64), INT64_MAX);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
+                     get_block_context(x, BLOCK_64X64), INT64_MAX);
     pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
                                  mi_row, mi_col, BLOCK_64X64);
     r += x->partition_cost[pl][PARTITION_NONE];
@@ -1894,6 +2005,34 @@
   restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
 }
 
+// Codes one superblock row of a tile: for each 64x64 block it calls
+// set_partitioning() and then pick_partition_type() with do_recon set.
+static void encode_sb_row_rt(VP9_COMP *cpi, const TileInfo *const tile,
+                             int mi_row, TOKENEXTRA **tp) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mi_col;
+
+  cpi->sf.always_this_block_size = BLOCK_8X8;
+  // Initialize the left context for the new SB row
+  vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
+  vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
+
+  // Code each SB in the row
+  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+       mi_col += MI_BLOCK_SIZE) {
+    int dummy_rate = 0;      // Results are discarded; zero-seeded so that
+    int64_t dummy_dist = 0;  // no indeterminate value is ever read.
+    const int idx_str = cm->mode_info_stride * mi_row + mi_col;
+    MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
+
+    vp9_zero(cpi->mb.pred_mv);
+    set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+    set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
+    pick_partition_type(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+                        &dummy_rate, &dummy_dist, 1);
+  }
+}
+
 static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
                           int mi_row, TOKENEXTRA **tp) {
   VP9_COMMON *const cm = &cpi->common;
@@ -2120,7 +2259,11 @@
           vp9_tile_init(&tile, cm, tile_row, tile_col);
           for (mi_row = tile.mi_row_start;
                mi_row < tile.mi_row_end; mi_row += 8)
+#if 1
             encode_sb_row(cpi, &tile, mi_row, &tp);
+#else
+            encode_sb_row_rt(cpi, &tile, mi_row, &tp);
+#endif
 
           cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -2262,16 +2405,14 @@
 }
 
 static int get_frame_type(VP9_COMP *cpi) {
-  int frame_type;
   if (frame_is_intra_only(&cpi->common))
-    frame_type = 0;
+    return 0;  // intra-only frame
   else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame)
-    frame_type = 3;
+    return 3;  // is_src_frame_alt_ref together with a golden refresh
   else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
-    frame_type = 1;
+    return 1;  // any other golden or alt-ref refresh
   else
-    frame_type = 2;
-  return frame_type;
+    return 2;  // every remaining frame
 }
 
 static void select_tx_mode(VP9_COMP *cpi) {
@@ -2312,10 +2453,10 @@
   // side behavior is where the ALT ref buffer has opposite sign bias to
   // the other two.
   if (!frame_is_intra_only(cm)) {
-    if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
-         == cm->ref_frame_sign_bias[GOLDEN_FRAME])
-        || (cm->ref_frame_sign_bias[ALTREF_FRAME]
-            == cm->ref_frame_sign_bias[LAST_FRAME])) {
+    if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+             cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
+        (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+             cm->ref_frame_sign_bias[LAST_FRAME])) {
       cm->allow_comp_inter_inter = 0;
     } else {
       cm->allow_comp_inter_inter = 1;
@@ -2398,8 +2539,7 @@
       int64_t pd = cpi->rd_tx_select_diff[i];
       int diff;
       if (i == TX_MODE_SELECT)
-        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
-                     2048 * (TX_SIZES - 1), 0);
+        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZES - 1), 0);
       diff = (int) (pd / cm->MBs);
       cpi->rd_tx_select_threshes[frame_type][i] += diff;
       cpi->rd_tx_select_threshes[frame_type][i] /= 2;
@@ -2463,12 +2603,12 @@
   }
 }
 
-static void sum_intra_stats(VP9_COMMON *cm, const MODE_INFO *mi) {
+static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
   const MB_PREDICTION_MODE y_mode = mi->mbmi.mode;
   const MB_PREDICTION_MODE uv_mode = mi->mbmi.uv_mode;
   const BLOCK_SIZE bsize = mi->mbmi.sb_type;
 
-  ++cm->counts.uv_mode[y_mode][uv_mode];
+  ++counts->uv_mode[y_mode][uv_mode];
 
   if (bsize < BLOCK_8X8) {
     int idx, idy;
@@ -2476,9 +2616,9 @@
     const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
     for (idy = 0; idy < 2; idy += num_4x4_blocks_high)
       for (idx = 0; idx < 2; idx += num_4x4_blocks_wide)
-        ++cm->counts.y_mode[0][mi->bmi[idy * 2 + idx].as_mode];
+        ++counts->y_mode[0][mi->bmi[idy * 2 + idx].as_mode];
   } else {
-    ++cm->counts.y_mode[size_group_lookup[bsize]][y_mode];
+    ++counts->y_mode[size_group_lookup[bsize]][y_mode];
   }
 }
 
@@ -2503,7 +2643,7 @@
 #endif
 }
 
-static int get_zbin_mode_boost(MB_MODE_INFO *mbmi, int enabled) {
+static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) {
   if (enabled) {
     if (is_inter_block(mbmi)) {
       if (mbmi->mode == ZEROMV) {
@@ -2523,9 +2663,9 @@
 
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD * const xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO **mi_8x8 = xd->mi_8x8;
   MODE_INFO *mi = mi_8x8[0];
   MB_MODE_INFO *mbmi = &mi->mbmi;
@@ -2565,10 +2705,11 @@
   }
 
   if (!is_inter_block(mbmi)) {
+    mbmi->skip_coeff = 1;
     vp9_encode_intra_block_y(x, MAX(bsize, BLOCK_8X8));
     vp9_encode_intra_block_uv(x, MAX(bsize, BLOCK_8X8));
     if (output_enabled)
-      sum_intra_stats(cm, mi);
+      sum_intra_stats(&cm->counts, mi);
   } else {
     int ref;
     const int is_compound = has_second_ref(mbmi);
@@ -2583,6 +2724,7 @@
   if (!is_inter_block(mbmi)) {
     vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   } else if (!x->skip) {
+    mbmi->skip_coeff = 1;
     vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
     vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   } else {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 21bc588..4bef675 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -451,6 +451,9 @@
     ctx->tl[plane][j] = p->eobs[block] > 0;
   }
 
+  if (p->eobs[block])
+    *(args->skip_coeff) = 0;
+
   if (x->skip_encode || p->eobs[block] == 0)
     return;
 
@@ -474,7 +477,6 @@
       assert(0 && "Invalid transform size");
   }
 }
-
 static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size, void *arg) {
   struct encode_b_args *const args = arg;
@@ -499,7 +501,8 @@
 void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
 
   vp9_subtract_sby(x, bsize);
   if (x->optimize)
@@ -511,7 +514,8 @@
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
 
   if (!x->skip_recode)
     vp9_subtract_sb(x, bsize);
@@ -655,12 +659,15 @@
     default:
       assert(0);
   }
+  if (*eob)
+    *(args->skip_coeff) = 0;
 }
 
 void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
 
   foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra,
                                      &arg);
@@ -668,7 +675,8 @@
 void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;  // owns skip_coeff passed below
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip_coeff};
   foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg);
 }
 
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index cb872a7..207d573 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -24,6 +24,7 @@
 struct encode_b_args {
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
+  unsigned char *skip_coeff;  // set to 0 once a block has a nonzero eob
 };
 
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 7c4ca63..0a5af18 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -49,9 +49,6 @@
 
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
-#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
-#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
-
 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
   YV12_BUFFER_CONFIG temp = *a;
   *a = *b;
@@ -269,20 +266,15 @@
 // harder frames.
 static double calculate_modified_err(VP9_COMP *cpi,
                                      FIRSTPASS_STATS *this_frame) {
-  const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
+  const struct twopass_rc *const two_pass = &cpi->twopass;
+  const FIRSTPASS_STATS *const stats = &two_pass->total_stats;
   const double av_err = stats->ssim_weighted_pred_err / stats->count;
-  const double this_err = this_frame->ssim_weighted_pred_err;
-  double modified_error;
+  // Frame error as a ratio of the average error over total_stats.
+  const double rel_err = this_frame->ssim_weighted_pred_err /
+                             DOUBLE_DIVIDE_CHECK(av_err);
 
-  modified_error =  av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err),
-                                 this_err > av_err ? POW1 : POW2);
-
-  if (modified_error < cpi->twopass.modified_error_min)
-    modified_error = cpi->twopass.modified_error_min;
-  else if (modified_error > cpi->twopass.modified_error_max)
-    modified_error = cpi->twopass.modified_error_max;
-
-  return modified_error;
+  return fclamp(av_err * pow(rel_err, cpi->oxcf.two_pass_vbrbias / 100.0),
+                two_pass->modified_error_min, two_pass->modified_error_max);
 }
 
 static const double weight_table[256] = {
@@ -353,13 +345,14 @@
 // This function returns the maximum target rate per frame.
 static int frame_max_bits(VP9_COMP *cpi) {
   int64_t max_bits =
-     ((int64_t)cpi->rc.av_per_frame_bandwidth *
-      (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+    ((int64_t)cpi->rc.av_per_frame_bandwidth *
+     (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
 
   if (max_bits < 0)
-    return 0;
-  if (max_bits >= INT_MAX)
-    return INT_MAX;
+    max_bits = 0;  // a negative product yields no budget
+  else if (max_bits > cpi->rc.max_frame_bandwidth)
+    max_bits = cpi->rc.max_frame_bandwidth;  // cap at the rc frame maximum
+
   return (int)max_bits;
 }
 
@@ -714,7 +707,7 @@
           mv.as_mv.row *= 8;
           mv.as_mv.col *= 8;
           this_error = motion_error;
-          vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
+          vp9_set_mbmode_and_mvs(xd, NEWMV, &mv.as_mv);
           xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
           xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
           xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
@@ -793,58 +786,48 @@
 
   vp9_clear_system_state();  // __asm emms;
   {
-    double weight = 0.0;
-
     FIRSTPASS_STATS fps;
 
-    fps.frame      = cm->current_video_frame;
-    fps.intra_error = (double)(intra_error >> 8);
-    fps.coded_error = (double)(coded_error >> 8);
-    fps.sr_coded_error = (double)(sr_coded_error >> 8);
-    weight = simple_weight(cpi->Source);
-
-
-    if (weight < 0.1)
-      weight = 0.1;
-
-    fps.ssim_weighted_pred_err = fps.coded_error * weight;
-
-    fps.pcnt_inter  = 0.0;
+    fps.frame = cm->current_video_frame;
+    fps.intra_error = intra_error >> 8;
+    fps.coded_error = coded_error >> 8;
+    fps.sr_coded_error = sr_coded_error >> 8;
+    fps.ssim_weighted_pred_err = fps.coded_error *
+                                     MAX(0.1, simple_weight(cpi->Source));
+    fps.pcnt_inter = 0.0;
     fps.pcnt_motion = 0.0;
-    fps.MVr        = 0.0;
-    fps.mvr_abs     = 0.0;
-    fps.MVc        = 0.0;
-    fps.mvc_abs     = 0.0;
-    fps.MVrv       = 0.0;
-    fps.MVcv       = 0.0;
-    fps.mv_in_out_count  = 0.0;
+    fps.MVr = 0.0;
+    fps.mvr_abs = 0.0;
+    fps.MVc = 0.0;
+    fps.mvc_abs = 0.0;
+    fps.MVrv = 0.0;
+    fps.MVcv = 0.0;
+    fps.mv_in_out_count = 0.0;
     fps.new_mv_count = 0.0;
-    fps.count      = 1.0;
+    fps.count = 1.0;
 
-    fps.pcnt_inter   = 1.0 * (double)intercount / cm->MBs;
-    fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
-    fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
+    fps.pcnt_inter = (double)intercount / cm->MBs;
+    fps.pcnt_second_ref = (double)second_ref_count / cm->MBs;
+    fps.pcnt_neutral = (double)neutral_count / cm->MBs;
 
     if (mvcount > 0) {
-      fps.MVr = (double)sum_mvr / (double)mvcount;
-      fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
-      fps.MVc = (double)sum_mvc / (double)mvcount;
-      fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
-      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) /
-                 (double)mvcount;
-      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) /
-                 (double)mvcount;
-      fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
+      fps.MVr = (double)sum_mvr / mvcount;
+      fps.mvr_abs = (double)sum_mvr_abs / mvcount;
+      fps.MVc = (double)sum_mvc / mvcount;
+      fps.mvc_abs = (double)sum_mvc_abs / mvcount;
+      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / mvcount)) /
+                     mvcount;
+      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) /
+                     mvcount;
+      fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
       fps.new_mv_count = new_mv_count;
-
       fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
     }
 
     // TODO(paulwilkins):  Handle the case when duration is set to 0, or
     // something less than the full time between subsequent values of
     // cpi->source_time_stamp.
-    fps.duration = (double)(cpi->source->ts_end
-                            - cpi->source->ts_start);
+    fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
 
     // don't want to do output stats with a stack variable!
     cpi->twopass.this_frame_stats = fps;
@@ -963,13 +946,13 @@
   int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
+  RATE_CONTROL *const rc = &cpi->rc;
 
-  double section_err = fpstats->coded_error / fpstats->count;
-  double err_per_mb = section_err / num_mbs;
-  double err_correction_factor;
+  const double section_err = fpstats->coded_error / fpstats->count;
+  const double err_per_mb = section_err / num_mbs;
 
   if (section_target_bandwitdh <= 0)
-    return cpi->rc.worst_quality;          // Highest value allowed
+    return rc->worst_quality;          // Highest value allowed
 
   target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
                               ? (512 * section_target_bandwitdh) / num_mbs
@@ -977,15 +960,11 @@
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
-  for (q = cpi->rc.best_quality; q < cpi->rc.worst_quality; q++) {
-    int bits_per_mb_at_this_q;
-
-    err_correction_factor = calc_correction_factor(err_per_mb,
-                                                   ERR_DIVISOR, 0.5, 0.90, q);
-
-    bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
-                                               err_correction_factor);
-
+  for (q = rc->best_quality; q < rc->worst_quality; q++) {
+    const double err_correction_factor = calc_correction_factor(err_per_mb,
+                                             ERR_DIVISOR, 0.5, 0.90, q);
+    const int bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
+                                                         err_correction_factor);
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
@@ -1179,8 +1158,7 @@
       if (EOF == input_stats(&cpi->twopass, &tmp_next_frame))
         break;
 
-      zz_inter =
-        (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
+      zz_inter = (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
       if (zz_inter < 0.999)
         break;
     }
@@ -1550,6 +1528,7 @@
   int b_boost = 0;
   int flash_detected;
   int active_max_gf_interval;
+  RATE_CONTROL *const rc = &cpi->rc;
 
   cpi->twopass.gf_group_bits = 0;
 
@@ -1566,7 +1545,7 @@
 
   // If this is a key frame or the overlay from a previous arf then
   // The error score / cost of this frame has already been accounted for.
-  if (cpi->common.frame_type == KEY_FRAME || cpi->rc.source_alt_ref_active)
+  if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
     gf_group_err -= gf_first_frame_err;
 
   // Motion breakout threshold for loop below depends on image size.
@@ -1580,14 +1559,14 @@
   // interval to spread the cost of the GF.
   //
   active_max_gf_interval =
-    12 + ((int)vp9_convert_qindex_to_q(cpi->rc.last_q[INTER_FRAME]) >> 5);
+    12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME]) >> 5);
 
-  if (active_max_gf_interval > cpi->rc.max_gf_interval)
-    active_max_gf_interval = cpi->rc.max_gf_interval;
+  if (active_max_gf_interval > rc->max_gf_interval)
+    active_max_gf_interval = rc->max_gf_interval;
 
   i = 0;
   while ((i < cpi->twopass.static_scene_max_gf_interval) &&
-         (i < cpi->rc.frames_to_key)) {
+         (i < rc->frames_to_key)) {
     i++;    // Increment the loop counter
 
     // Accumulate error score of frames in this gf group
@@ -1630,8 +1609,7 @@
     }
 
     // Calculate a boost number for this frame
-    boost_score +=
-      (decay_accumulator *
+    boost_score += (decay_accumulator *
        calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
 
     // Break out conditions.
@@ -1659,14 +1637,14 @@
   cpi->twopass.gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
 
   // Don't allow a gf too near the next kf
-  if ((cpi->rc.frames_to_key - i) < MIN_GF_INTERVAL) {
-    while (i < (cpi->rc.frames_to_key + !cpi->rc.next_key_frame_forced)) {
+  if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
+    while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
       i++;
 
       if (EOF == input_stats(&cpi->twopass, this_frame))
         break;
 
-      if (i < cpi->rc.frames_to_key) {
+      if (i < rc->frames_to_key) {
         mod_frame_err = calculate_modified_err(cpi, this_frame);
         gf_group_err += mod_frame_err;
       }
@@ -1686,18 +1664,18 @@
 #endif
 
   // Set the interval until the next gf.
-  if (cpi->common.frame_type == KEY_FRAME || cpi->rc.source_alt_ref_active)
-    cpi->rc.baseline_gf_interval = i - 1;
+  if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
+    rc->baseline_gf_interval = i - 1;
   else
-    cpi->rc.baseline_gf_interval = i;
+    rc->baseline_gf_interval = i;
 
   // Should we use the alternate reference frame
   if (allow_alt_ref &&
       (i < cpi->oxcf.lag_in_frames) &&
       (i >= MIN_GF_INTERVAL) &&
       // for real scene cuts (not forced kfs) dont allow arf very near kf.
-      (cpi->rc.next_key_frame_forced ||
-        (i <= (cpi->rc.frames_to_key - MIN_GF_INTERVAL))) &&
+      (rc->next_key_frame_forced ||
+        (i <= (rc->frames_to_key - MIN_GF_INTERVAL))) &&
       ((next_frame.pcnt_inter > 0.75) ||
        (next_frame.pcnt_second_ref > 0.5)) &&
       ((mv_in_out_accumulator / (double)i > -0.2) ||
@@ -1705,25 +1683,25 @@
       (boost_score > 100)) {
 
     // Alternative boost calculation for alt ref
-    cpi->rc.gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
-                                    &b_boost);
-    cpi->rc.source_alt_ref_pending = 1;
+    rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+                                   &b_boost);
+    rc->source_alt_ref_pending = 1;
 
 #if CONFIG_MULTIPLE_ARF
     // Set the ARF schedule.
     if (cpi->multi_arf_enabled) {
-      schedule_frames(cpi, 0, -(cpi->rc.baseline_gf_interval - 1), 2, 1, 0);
+      schedule_frames(cpi, 0, -(rc->baseline_gf_interval - 1), 2, 1, 0);
     }
 #endif
   } else {
-    cpi->rc.gfu_boost = (int)boost_score;
-    cpi->rc.source_alt_ref_pending = 0;
+    rc->gfu_boost = (int)boost_score;
+    rc->source_alt_ref_pending = 0;
 #if CONFIG_MULTIPLE_ARF
     // Set the GF schedule.
     if (cpi->multi_arf_enabled) {
-      schedule_frames(cpi, 0, cpi->rc.baseline_gf_interval - 1, 2, 0, 0);
+      schedule_frames(cpi, 0, rc->baseline_gf_interval - 1, 2, 0, 0);
       assert(cpi->new_frame_coding_order_period ==
-             cpi->rc.baseline_gf_interval);
+             rc->baseline_gf_interval);
     }
 #endif
   }
@@ -1783,32 +1761,28 @@
 
   // Clip cpi->twopass.gf_group_bits based on user supplied data rate
   // variability limit (cpi->oxcf.two_pass_vbrmax_section)
-  if (cpi->twopass.gf_group_bits >
-      (int64_t)max_bits * cpi->rc.baseline_gf_interval)
-    cpi->twopass.gf_group_bits =
-        (int64_t)max_bits * cpi->rc.baseline_gf_interval;
+  if (cpi->twopass.gf_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
+    cpi->twopass.gf_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
 
   // Reset the file position
   reset_fpf_position(&cpi->twopass, start_pos);
 
   // Assign  bits to the arf or gf.
-  for (i = 0;
-      i <= (cpi->rc.source_alt_ref_pending &&
-            cpi->common.frame_type != KEY_FRAME);
-      ++i) {
+  for (i = 0; i <= (rc->source_alt_ref_pending &&
+                    cpi->common.frame_type != KEY_FRAME); ++i) {
     int allocation_chunks;
-    int q = cpi->rc.last_q[INTER_FRAME];
+    int q = rc->last_q[INTER_FRAME];
     int gf_bits;
 
-    int boost = (cpi->rc.gfu_boost * gfboost_qadjust(q)) / 100;
+    int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100;
 
     // Set max and minimum boost and hence minimum allocation
-    boost = clamp(boost, 125, (cpi->rc.baseline_gf_interval + 1) * 200);
+    boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
 
-    if (cpi->rc.source_alt_ref_pending && i == 0)
-      allocation_chunks = ((cpi->rc.baseline_gf_interval + 1) * 100) + boost;
+    if (rc->source_alt_ref_pending && i == 0)
+      allocation_chunks = ((rc->baseline_gf_interval + 1) * 100) + boost;
     else
-      allocation_chunks = (cpi->rc.baseline_gf_interval * 100) + (boost - 100);
+      allocation_chunks = (rc->baseline_gf_interval * 100) + (boost - 100);
 
     // Prevent overflow
     if (boost > 1023) {
@@ -1825,11 +1799,10 @@
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
     // based on the error score of the frame itself
-    if (cpi->rc.baseline_gf_interval < 1 ||
-        mod_frame_err < gf_group_err / (double)cpi->rc.baseline_gf_interval) {
-      double alt_gf_grp_bits =
-        (double)cpi->twopass.kf_group_bits  *
-        (mod_frame_err * (double)cpi->rc.baseline_gf_interval) /
+    if (rc->baseline_gf_interval < 1 ||
+        mod_frame_err < gf_group_err / (double)rc->baseline_gf_interval) {
+      double alt_gf_grp_bits = (double)cpi->twopass.kf_group_bits  *
+        (mod_frame_err * (double)rc->baseline_gf_interval) /
         DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
 
       int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
@@ -1856,10 +1829,11 @@
     if (i == 0) {
       cpi->twopass.gf_bits = gf_bits;
     }
-    if (i == 1 || (!cpi->rc.source_alt_ref_pending
-        && (cpi->common.frame_type != KEY_FRAME))) {
+    if (i == 1 ||
+        (!rc->source_alt_ref_pending &&
+         (cpi->common.frame_type != KEY_FRAME))) {
       // Per frame bit target for this frame
-      cpi->rc.per_frame_bandwidth = gf_bits;
+      rc->per_frame_bandwidth = gf_bits;
     }
   }
 
@@ -1877,7 +1851,7 @@
     // the remaining bits amoung the other frames/
     // For normal GFs remove the score for the GF itself unless this is
     // also a key frame in which case it has already been accounted for.
-    if (cpi->rc.source_alt_ref_pending) {
+    if (rc->source_alt_ref_pending) {
       cpi->twopass.gf_group_error_left = (int64_t)gf_group_err - mod_frame_err;
     } else if (cpi->common.frame_type != KEY_FRAME) {
       cpi->twopass.gf_group_error_left = (int64_t)(gf_group_err
@@ -1894,9 +1868,8 @@
     // This condition could fail if there are two kfs very close together
     // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
     // calculation of alt_extra_bits.
-    if (cpi->rc.baseline_gf_interval >= 3) {
-      const int boost = cpi->rc.source_alt_ref_pending ?
-          b_boost : cpi->rc.gfu_boost;
+    if (rc->baseline_gf_interval >= 3) {
+      const int boost = rc->source_alt_ref_pending ? b_boost : rc->gfu_boost;
 
       if (boost >= 150) {
         int alt_extra_bits;
@@ -1915,7 +1888,7 @@
     zero_stats(&sectionstats);
     reset_fpf_position(&cpi->twopass, start_pos);
 
-    for (i = 0; i < cpi->rc.baseline_gf_interval; i++) {
+    for (i = 0; i < rc->baseline_gf_interval; i++) {
       input_stats(&cpi->twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
@@ -1933,12 +1906,9 @@
 // Allocate bits to a normal frame that is neither a gf an arf or a key frame.
 static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   int target_frame_size;
-
   double modified_err;
   double err_fraction;
-
-  // Max for a single frame.
-  int max_bits = frame_max_bits(cpi);
+  const int max_bits = frame_max_bits(cpi);  // Max for a single frame.
 
   // Calculate modified prediction error used in bit allocation.
   modified_err = calculate_modified_err(cpi, this_frame);
@@ -1954,15 +1924,8 @@
 
   // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at
   // the top end.
-  if (target_frame_size < 0) {
-    target_frame_size = 0;
-  } else {
-    if (target_frame_size > max_bits)
-      target_frame_size = max_bits;
-
-    if (target_frame_size > cpi->twopass.gf_group_bits)
-      target_frame_size = (int)cpi->twopass.gf_group_bits;
-  }
+  target_frame_size = clamp(target_frame_size, 0,  // 64-bit MIN, then narrow
+      (int)MIN(max_bits, cpi->twopass.gf_group_bits));
 
   // Adjust error and bits remaining.
   cpi->twopass.gf_group_error_left -= (int64_t)modified_err;
@@ -2002,7 +1965,10 @@
        cpi->rc.frames_to_key == 0 ||
        (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
     cm->frame_type = KEY_FRAME;
+    cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
+                                    cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
+    cpi->rc.kf_boost = 300;
   } else {
     cm->frame_type = INTER_FRAME;
   }
@@ -2019,7 +1985,10 @@
       cpi->rc.frames_to_key == 0 ||
       (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
     cm->frame_type = KEY_FRAME;
+    cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
+                                    cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
+    cpi->rc.kf_boost = 300;
   } else {
     cm->frame_type = INTER_FRAME;
   }
@@ -2048,6 +2017,7 @@
 
   FIRSTPASS_STATS this_frame;
   FIRSTPASS_STATS this_frame_copy;
+  RATE_CONTROL *rc = &cpi->rc;
 
   double this_frame_intra_error;
   double this_frame_coded_error;
@@ -2062,7 +2032,7 @@
   vp9_clear_system_state();
 
   if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
-    cpi->rc.active_worst_quality = cpi->oxcf.cq_level;
+    rc->active_worst_quality = cpi->oxcf.cq_level;
   } else if (cpi->common.current_video_frame == 0) {
     // Special case code for first frame.
     int section_target_bandwidth =
@@ -2071,9 +2041,9 @@
     tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
                            section_target_bandwidth);
 
-    cpi->rc.active_worst_quality = tmp_q;
-    cpi->rc.ni_av_qi = tmp_q;
-    cpi->rc.avg_q = vp9_convert_qindex_to_q(tmp_q);
+    rc->active_worst_quality = tmp_q;
+    rc->ni_av_qi = tmp_q;
+    rc->avg_q = vp9_convert_qindex_to_q(tmp_q);
 
     // Limit the maxq value returned subsequently.
     // This increases the risk of overspend or underspend if the initial
@@ -2090,7 +2060,7 @@
   this_frame_coded_error = this_frame.coded_error;
 
   // keyframe and section processing !
-  if (cpi->rc.frames_to_key == 0) {
+  if (rc->frames_to_key == 0) {
     // Define next KF group and assign bits to it
     this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
@@ -2099,7 +2069,7 @@
   }
 
   // Is this a GF / ARF (Note that a KF is always also a GF)
-  if (cpi->rc.frames_till_gf_update_due == 0) {
+  if (rc->frames_till_gf_update_due == 0) {
     // Define next gf group and assign bits to it
     this_frame_copy = this_frame;
 
@@ -2122,7 +2092,7 @@
         cpi->enable_encode_breakout = 2;
     }
 
-    cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     cpi->refresh_golden_frame = 1;
   } else {
     // Otherwise this is an ordinary frame
@@ -2143,8 +2113,8 @@
   }
 
   // Set nominal per second bandwidth for this frame
-  cpi->target_bandwidth = (int)(cpi->rc.per_frame_bandwidth
-                                * cpi->output_framerate);
+  cpi->target_bandwidth = (int)(rc->per_frame_bandwidth *
+                                   cpi->output_framerate);
   if (cpi->target_bandwidth < 0)
     cpi->target_bandwidth = 0;
 
@@ -2199,10 +2169,9 @@
 
       // Cumulative effect of decay in prediction quality
       if (local_next_frame.pcnt_inter > 0.85)
-        decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+        decay_accumulator *= local_next_frame.pcnt_inter;
       else
-        decay_accumulator =
-            decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
 
       // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
 
@@ -2261,6 +2230,8 @@
   double kf_group_coded_err = 0.0;
   double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
 
+  RATE_CONTROL *const rc = &cpi->rc;
+
   vp9_zero(next_frame);
 
   vp9_clear_system_state();  // __asm emms;
@@ -2269,15 +2240,15 @@
   cpi->common.frame_type = KEY_FRAME;
 
   // is this a forced key frame by interval
-  cpi->rc.this_key_frame_forced = cpi->rc.next_key_frame_forced;
+  rc->this_key_frame_forced = rc->next_key_frame_forced;
 
   // Clear the alt ref active flag as this can never be active on a key frame
-  cpi->rc.source_alt_ref_active = 0;
+  rc->source_alt_ref_active = 0;
 
   // Kf is always a gf so clear frames till next gf counter
-  cpi->rc.frames_till_gf_update_due = 0;
+  rc->frames_till_gf_update_due = 0;
 
-  cpi->rc.frames_to_key = 1;
+  rc->frames_to_key = 1;
 
   // Take a copy of the initial frame details
   first_frame = *this_frame;
@@ -2329,14 +2300,14 @@
         break;
 
       // Step on to the next frame
-      cpi->rc.frames_to_key++;
+      rc->frames_to_key++;
 
       // If we don't have a real key frame within the next two
       // forcekeyframeevery intervals then break out of the loop.
-      if (cpi->rc.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
+      if (rc->frames_to_key >= 2 * (int)cpi->key_frame_frequency)
         break;
     } else {
-      cpi->rc.frames_to_key++;
+      rc->frames_to_key++;
     }
     i++;
   }
@@ -2345,11 +2316,11 @@
   // We already breakout of the loop above at 2x max.
   // This code centers the extra kf if the actual natural
   // interval is between 1x and 2x
-  if (cpi->oxcf.auto_key
-      && cpi->rc.frames_to_key > (int)cpi->key_frame_frequency) {
+  if (cpi->oxcf.auto_key &&
+      rc->frames_to_key > (int)cpi->key_frame_frequency) {
     FIRSTPASS_STATS tmp_frame;
 
-    cpi->rc.frames_to_key /= 2;
+    rc->frames_to_key /= 2;
 
     // Copy first frame details
     tmp_frame = first_frame;
@@ -2362,7 +2333,7 @@
     kf_group_coded_err = 0;
 
     // Rescan to get the correct error data for the forced kf group
-    for (i = 0; i < cpi->rc.frames_to_key; i++) {
+    for (i = 0; i < rc->frames_to_key; i++) {
       // Accumulate kf group errors
       kf_group_err += calculate_modified_err(cpi, &tmp_frame);
       kf_group_intra_err += tmp_frame.intra_error;
@@ -2371,11 +2342,11 @@
       // Load a the next frame's stats
       input_stats(&cpi->twopass, &tmp_frame);
     }
-    cpi->rc.next_key_frame_forced = 1;
+    rc->next_key_frame_forced = 1;
   } else if (cpi->twopass.stats_in == cpi->twopass.stats_in_end) {
-    cpi->rc.next_key_frame_forced = 1;
+    rc->next_key_frame_forced = 1;
   } else {
-    cpi->rc.next_key_frame_forced = 0;
+    rc->next_key_frame_forced = 0;
   }
 
   // Special case for the last key frame of the file
@@ -2406,7 +2377,7 @@
                                             cpi->twopass.modified_error_left));
 
     // Clip based on maximum per frame rate defined by the user.
-    max_grp_bits = (int64_t)max_bits * (int64_t)cpi->rc.frames_to_key;
+    max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
     if (cpi->twopass.kf_group_bits > max_grp_bits)
       cpi->twopass.kf_group_bits = max_grp_bits;
   } else {
@@ -2422,7 +2393,7 @@
   loop_decay_rate = 1.00;       // Starting decay rate
 
   // Scan through the kf group collating various stats.
-  for (i = 0; i < cpi->rc.frames_to_key; i++) {
+  for (i = 0; i < rc->frames_to_key; i++) {
     double r;
 
     if (EOF == input_stats(&cpi->twopass, &next_frame))
@@ -2436,7 +2407,7 @@
     }
 
     // For the first few frames collect data to decide kf boost.
-    if (i <= (cpi->rc.max_gf_interval * 2)) {
+    if (i <= (rc->max_gf_interval * 2)) {
       if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
         r = (IIKFACTOR2 * next_frame.intra_error /
              DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
@@ -2465,16 +2436,15 @@
     zero_stats(&sectionstats);
     reset_fpf_position(&cpi->twopass, start_position);
 
-    for (i = 0; i < cpi->rc.frames_to_key; i++) {
+    for (i = 0; i < rc->frames_to_key; i++) {
       input_stats(&cpi->twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
 
     avg_stats(&sectionstats);
 
-    cpi->twopass.section_intra_rating = (int)
-      (sectionstats.intra_error
-      / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+    cpi->twopass.section_intra_rating = (int) (sectionstats.intra_error /
+        DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
   }
 
   // Reset the first pass file position
@@ -2486,15 +2456,15 @@
     int allocation_chunks;
     int alt_kf_bits;
 
-    if (kf_boost < (cpi->rc.frames_to_key * 3))
-      kf_boost = (cpi->rc.frames_to_key * 3);
+    if (kf_boost < (rc->frames_to_key * 3))
+      kf_boost = (rc->frames_to_key * 3);
 
     if (kf_boost < 300)  // Min KF boost
       kf_boost = 300;
 
     // Make a note of baseline boost and the zero motion
     // accumulator value for use elsewhere.
-    cpi->rc.kf_boost = kf_boost;
+    rc->kf_boost = kf_boost;
     cpi->twopass.kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
 
     // We do three calculations for kf size.
@@ -2511,10 +2481,10 @@
     // care of by kf_boost.
     if (zero_motion_accumulator >= 0.99) {
       allocation_chunks =
-        ((cpi->rc.frames_to_key - 1) * 10) + kf_boost;
+        ((rc->frames_to_key - 1) * 10) + kf_boost;
     } else {
       allocation_chunks =
-        ((cpi->rc.frames_to_key - 1) * 100) + kf_boost;
+        ((rc->frames_to_key - 1) * 100) + kf_boost;
     }
 
     // Prevent overflow
@@ -2524,22 +2494,21 @@
       allocation_chunks /= divisor;
     }
 
-    cpi->twopass.kf_group_bits =
-        (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+    cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0
+           : cpi->twopass.kf_group_bits;
 
     // Calculate the number of bits to be spent on the key frame
-    cpi->twopass.kf_bits =
-        (int)((double)kf_boost *
+    cpi->twopass.kf_bits = (int)((double)kf_boost *
               ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
 
     // If the key frame is actually easier than the average for the
     // kf group (which does sometimes happen... eg a blank intro frame)
     // Then use an alternate calculation based on the kf error score
     // which should give a smaller key frame.
-    if (kf_mod_err < kf_group_err / cpi->rc.frames_to_key) {
+    if (kf_mod_err < kf_group_err / rc->frames_to_key) {
       double  alt_kf_grp_bits =
         ((double)cpi->twopass.bits_left *
-         (kf_mod_err * (double)cpi->rc.frames_to_key) /
+         (kf_mod_err * (double)rc->frames_to_key) /
          DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
 
       alt_kf_bits = (int)((double)kf_boost *
@@ -2552,8 +2521,7 @@
     // Else if it is much harder than other frames in the group make sure
     // it at least receives an allocation in keeping with its relative
     // error score
-      alt_kf_bits =
-        (int)((double)cpi->twopass.bits_left *
+      alt_kf_bits = (int)((double)cpi->twopass.bits_left *
               (kf_mod_err /
                DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
 
@@ -2565,7 +2533,7 @@
     cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
 
     // Peer frame bit target for this frame
-    cpi->rc.per_frame_bandwidth = cpi->twopass.kf_bits;
+    rc->per_frame_bandwidth = cpi->twopass.kf_bits;
     // Convert to a per second bitrate
     cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
                                   cpi->output_framerate);
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index f3ddd39..c500986 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -23,7 +23,7 @@
 
 static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
                                               const MV *ref_mv,
-                                              int_mv *dst_mv,
+                                              MV *dst_mv,
                                               int mb_row,
                                               int mb_col) {
   MACROBLOCK   *const x  = &cpi->mb;
@@ -35,7 +35,7 @@
   const int tmp_col_max = x->mv_col_max;
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
-  int_mv ref_full;
+  MV ref_full;
 
   // Further step/diamond searches as necessary
   int step_param = cpi->sf.reduce_first_step_size +
@@ -44,12 +44,12 @@
 
   vp9_set_mv_search_range(x, ref_mv);
 
-  ref_full.as_mv.col = ref_mv->col >> 3;
-  ref_full.as_mv.row = ref_mv->row >> 3;
+  ref_full.col = ref_mv->col >> 3;
+  ref_full.row = ref_mv->row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  best_err = vp9_hex_search(x, &ref_full.as_mv, step_param, x->errorperbit,
-                            0, &v_fn_ptr, 0, ref_mv, &dst_mv->as_mv);
+  best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit,
+                            0, &v_fn_ptr, 0, ref_mv, dst_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -57,15 +57,14 @@
     int distortion;
     unsigned int sse;
     best_err = cpi->find_fractional_mv_step(
-        x,
-        &dst_mv->as_mv, ref_mv,
+        x, dst_mv, ref_mv,
         cpi->common.allow_high_precision_mv,
         x->errorperbit, &v_fn_ptr,
         0, cpi->sf.subpel_iters_per_step, NULL, NULL,
         & distortion, &sse);
   }
 
-  vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
+  vp9_set_mbmode_and_mvs(xd, NEWMV, dst_mv);
   vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
   best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                           xd->plane[0].dst.buf, xd->plane[0].dst.stride,
@@ -96,7 +95,7 @@
 
   // Test last reference frame using the previous best mv as the
   // starting point (best reference) for the search
-  tmp_err = do_16x16_motion_iteration(cpi, &ref_mv->as_mv, &tmp_mv,
+  tmp_err = do_16x16_motion_iteration(cpi, &ref_mv->as_mv, &tmp_mv.as_mv,
                                       mb_row, mb_col);
   if (tmp_err < err) {
     err = tmp_err;
@@ -110,7 +109,7 @@
     int_mv zero_ref_mv, tmp_mv;
 
     zero_ref_mv.as_int = 0;
-    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv.as_mv, &tmp_mv,
+    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv.as_mv, &tmp_mv.as_mv,
                                         mb_row, mb_col);
     if (tmp_err < err) {
       dst_mv->as_int = tmp_mv.as_int;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index c199dff..88d527a 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1371,23 +1371,18 @@
                           int *mvcost[2],
                           const MV *center_mv, int n) {
   const MACROBLOCKD* const xd = &x->e_mbd;
-  uint8_t *what = x->plane[0].src.buf;
-  int what_stride = x->plane[0].src.stride;
-  uint8_t *in_what;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  int mv_stride = xd->plane[0].pre[0].stride;
-  uint8_t *bestaddress;
+  const uint8_t *const what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *const in_what = xd->plane[0].pre[0].buf;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *bestaddress;
   MV *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0].as_mv;
   MV this_mv;
   int bestsad = INT_MAX;
   int r, c;
-
-  uint8_t *check_here;
   int thissad;
-
   int ref_row = ref_mv->row;
   int ref_col = ref_mv->col;
-
   int row_min = ref_row - distance;
   int row_max = ref_row + distance;
   int col_min = ref_col - distance;
@@ -1401,8 +1396,7 @@
   fcenter_mv.col = center_mv->col >> 3;
 
   // Work out the mid point for the search
-  in_what = xd->plane[0].pre[0].buf;
-  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
+  bestaddress = &in_what[ref_row * in_what_stride + ref_col];
 
   best_mv->row = ref_row;
   best_mv->col = ref_col;
@@ -1421,8 +1415,8 @@
   row_max = MIN(row_max, x->mv_row_max);
 
   for (r = row_min; r < row_max; r++) {
+    const uint8_t *check_here = &in_what[r * in_what_stride + col_min];
     this_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
 
     for (c = col_min; c < col_max; c++) {
       thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
@@ -1460,31 +1454,24 @@
                           vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
                           int *mvcost[2], const MV *center_mv, int n) {
   const MACROBLOCKD* const xd = &x->e_mbd;
-  uint8_t *what = x->plane[0].src.buf;
-  int what_stride = x->plane[0].src.stride;
-  uint8_t *in_what;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  int mv_stride = xd->plane[0].pre[0].stride;
-  uint8_t *bestaddress;
+  const uint8_t *const what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *const in_what = xd->plane[0].pre[0].buf;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *bestaddress;
   MV *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0].as_mv;
   MV this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
-
-  uint8_t *check_here;
   unsigned int thissad;
-
   int ref_row = ref_mv->row;
   int ref_col = ref_mv->col;
-
   int row_min = ref_row - distance;
   int row_max = ref_row + distance;
   int col_min = ref_col - distance;
   int col_max = ref_col + distance;
-
   unsigned int sad_array[3];
   MV fcenter_mv;
-
   int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
@@ -1492,8 +1479,7 @@
   fcenter_mv.col = center_mv->col >> 3;
 
   // Work out the mid point for the search
-  in_what = xd->plane[0].pre[0].buf;
-  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
+  bestaddress = &in_what[ref_row * in_what_stride + ref_col];
 
   best_mv->row = ref_row;
   best_mv->col = ref_col;
@@ -1512,8 +1498,8 @@
   row_max = MIN(row_max, x->mv_row_max);
 
   for (r = row_min; r < row_max; r++) {
+    const uint8_t *check_here = &in_what[r * in_what_stride + col_min];
     this_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
     c = col_min;
 
     while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) {
@@ -1582,28 +1568,22 @@
                           int *mvjcost, int *mvcost[2],
                           const MV *center_mv, int n) {
   const MACROBLOCKD* const xd = &x->e_mbd;
-  uint8_t *what = x->plane[0].src.buf;
-  int what_stride = x->plane[0].src.stride;
-  uint8_t *in_what;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  int mv_stride = xd->plane[0].pre[0].stride;
-  uint8_t *bestaddress;
+  const uint8_t *const what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *const in_what = xd->plane[0].pre[0].buf;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *bestaddress;
   MV *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0].as_mv;
   MV this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
-
-  uint8_t *check_here;
   unsigned int thissad;
-
   int ref_row = ref_mv->row;
   int ref_col = ref_mv->col;
-
   int row_min = ref_row - distance;
   int row_max = ref_row + distance;
   int col_min = ref_col - distance;
   int col_max = ref_col + distance;
-
   DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
   unsigned int sad_array[3];
   MV fcenter_mv;
@@ -1615,8 +1595,7 @@
   fcenter_mv.col = center_mv->col >> 3;
 
   // Work out the mid point for the search
-  in_what = xd->plane[0].pre[0].buf;
-  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
+  bestaddress = &in_what[ref_row * in_what_stride + ref_col];
 
   best_mv->row = ref_row;
   best_mv->col = ref_col;
@@ -1635,8 +1614,8 @@
   row_max = MIN(row_max, x->mv_row_max);
 
   for (r = row_min; r < row_max; r++) {
+    const uint8_t *check_here = &in_what[r * in_what_stride + col_min];
     this_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
     c = col_min;
 
     while ((c + 7) < col_max) {
@@ -1924,65 +1903,54 @@
     return INT_MAX;
 }
 
-/* This function is called when we do joint motion search in comp_inter_inter
- * mode.
- */
-int vp9_refining_search_8p_c(MACROBLOCK *x,
+// This function is called when we do joint motion search in comp_inter_inter
+// mode.
+int vp9_refining_search_8p_c(const MACROBLOCK *x,
                              MV *ref_mv, int error_per_bit,
                              int search_range, vp9_variance_fn_ptr_t *fn_ptr,
                              int *mvjcost, int *mvcost[2], const MV *center_mv,
                              const uint8_t *second_pred, int w, int h) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
-      {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+                           {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
   int i, j;
-  int this_row_offset, this_col_offset;
 
-  int what_stride = x->plane[0].src.stride;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  uint8_t *what = x->plane[0].src.buf;
-  uint8_t *best_address = xd->plane[0].pre[0].buf +
-                          (ref_mv->row * xd->plane[0].pre[0].stride) +
-                          ref_mv->col;
-  uint8_t *check_here;
+  const uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *in_what = xd->plane[0].pre[0].buf;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
+                                             ref_mv->col];
   unsigned int thissad;
   MV this_mv;
-  unsigned int bestsad = INT_MAX;
-  MV fcenter_mv;
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
 
-  int *mvjsadcost = x->nmvjointsadcost;
+  const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  fcenter_mv.row = center_mv->row >> 3;
-  fcenter_mv.col = center_mv->col >> 3;
-
   /* Get compound pred by averaging two pred blocks. */
-  bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
-                         second_pred, 0x7fffffff) +
-      mvsad_err_cost(ref_mv, &fcenter_mv,
-                     mvjsadcost, mvsadcost, error_per_bit);
+  unsigned int bestsad = fn_ptr->sdaf(what, what_stride,
+                                      best_address, in_what_stride,
+                                      second_pred, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
 
-  for (i = 0; i < search_range; i++) {
+  for (i = 0; i < search_range; ++i) {
     int best_site = -1;
 
     for (j = 0; j < 8; j++) {
-      this_row_offset = ref_mv->row + neighbors[j].row;
-      this_col_offset = ref_mv->col + neighbors[j].col;
+      this_mv.row = ref_mv->row + neighbors[j].row;
+      this_mv.col = ref_mv->col + neighbors[j].col;
 
-      if ((this_col_offset > x->mv_col_min) &&
-          (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) &&
-          (this_row_offset < x->mv_row_max)) {
-        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
-            best_address;
+      if ((this_mv.col > x->mv_col_min) &&
+          (this_mv.col < x->mv_col_max) &&
+          (this_mv.row > x->mv_row_min) &&
+          (this_mv.row < x->mv_row_max)) {
+        const uint8_t *check_here = &in_what[this_mv.row * in_what_stride +
+                                                this_mv.col];
 
-        /* Get compound block and use it to calculate SAD. */
         thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
                                second_pred, bestsad);
-
         if (thissad < bestsad) {
-          this_mv.row = this_row_offset;
-          this_mv.col = this_col_offset;
           thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
                                     mvjsadcost, mvsadcost, error_per_bit);
           if (thissad < bestsad) {
@@ -1998,8 +1966,7 @@
     } else {
       ref_mv->row += neighbors[best_site].row;
       ref_mv->col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride +
-          neighbors[best_site].col;
+      best_address = &in_what[ref_mv->row * in_what_stride + ref_mv->col];
     }
   }
 
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index b3d8975..cd2ec5d 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -123,7 +123,7 @@
                                        int *mvjcost, int *mvcost[2],
                                        const MV *center_mv);
 
-int vp9_refining_search_8p_c(MACROBLOCK *x,
+int vp9_refining_search_8p_c(const MACROBLOCK *x,
                              MV *ref_mv, int error_per_bit,
                              int search_range, vp9_variance_fn_ptr_t *fn_ptr,
                              int *mvjcost, int *mvcost[2],
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 669fe0a..59d36ee 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -59,6 +59,11 @@
 #define DISABLE_COMPOUND_SPLIT    0x18
 #define LAST_AND_INTRA_SPLIT_ONLY 0x1E
 
+// Max rate target for 1080P and below encodes under normal circumstances
+// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
 #if CONFIG_INTERNAL_STATS
 extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
                             YV12_BUFFER_CONFIG *dest, int lumamask,
@@ -88,12 +93,6 @@
 #endif
 
 
-#ifdef ENTROPY_STATS
-extern int intra_mode_stats[INTRA_MODES]
-                           [INTRA_MODES]
-                           [INTRA_MODES];
-#endif
-
 #ifdef MODE_STATS
 extern void init_tx_count_stats();
 extern void write_tx_count_stats();
@@ -1093,6 +1092,9 @@
 };
 
 void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
+  VP9_COMMON *const cm = &cpi->common;
+  int64_t vbr_max_bits;
+
   if (framerate < 0.1)
     framerate = 30;
 
@@ -1109,6 +1111,19 @@
   cpi->rc.min_frame_bandwidth = MAX(cpi->rc.min_frame_bandwidth,
                                     FRAME_OVERHEAD_BITS);
 
+  // A maximum bitrate for a frame is defined.
+  // The baseline for this aligns with HW implementations that
+  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+  // per 16x16 MB (averaged over a frame). However this limit is extended if
+  // a very high rate is given on the command line or the rate cannot
+  // be achieved because of a user specified max q (e.g. when the user
+  // specifies lossless encode).
+  //
+  vbr_max_bits = ((int64_t)cpi->rc.av_per_frame_bandwidth *
+                  (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+  cpi->rc.max_frame_bandwidth =
+    MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
   // Set Maximum gf/arf interval
   cpi->rc.max_gf_interval = 16;
 
@@ -1844,9 +1859,6 @@
   cpi->diamond_search_sad = vp9_diamond_search_sad;
   cpi->refining_search_sad = vp9_refining_search_sad;
 
-  // make sure frame 1 is okay
-  cpi->error_bins[0] = cm->MBs;
-
   /* vp9_init_quantizer() is first called here. Add check in
    * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
    * called later when needed. This will avoid unnecessary calls of
@@ -1960,41 +1972,6 @@
     }
 #endif
 
-#ifdef ENTROPY_STATS
-    {
-      int i, j, k;
-      FILE *fmode = fopen("vp9_modecontext.c", "w");
-
-      fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n");
-      fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
-      fprintf(fmode, "[INTRA_MODES][INTRA_MODES]"
-                     "[INTRA_MODES] =\n{\n");
-
-      for (i = 0; i < INTRA_MODES; i++) {
-        fprintf(fmode, "    { // Above Mode :  %d\n", i);
-
-        for (j = 0; j < INTRA_MODES; j++) {
-          fprintf(fmode, "        {");
-
-          for (k = 0; k < INTRA_MODES; k++) {
-            if (!intra_mode_stats[i][j][k])
-              fprintf(fmode, " %5d, ", 1);
-            else
-              fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
-          }
-
-          fprintf(fmode, "}, // left_mode %d\n", j);
-        }
-
-        fprintf(fmode, "    },\n");
-      }
-
-      fprintf(fmode, "};\n");
-      fclose(fmode);
-    }
-#endif
-
-
 #if defined(SECTIONBITS_OUTPUT)
 
     if (0) {
@@ -2193,27 +2170,33 @@
   return 0;
 }
 
+static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(VP9_COMP *cpi,
+                                VP9_REFFRAME ref_frame_flag) {
+  MV_REFERENCE_FRAME ref_frame = NONE;
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    ref_frame = LAST_FRAME;
+  else if (ref_frame_flag == VP9_GOLD_FLAG)
+    ref_frame = GOLDEN_FRAME;
+  else if (ref_frame_flag == VP9_ALT_FLAG)
+    ref_frame = ALTREF_FRAME;
+
+  return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame);
+}
+
 int vp9_copy_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
                            YV12_BUFFER_CONFIG *sd) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  YV12_BUFFER_CONFIG *cfg;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    cfg = get_ref_frame_buffer(cpi, LAST_FRAME);
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    cfg = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    cfg = get_ref_frame_buffer(cpi, ALTREF_FRAME);
-  else
+  VP9_COMP *const cpi = (VP9_COMP *)ptr;
+  YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
+  if (cfg) {
+    vp8_yv12_copy_frame(cfg, sd);
+    return 0;
+  } else {
     return -1;
-
-  vp8_yv12_copy_frame(cfg, sd);
-
-  return 0;
+  }
 }
 
 int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
   VP9_COMMON *cm = &cpi->common;
 
   if (index < 0 || index >= REF_FRAMES)
@@ -2225,23 +2208,14 @@
 
 int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
                           YV12_BUFFER_CONFIG *sd) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-
-  int ref_fb_idx;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];
-  else
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
+  YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
+  if (cfg) {
+    vp8_yv12_copy_frame(sd, cfg);
+    return 0;
+  } else {
     return -1;
-
-  vp8_yv12_copy_frame(sd, &cm->yv12_fb[ref_fb_idx]);
-
-  return 0;
+  }
 }
 
 int vp9_update_entropy(VP9_PTR comp, int update) {
@@ -2449,10 +2423,14 @@
   int force_recode = 0;
   VP9_COMMON *cm = &cpi->common;
 
-  // Is frame recode allowed at all
-  // Yes if either recode mode 1 is selected or mode two is selected
-  // and the frame is a key frame. golden frame or alt_ref_frame
-  if ((cpi->sf.recode_loop == 1) ||
+  // Special case trap if maximum allowed frame size exceeded.
+  if (cpi->rc.projected_frame_size > cpi->rc.max_frame_bandwidth) {
+    force_recode = 1;
+
+  // Is frame recode allowed.
+  // Yes if either recode mode 1 is selected or mode 2 is selected
+  // and the frame is a key frame, golden frame or alt_ref_frame
+  } else if ((cpi->sf.recode_loop == 1) ||
       ((cpi->sf.recode_loop == 2) &&
        ((cm->frame_type == KEY_FRAME) ||
         cpi->refresh_golden_frame ||
@@ -2625,16 +2603,19 @@
   recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
-        "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-        "%6d %6d %5d %5d %5d %10d %10.3f"
-        "%10.3f %8d %10d %10d %10d\n",
+    fprintf(f, "%10u %10d %10d %10d %10d %10d "
+        "%10"PRId64" %10"PRId64" %10d "
+        "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
+        "%6d %6d %5d %5d %5d "
+        "%10"PRId64" %10.3lf"
+        "%10lf %8u %10d %10d %10d\n",
         cpi->common.current_video_frame, cpi->rc.this_frame_target,
-        cpi->rc.projected_frame_size, 0,
+        cpi->rc.projected_frame_size,
+        cpi->rc.projected_frame_size / cpi->common.MBs,
         (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
-        (int)cpi->rc.total_target_vs_actual,
-        (int)(cpi->oxcf.starting_buffer_level - cpi->rc.bits_off_target),
-        (int)cpi->rc.total_actual_bits, cm->base_qindex,
+        cpi->rc.total_target_vs_actual,
+        (cpi->oxcf.starting_buffer_level - cpi->rc.bits_off_target),
+        cpi->rc.total_actual_bits, cm->base_qindex,
         vp9_convert_qindex_to_q(cm->base_qindex),
         (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
         vp9_convert_qindex_to_q(cpi->rc.active_worst_quality), cpi->rc.avg_q,
@@ -2642,9 +2623,9 @@
         vp9_convert_qindex_to_q(cpi->cq_target_quality),
         cpi->refresh_last_frame, cpi->refresh_golden_frame,
         cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
-        (int)cpi->twopass.bits_left,
+        cpi->twopass.bits_left,
         cpi->twopass.total_left_stats.coded_error,
-        (double)cpi->twopass.bits_left /
+        cpi->twopass.bits_left /
             (1 + cpi->twopass.total_left_stats.coded_error),
         cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
         cpi->twopass.kf_zeromotion_pct);
@@ -2728,20 +2709,23 @@
     // Dummy pack of the bitstream using up to date stats to get an
     // accurate estimate of output frame size to determine if we need
     // to recode.
-    vp9_save_coding_context(cpi);
-    cpi->dummy_packing = 1;
-    vp9_pack_bitstream(cpi, dest, size);
-    cpi->rc.projected_frame_size = (*size) << 3;
-    vp9_restore_coding_context(cpi);
+    if (cpi->sf.recode_loop != 0) {
+      vp9_save_coding_context(cpi);
+      cpi->dummy_packing = 1;
+      vp9_pack_bitstream(cpi, dest, size);
+      cpi->rc.projected_frame_size = (*size) << 3;
+      vp9_restore_coding_context(cpi);
 
-    if (frame_over_shoot_limit == 0)
-      frame_over_shoot_limit = 1;
+      if (frame_over_shoot_limit == 0)
+        frame_over_shoot_limit = 1;
+    }
 
     if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
       loop = 0;
     } else {
-      // Special case handling for forced key frames
-      if ((cm->frame_type == KEY_FRAME) && cpi->rc.this_key_frame_forced) {
+      if ((cm->frame_type == KEY_FRAME) &&
+           cpi->rc.this_key_frame_forced &&
+           (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth)) {
         int last_q = *q;
         int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
 
@@ -2780,7 +2764,7 @@
         loop = *q != last_q;
       } else if (recode_loop_test(
           cpi, frame_over_shoot_limit, frame_under_shoot_limit,
-          *q, top_index, bottom_index)) {
+          *q, MAX(q_high, top_index), bottom_index)) {
         // Is the projected frame size out of range and are we allowed
         // to attempt to recode.
         int last_q = *q;
@@ -2791,6 +2775,10 @@
 
         // Frame is too large
         if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) {
+          // Special case if the projected size is > the max allowed.
+          if (cpi->rc.projected_frame_size >= cpi->rc.max_frame_bandwidth)
+            q_high = cpi->rc.worst_quality;
+
           // Raise Qlow as to at least the current value
           q_low = *q < q_high ? *q + 1 : q_high;
 
@@ -2804,12 +2792,12 @@
             vp9_rc_update_rate_correction_factors(cpi, 0);
 
             *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
-                                   bottom_index, top_index);
+                                   bottom_index, MAX(q_high, top_index));
 
             while (*q < q_low && retries < 10) {
               vp9_rc_update_rate_correction_factors(cpi, 0);
               *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
-                                     bottom_index, top_index);
+                                     bottom_index, MAX(q_high, top_index));
               retries++;
             }
           }
@@ -2855,7 +2843,9 @@
       }
     }
 
-    if (cpi->rc.is_src_frame_alt_ref)
+    // Special case for overlay frame.
+    if (cpi->rc.is_src_frame_alt_ref &&
+        (cpi->rc.projected_frame_size < cpi->rc.max_frame_bandwidth))
       loop = 0;
 
     if (loop) {
@@ -3699,7 +3689,8 @@
       *dest = *cpi->common.frame_to_show;
       dest->y_width = cpi->common.width;
       dest->y_height = cpi->common.height;
-      dest->uv_height = cpi->common.height / 2;
+      dest->uv_width = cpi->common.width >> cpi->common.subsampling_x;
+      dest->uv_height = cpi->common.height >> cpi->common.subsampling_y;
       ret = 0;
     } else {
       ret = -1;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index a5be0f4..a665bf8 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -442,9 +442,10 @@
   unsigned int source_alt_ref_active;
   unsigned int is_src_frame_alt_ref;
 
-  int per_frame_bandwidth;  // Current section per frame bandwidth target
-  int av_per_frame_bandwidth;  // Average frame size target for clip
-  int min_frame_bandwidth;  // Minimum allocation used for any frame
+  int per_frame_bandwidth;        // Current section per frame bandwidth target
+  int av_per_frame_bandwidth;     // Average frame size target for clip
+  int min_frame_bandwidth;        // Minimum allocation used for any frame
+  int max_frame_bandwidth;        // Maximum burst rate allowed for a frame.
 
   int ni_av_qi;
   int ni_tot_qi;
@@ -623,7 +624,6 @@
   int ref_frame_flags;
 
   SPEED_FEATURES sf;
-  int error_bins[1024];
 
   unsigned int max_mv_magnitude;
   int mv_step_param;
@@ -740,9 +740,6 @@
 
   int dummy_packing;    /* flag to indicate if packing is dummy */
 
-  unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS]
-                                      [SWITCHABLE_FILTERS];
-
   unsigned int tx_stepdown_count[TX_SIZES];
 
   int initial_width;
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 58078ad..4ca85ee 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -20,56 +20,6 @@
 #include "vp9/common/vp9_loopfilter.h"
 #include "./vpx_scale_rtcd.h"
 
-void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
-                                   YV12_BUFFER_CONFIG *dst_ybc, int fraction) {
-  const int height = src_ybc->y_height;
-  const int stride = src_ybc->y_stride;
-  const int offset = stride * ((height >> 5) * 16 - 8);
-  const int lines_to_copy = MAX(height >> (fraction + 4), 1) << 4;
-
-  assert(src_ybc->y_stride == dst_ybc->y_stride);
-  vpx_memcpy(dst_ybc->y_buffer + offset, src_ybc->y_buffer + offset,
-             stride * (lines_to_copy + 16));
-}
-
-static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
-                                YV12_BUFFER_CONFIG *dest, int Fraction) {
-  int i, j;
-  int Total = 0;
-  int srcoffset, dstoffset;
-  uint8_t *src = source->y_buffer;
-  uint8_t *dst = dest->y_buffer;
-
-  int linestocopy = (source->y_height >> (Fraction + 4));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-
-  srcoffset = source->y_stride   * (dest->y_height >> 5) * 16;
-  dstoffset = dest->y_stride     * (dest->y_height >> 5) * 16;
-
-  src += srcoffset;
-  dst += dstoffset;
-
-  // Loop through the raw Y plane and reconstruction data summing the square
-  // differences.
-  for (i = 0; i < linestocopy; i += 16) {
-    for (j = 0; j < source->y_width; j += 16) {
-      unsigned int sse;
-      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
-                            &sse);
-    }
-
-    src += 16 * source->y_stride;
-    dst += 16 * dest->y_stride;
-  }
-
-  return Total;
-}
-
 // Enforce a minimum filter level based upon baseline Q
 static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
   int min_filter_level;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
new file mode 100644
index 0000000..17d1f59
--- /dev/null
+++ b/vp9/encoder/vp9_pickmode.c
@@ -0,0 +1,230 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "vp9/common/vp9_pragmas.h"
+#include "vp9/encoder/vp9_tokenize.h"
+#include "vp9/encoder/vp9_treewriter.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_variance.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_common.h"
+
+// Full-pixel NEWMV motion search for the non-RD mode decision path.
+// Starts from the reference MV candidate selected by x->mv_best_ref_index[],
+// stores the winning vector in *tmp_mv (scaled to 1/8 pel), writes its bit
+// cost to *rate_mv and returns the SAD measured at that position. On a shown
+// frame, if x->pred_mv_sad[] of any reference is below 1/8 of this one's, the
+// search is skipped: *tmp_mv is set to INVALID_MV and INT_MAX is returned.
+static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    const TileInfo *const tile,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    int_mv *tmp_mv, int *rate_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+  int bestsme = INT_MAX;
+  int further_steps, step_param;
+  int sadpb = x->sadperbit16;
+  MV mvp_full;
+  int ref = mbmi->ref_frame[0];
+  int_mv ref_mv = mbmi->ref_mvs[ref][0];
+  int i;
+
+  int tmp_col_min = x->mv_col_min;
+  int tmp_col_max = x->mv_col_max;
+  int tmp_row_min = x->mv_row_min;
+  int tmp_row_max = x->mv_row_max;
+
+  int buf_offset;
+  int stride = xd->plane[0].pre[0].stride;
+
+  YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref);
+
+  // Decide whether to skip this reference before any encoder state is
+  // modified, so the early exit cannot leave a narrowed MV search window
+  // (or swapped prediction planes) behind for the next reference frame.
+  for (i = LAST_FRAME; i <= ALTREF_FRAME && cpi->common.show_frame; ++i) {
+    if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+      tmp_mv->as_int = INVALID_MV;
+      return INT_MAX;
+    }
+  }
+
+  if (scaled_ref_frame) {
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+
+    setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  vp9_set_mv_search_range(x, &ref_mv.as_mv);
+
+  // TODO(jingning) exploiting adaptive motion search control in non-RD
+  // mode decision too.
+  step_param = 6;
+  further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+  mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
+
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+                                   sadpb, further_steps, 1,
+                                   &cpi->fn_ptr[bsize],
+                                   &ref_mv.as_mv, tmp_mv);
+
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  if (scaled_ref_frame) {
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  // TODO(jingning) This step can be merged into full pixel search step in the
+  // re-designed log-diamond search
+  buf_offset = tmp_mv->as_mv.row * stride + tmp_mv->as_mv.col;
+
+  // Find sad for current vector.
+  bestsme = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride,
+                                   xd->plane[0].pre[0].buf + buf_offset,
+                                   stride, 0x7fffffff);
+
+  // scale to 1/8 pixel resolution
+  tmp_mv->as_mv.row = tmp_mv->as_mv.row << 3;
+  tmp_mv->as_mv.col = tmp_mv->as_mv.col << 3;
+
+  // calculate the bit cost on motion vector
+  *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
+                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+
+  return bestsme;
+}
+
+// Non-RD inter mode selection for one block. For every enabled reference
+// frame a full-pixel NEWMV search is run, then NEARESTMV..NEWMV are ranked by
+// RDCOST(mode rate, squared SAD) and the best mode/ref/MV is written to the
+// block's MB_MODE_INFO and copied into ctx->mic. Only *returnrate (INT_MAX) is
+// set; *returndistortion is untouched and the return value is always INT64_MAX.
+// TODO(jingning) placeholder for inter-frame non-RD mode decision.
+// this needs various further optimizations. to be continued..
+int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile,
+                            int mi_row, int mi_col,
+                            int *returnrate, int64_t *returndistortion,
+                            BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  MB_PREDICTION_MODE this_mode;
+  MV_REFERENCE_FRAME ref_frame;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int64_t best_rd = INT64_MAX;
+  int64_t this_rd;
+
+  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+
+  x->skip = cpi->active_map_enabled && x->active_ptr[0] == 0;
+
+  // initialize mode decisions
+  *returnrate = INT_MAX;
+  vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
+  mbmi->sb_type = bsize;
+  mbmi->ref_frame[0] = NONE;
+  mbmi->ref_frame[1] = NONE;
+  mbmi->tx_size = MIN(max_txsize_lookup[bsize],
+                      tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+                             ref_frame, block_size, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    int rate_mv = 0;
+
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+      continue;
+
+    // Select prediction reference frames.
+    xd->plane[0].pre[0] = yv12_mb[ref_frame][0];
+
+    x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
+        full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+                                 &frame_mv[NEWMV][ref_frame], &rate_mv);
+
+    if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
+      continue;
+
+    clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
+    clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
+
+    for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+      int rate = x->inter_mode_cost[mbmi->mode_context[ref_frame]]
+                                   [INTER_OFFSET(this_mode)];
+      // Widen before squaring so a large SAD cannot overflow a narrower
+      // integer type.
+      const int64_t sad = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)];
+      const int64_t dist = sad * sad;
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      if (this_rd < best_rd) {
+        best_rd = this_rd;
+        mbmi->mode = this_mode;
+        mbmi->ref_frame[0] = ref_frame;
+        mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+      }
+    }
+  }
+
+  // TODO(jingning) sub-pixel motion search, if NEWMV is chosen
+  // TODO(jingning) intra prediction search, if the best SAD is above a certain
+  // threshold.
+
+  // store mode decisions
+  ctx->mic = *xd->mi_8x8[0];
+
+  return INT64_MAX;
+}
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
new file mode 100644
index 0000000..32750fa
--- /dev/null
+++ b/vp9/encoder/vp9_pickmode.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_PICKMODE_H_
+#define VP9_ENCODER_VP9_PICKMODE_H_
+
+#include "vp9/encoder/vp9_onyx_int.h"
+
+// Picks an inter prediction mode for one block without full RD search.
+// The decision is stored in ctx->mic; *returnrate is set to INT_MAX and
+// *returndistortion is left unmodified. Currently always returns INT64_MAX.
+int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            const struct TileInfo *const tile,
+                            int mi_row, int mi_col,
+                            int *returnrate,
+                            int64_t *returndistortion,
+                            BLOCK_SIZE bsize,
+                            PICK_MODE_CONTEXT *ctx);
+
+#endif  // VP9_ENCODER_VP9_PICKMODE_H_
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index aefef53..4d2d43a 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -258,25 +258,27 @@
 // Update the buffer level: leaky bucket model.
 void vp9_update_buffer_level(VP9_COMP *const cpi, int encoded_frame_size) {
   VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
   // Non-viewable frames are a special case and are treated as pure overhead.
   if (!cm->show_frame) {
-    cpi->rc.bits_off_target -= encoded_frame_size;
+    rc->bits_off_target -= encoded_frame_size;
   } else {
-    cpi->rc.bits_off_target += cpi->rc.av_per_frame_bandwidth -
-        encoded_frame_size;
+    rc->bits_off_target += rc->av_per_frame_bandwidth - encoded_frame_size;
   }
   // Clip the buffer level to the maximum specified buffer size.
-  if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size) {
-    cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
+  if (rc->bits_off_target > cpi->oxcf.maximum_buffer_size) {
+    rc->bits_off_target = cpi->oxcf.maximum_buffer_size;
   }
-  cpi->rc.buffer_level = cpi->rc.bits_off_target;
+  rc->buffer_level = rc->bits_off_target;
 }
 
 int vp9_drop_frame(VP9_COMP *const cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
   if (!cpi->oxcf.drop_frames_water_mark) {
     return 0;
   } else {
-    if (cpi->rc.buffer_level < 0) {
+    if (rc->buffer_level < 0) {
       // Always drop if buffer is below 0.
       return 1;
     } else {
@@ -284,23 +286,23 @@
       // (starting with the next frame) until it increases back over drop_mark.
       int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
           cpi->oxcf.optimal_buffer_level / 100);
-      if ((cpi->rc.buffer_level > drop_mark) &&
-          (cpi->rc.decimation_factor > 0)) {
-        --cpi->rc.decimation_factor;
-      } else if (cpi->rc.buffer_level <= drop_mark &&
-          cpi->rc.decimation_factor == 0) {
-        cpi->rc.decimation_factor = 1;
+      if ((rc->buffer_level > drop_mark) &&
+          (rc->decimation_factor > 0)) {
+        --rc->decimation_factor;
+      } else if (rc->buffer_level <= drop_mark &&
+          rc->decimation_factor == 0) {
+        rc->decimation_factor = 1;
       }
-      if (cpi->rc.decimation_factor > 0) {
-        if (cpi->rc.decimation_count > 0) {
-          --cpi->rc.decimation_count;
+      if (rc->decimation_factor > 0) {
+        if (rc->decimation_count > 0) {
+          --rc->decimation_count;
           return 1;
         } else {
-          cpi->rc.decimation_count = cpi->rc.decimation_factor;
+          rc->decimation_count = rc->decimation_factor;
           return 0;
         }
       } else {
-        cpi->rc.decimation_count = 0;
+        rc->decimation_count = 0;
         return 0;
       }
     }
@@ -314,63 +316,65 @@
   // If buffer is below the optimal level, let the active_worst_quality go from
   // ambient Q (at buffer = optimal level) to worst_quality level
   // (at buffer = critical level).
-  int active_worst_quality = cpi->rc.active_worst_quality;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
+  int active_worst_quality = rc->active_worst_quality;
   // Maximum limit for down adjustment, ~20%.
   int max_adjustment_down = active_worst_quality / 5;
   // Buffer level below which we push active_worst to worst_quality.
-  int critical_level = cpi->oxcf.optimal_buffer_level >> 2;
+  int critical_level = oxcf->optimal_buffer_level >> 2;
   int adjustment = 0;
   int buff_lvl_step = 0;
-  if (cpi->rc.buffer_level > cpi->oxcf.optimal_buffer_level) {
+  if (rc->buffer_level > oxcf->optimal_buffer_level) {
     // Adjust down.
     if (max_adjustment_down) {
-      buff_lvl_step = (int)((cpi->oxcf.maximum_buffer_size -
-          cpi->oxcf.optimal_buffer_level) / max_adjustment_down);
-      if (buff_lvl_step) {
-        adjustment = (int)((cpi->rc.buffer_level -
-            cpi->oxcf.optimal_buffer_level) / buff_lvl_step);
-      }
+      buff_lvl_step = (int)((oxcf->maximum_buffer_size -
+          oxcf->optimal_buffer_level) / max_adjustment_down);
+      if (buff_lvl_step)
+        adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
+                            buff_lvl_step);
       active_worst_quality -= adjustment;
     }
-  } else if (cpi->rc.buffer_level > critical_level) {
+  } else if (rc->buffer_level > critical_level) {
     // Adjust up from ambient Q.
     if (critical_level) {
-      buff_lvl_step = (cpi->oxcf.optimal_buffer_level - critical_level);
+      buff_lvl_step = (oxcf->optimal_buffer_level - critical_level);
       if (buff_lvl_step) {
-        adjustment =
-            (cpi->rc.worst_quality - cpi->rc.avg_frame_qindex[INTER_FRAME]) *
-            (cpi->oxcf.optimal_buffer_level - cpi->rc.buffer_level) /
-            buff_lvl_step;
+        adjustment = (rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
+                         (oxcf->optimal_buffer_level - rc->buffer_level) /
+                             buff_lvl_step;
       }
-      active_worst_quality = cpi->rc.avg_frame_qindex[INTER_FRAME] + adjustment;
+      active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
     }
   } else {
     // Set to worst_quality if buffer is below critical level.
-    active_worst_quality = cpi->rc.worst_quality;
+    active_worst_quality = rc->worst_quality;
   }
   return active_worst_quality;
 }
 
 // Adjust target frame size with respect to the buffering constraints:
 static int target_size_from_buffer_level(const VP9_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
   int this_frame_target = cpi->rc.this_frame_target;
   int percent_low = 0;
   int percent_high = 0;
-  int one_percent_bits = (int)(1 + cpi->oxcf.optimal_buffer_level / 100);
-  if (cpi->rc.buffer_level < cpi->oxcf.optimal_buffer_level) {
-    percent_low = (int)((cpi->oxcf.optimal_buffer_level - cpi->rc.buffer_level)
-        / one_percent_bits);
-    if (percent_low > cpi->oxcf.under_shoot_pct) {
-      percent_low = cpi->oxcf.under_shoot_pct;
-    }
+  int one_percent_bits = (int)(1 + oxcf->optimal_buffer_level / 100);
+  if (rc->buffer_level < oxcf->optimal_buffer_level) {
+    percent_low = (int)((oxcf->optimal_buffer_level - rc->buffer_level) /
+                      one_percent_bits);
+    if (percent_low > oxcf->under_shoot_pct)
+      percent_low = oxcf->under_shoot_pct;
+
     // Lower the target bandwidth for this frame.
     this_frame_target -= (this_frame_target * percent_low) / 200;
-  } else  if (cpi->rc.buffer_level > cpi->oxcf.optimal_buffer_level) {
-    percent_high = (int)((cpi->rc.buffer_level - cpi->oxcf.optimal_buffer_level)
-        / one_percent_bits);
-    if (percent_high > cpi->oxcf.over_shoot_pct) {
-      percent_high = cpi->oxcf.over_shoot_pct;
-    }
+  } else  if (rc->buffer_level > oxcf->optimal_buffer_level) {
+    percent_high = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
+                     one_percent_bits);
+    if (percent_high > oxcf->over_shoot_pct)
+      percent_high = oxcf->over_shoot_pct;
+
     // Increase the target bandwidth for this frame.
     this_frame_target += (this_frame_target * percent_high) / 200;
   }
@@ -378,25 +382,27 @@
 }
 
 static void calc_pframe_target_size(VP9_COMP *const cpi) {
-  int min_frame_target = MAX(cpi->rc.min_frame_bandwidth,
-                             cpi->rc.av_per_frame_bandwidth >> 5);
+  RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
+  int min_frame_target = MAX(rc->min_frame_bandwidth,
+                             rc->av_per_frame_bandwidth >> 5);
   if (cpi->refresh_alt_ref_frame) {
     // Special alt reference frame case
     // Per frame bit target for the alt ref frame
-    cpi->rc.per_frame_bandwidth = cpi->twopass.gf_bits;
-    cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
+    rc->per_frame_bandwidth = cpi->twopass.gf_bits;
+    rc->this_frame_target = rc->per_frame_bandwidth;
   } else {
     // Normal frames (gf and inter).
-    cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
+    rc->this_frame_target = rc->per_frame_bandwidth;
     // Set target frame size based on buffer level, for 1 pass CBR.
-    if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+    if (cpi->pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
       // Need to decide how low min_frame_target should be for 1-pass CBR.
       // For now, use: cpi->rc.av_per_frame_bandwidth / 16:
-      min_frame_target = MAX(cpi->rc.av_per_frame_bandwidth >> 4,
+      min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
                              FRAME_OVERHEAD_BITS);
-      cpi->rc.this_frame_target = target_size_from_buffer_level(cpi);
+      rc->this_frame_target = target_size_from_buffer_level(cpi);
       // Adjust qp-max based on buffer level.
-      cpi->rc.active_worst_quality =
+      rc->active_worst_quality =
           adjust_active_worst_quality_from_buffer_level(cpi);
     }
   }
@@ -407,25 +413,24 @@
   // not capable of recovering all the extra bits we have spent in the KF or GF,
   // then the remainder will have to be recovered over a longer time span via
   // other buffer / rate control mechanisms.
-  if (cpi->rc.this_frame_target < min_frame_target) {
-    cpi->rc.this_frame_target = min_frame_target;
-  }
+  if (rc->this_frame_target < min_frame_target)
+    rc->this_frame_target = min_frame_target;
 
   // Adjust target frame size for Golden Frames:
   if (cpi->refresh_golden_frame) {
     // If we are using alternate ref instead of gf then do not apply the boost
     // It will instead be applied to the altref update
     // Jims modified boost
-    if (!cpi->rc.source_alt_ref_active) {
+    if (!rc->source_alt_ref_active) {
       // The spend on the GF is defined in the two pass code
       // for two pass encodes
-      cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
+      rc->this_frame_target = rc->per_frame_bandwidth;
     } else {
       // If there is an active ARF at this location use the minimum
       // bits on this frame even if it is a constructed arf.
       // The active maximum quantizer insures that an appropriate
       // number of bits will be spent if needed for constructed ARFs.
-      cpi->rc.this_frame_target = 0;
+      rc->this_frame_target = 0;
     }
   }
 }
@@ -576,36 +581,34 @@
 }
 
 int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi,
-                                      int *bottom_index,
-                                      int *top_index) {
+                                      int *bottom_index, int *top_index) {
   const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
   int active_best_quality;
-  int active_worst_quality = cpi->rc.active_worst_quality;
+  int active_worst_quality = rc->active_worst_quality;
   int q;
 
   if (frame_is_intra_only(cm)) {
-    active_best_quality = cpi->rc.best_quality;
+    active_best_quality = rc->best_quality;
 #if !CONFIG_MULTIPLE_ARF
     // Handle the special case for key frames forced when we have75 reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
-    if (cpi->rc.this_key_frame_forced) {
-      int delta_qindex;
-      int qindex = cpi->rc.last_boosted_qindex;
+    if (rc->this_key_frame_forced) {
+      int qindex = rc->last_boosted_qindex;
       double last_boosted_q = vp9_convert_qindex_to_q(qindex);
-
-      delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
-                                        (last_boosted_q * 0.75));
-      active_best_quality = MAX(qindex + delta_qindex,
-                                cpi->rc.best_quality);
-    } else if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
-      // not first frame of one pass
+      int delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
+                                            (last_boosted_q * 0.75));
+      active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+    } else if (!(cpi->pass == 0 && cm->current_video_frame == 0)) {
+      // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
       double q_val;
 
       // Baseline value derived from cpi->active_worst_quality and kf boost
       active_best_quality = get_active_quality(active_worst_quality,
-                                               cpi->rc.kf_boost,
+                                               rc->kf_boost,
                                                kf_low, kf_high,
                                                kf_low_motion_minq,
                                                kf_high_motion_minq);
@@ -631,29 +634,29 @@
     active_best_quality = active_worst_quality
         + vp9_compute_qdelta(cpi, current_q, current_q * 0.3);
 #endif
-  } else if (!cpi->rc.is_src_frame_alt_ref &&
+  } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
 
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
-    if (cpi->rc.frames_since_key > 1 &&
-        cpi->rc.avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
-      q = cpi->rc.avg_frame_qindex[INTER_FRAME];
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
     } else {
       q = active_worst_quality;
     }
     // For constrained quality dont allow Q less than the cq level
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+    if (oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) {
       if (q < cpi->cq_target_quality)
         q = cpi->cq_target_quality;
-      if (cpi->rc.frames_since_key > 1) {
-        active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
+      if (rc->frames_since_key > 1) {
+        active_best_quality = get_active_quality(q, rc->gfu_boost,
                                                  gf_low, gf_high,
                                                  afq_low_motion_minq,
                                                  afq_high_motion_minq);
       } else {
-        active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
+        active_best_quality = get_active_quality(q, rc->gfu_boost,
                                                  gf_low, gf_high,
                                                  gf_low_motion_minq,
                                                  gf_high_motion_minq);
@@ -661,46 +664,46 @@
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
 
-    } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+    } else if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) {
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cpi->cq_target_quality;
       } else {
-        if (cpi->rc.frames_since_key > 1) {
+        if (rc->frames_since_key > 1) {
           active_best_quality = get_active_quality(
-              q, cpi->rc.gfu_boost, gf_low, gf_high,
+              q, rc->gfu_boost, gf_low, gf_high,
               afq_low_motion_minq, afq_high_motion_minq);
         } else {
           active_best_quality = get_active_quality(
-              q, cpi->rc.gfu_boost, gf_low, gf_high,
+              q, rc->gfu_boost, gf_low, gf_high,
               gf_low_motion_minq, gf_high_motion_minq);
         }
       }
     } else {
       active_best_quality = get_active_quality(
-          q, cpi->rc.gfu_boost, gf_low, gf_high,
+          q, rc->gfu_boost, gf_low, gf_high,
           gf_low_motion_minq, gf_high_motion_minq);
     }
   } else {
-    if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+    if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) {
       active_best_quality = cpi->cq_target_quality;
     } else {
       if (cpi->pass == 0 &&
-          cpi->rc.avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+          rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
         // 1-pass: for now, use the average Q for the active_best, if its lower
         // than active_worst.
-        active_best_quality = inter_minq[cpi->rc.avg_frame_qindex[INTER_FRAME]];
+        active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]];
       else
         active_best_quality = inter_minq[active_worst_quality];
 
       // For the constrained quality mode we don't want
       // q to fall below the cq level.
-      if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+      if ((oxcf->end_usage == USAGE_CONSTRAINED_QUALITY) &&
           (active_best_quality < cpi->cq_target_quality)) {
         // If we are strongly undershooting the target rate in the last
         // frames then use the user passed in cq value not the auto
         // cq value.
-        if (cpi->rc.rolling_actual_bits < cpi->rc.min_frame_bandwidth)
-          active_best_quality = cpi->oxcf.cq_level;
+        if (rc->rolling_actual_bits < rc->min_frame_bandwidth)
+          active_best_quality = oxcf->cq_level;
         else
           active_best_quality = cpi->cq_target_quality;
       }
@@ -708,14 +711,14 @@
   }
 
   // Clip the active best and worst quality values to limits
-  if (active_worst_quality > cpi->rc.worst_quality)
-    active_worst_quality = cpi->rc.worst_quality;
+  if (active_worst_quality > rc->worst_quality)
+    active_worst_quality = rc->worst_quality;
 
-  if (active_best_quality < cpi->rc.best_quality)
-    active_best_quality = cpi->rc.best_quality;
+  if (active_best_quality < rc->best_quality)
+    active_best_quality = rc->best_quality;
 
-  if (active_best_quality > cpi->rc.worst_quality)
-    active_best_quality = cpi->rc.worst_quality;
+  if (active_best_quality > rc->worst_quality)
+    active_best_quality = rc->worst_quality;
 
   if (active_worst_quality < active_best_quality)
     active_worst_quality = active_best_quality;
@@ -725,29 +728,34 @@
 
 #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
   // Limit Q range for the adaptive loop.
-  if (cm->frame_type == KEY_FRAME && !cpi->rc.this_key_frame_forced) {
-    if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
+  if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced) {
+    if (!(cpi->pass == 0 && cm->current_video_frame == 0)) {
       *top_index =
           (active_worst_quality + active_best_quality * 3) / 4;
     }
-  } else if (!cpi->rc.is_src_frame_alt_ref &&
-             (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
+  } else if (!rc->is_src_frame_alt_ref &&
+             (oxcf->end_usage != USAGE_STREAM_FROM_SERVER) &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     *top_index =
       (active_worst_quality + active_best_quality) / 2;
   }
 #endif
 
-  if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+  if (oxcf->end_usage == USAGE_CONSTANT_QUALITY) {
     q = active_best_quality;
   // Special case code to try and match quality with forced key frames
-  } else if ((cm->frame_type == KEY_FRAME) && cpi->rc.this_key_frame_forced) {
-    q = cpi->rc.last_boosted_qindex;
+  } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+    q = rc->last_boosted_qindex;
   } else {
-    q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+    q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                           active_best_quality, active_worst_quality);
-    if (q > *top_index)
-      q = *top_index;
+    if (q > *top_index) {
+      // Special case when we are targeting the max allowed rate
+      if (cpi->rc.this_frame_target >= cpi->rc.max_frame_bandwidth)
+        *top_index = q;
+      else
+        q = *top_index;
+    }
   }
 #if CONFIG_MULTIPLE_ARF
   // Force the quantizer determined by the coding order pattern.
@@ -766,12 +774,11 @@
     printf("frame:%d q:%d\n", cm->current_video_frame, q);
   }
 #endif
-  assert(*top_index <= cpi->rc.worst_quality &&
-         *top_index >= cpi->rc.best_quality);
-  assert(*bottom_index <= cpi->rc.worst_quality &&
-         *bottom_index >= cpi->rc.best_quality);
-  assert(q <= cpi->rc.worst_quality &&
-         q >= cpi->rc.best_quality);
+  assert(*top_index <= rc->worst_quality &&
+         *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
   return q;
 }
 
@@ -810,6 +817,11 @@
     *frame_under_shoot_limit -= 200;
     if (*frame_under_shoot_limit < 0)
       *frame_under_shoot_limit = 0;
+
+    // Clip to maximum allowed rate for a frame.
+    if (*frame_over_shoot_limit > cpi->rc.max_frame_bandwidth) {
+      *frame_over_shoot_limit = cpi->rc.max_frame_bandwidth;
+    }
   }
 }
 
@@ -822,6 +834,10 @@
   else
     calc_pframe_target_size(cpi);
 
+  // Clip the frame target to the maximum allowed value.
+  if (cpi->rc.this_frame_target > cpi->rc.max_frame_bandwidth)
+    cpi->rc.this_frame_target = cpi->rc.max_frame_bandwidth;
+
   // Target rate per SB64 (including partial SB64s.
   cpi->rc.sb64_target_rate = ((int64_t)cpi->rc.this_frame_target * 64 * 64) /
                              (cpi->common.width * cpi->common.height);
@@ -843,24 +859,26 @@
 }
 
 static void update_golden_frame_stats(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
   // Update the Golden frame usage counts.
   if (cpi->refresh_golden_frame) {
     // this frame refreshes means next frames don't unless specified by user
-    cpi->rc.frames_since_golden = 0;
+    rc->frames_since_golden = 0;
 
-    if (!cpi->rc.source_alt_ref_pending)
-      cpi->rc.source_alt_ref_active = 0;
+    if (!rc->source_alt_ref_pending)
+      rc->source_alt_ref_active = 0;
 
     // Decrement count down till next gf
-    if (cpi->rc.frames_till_gf_update_due > 0)
-      cpi->rc.frames_till_gf_update_due--;
+    if (rc->frames_till_gf_update_due > 0)
+      rc->frames_till_gf_update_due--;
 
   } else if (!cpi->refresh_alt_ref_frame) {
     // Decrement count down till next gf
-    if (cpi->rc.frames_till_gf_update_due > 0)
-      cpi->rc.frames_till_gf_update_due--;
+    if (rc->frames_till_gf_update_due > 0)
+      rc->frames_till_gf_update_due--;
 
-    cpi->rc.frames_since_golden++;
+    rc->frames_since_golden++;
   }
 }
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 64d8cf8..fa6b362 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -37,8 +37,6 @@
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
 
-#define INVALID_MV 0x80008000
-
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
@@ -113,14 +111,6 @@
 static int rd_thresh_block_size_factor[BLOCK_SIZES] =
   {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
 
-#define RD_THRESH_MAX_FACT 64
-#define RD_THRESH_INC      1
-#define RD_THRESH_POW      1.25
-#define RD_MULT_EPB_RATIO  64
-
-#define MV_COST_WEIGHT      108
-#define MV_COST_WEIGHT_SUB  120
-
 static int raster_block_offset(BLOCK_SIZE plane_bsize,
                                int raster_block, int stride) {
   const int bw = b_width_log2(plane_bsize);
@@ -134,27 +124,27 @@
   return base + raster_block_offset(plane_bsize, raster_block, stride);
 }
 
-static void fill_mode_costs(VP9_COMP *c) {
-  VP9_COMMON *const cm = &c->common;
+static void fill_mode_costs(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  FRAME_CONTEXT *const fc = &cm->fc;
   int i, j;
 
   for (i = 0; i < INTRA_MODES; i++)
     for (j = 0; j < INTRA_MODES; j++)
-      vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
+      vp9_cost_tokens((int *)x->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
                       vp9_intra_mode_tree);
 
   // TODO(rbultje) separate tables for superblock costing?
-  vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
-                  vp9_intra_mode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
-                  cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
-                  vp9_kf_uv_mode_prob[INTRA_MODES - 1],
-                  vp9_intra_mode_tree);
+  vp9_cost_tokens(x->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
+  vp9_cost_tokens(x->intra_uv_mode_cost[1],
+                  fc->uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
+  vp9_cost_tokens(x->intra_uv_mode_cost[0],
+                  vp9_kf_uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
 
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
-                    cm->fc.switchable_interp_prob[i],
+    vp9_cost_tokens((int *)x->switchable_interp_costs[i],
+                    fc->switchable_interp_prob[i],
                     vp9_switchable_interp_tree);
 }
 
@@ -198,9 +188,9 @@
   // This is to make it easier to resolve the impact of experimental changes
   // to the quantizer tables.
   for (i = 0; i < QINDEX_RANGE; i++) {
-    sad_per_bit16lut[i] =
-      (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
+    const double q = vp9_convert_qindex_to_q(i);
+    sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107);
+    sad_per_bit4lut[i] = (int)(0.063 * q + 2.742);
   }
 }
 
@@ -234,36 +224,30 @@
 static void set_block_thresholds(VP9_COMP *cpi) {
   int i, bsize, segment_id;
   VP9_COMMON *cm = &cpi->common;
+  SPEED_FEATURES *sf = &cpi->sf;
 
   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
-    int q;
-    int segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-    segment_qindex = clamp(segment_qindex + cm->y_dc_delta_q, 0, MAXQ);
-    q = compute_rd_thresh_factor(segment_qindex);
+    const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id,
+                                            cm->base_qindex) + cm->y_dc_delta_q,
+                             0, MAXQ);
+    const int q = compute_rd_thresh_factor(qindex);
 
     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
       // Threshold here seem unecessarily harsh but fine given actual
       // range of values used for cpi->sf.thresh_mult[]
-      int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+      const int t = q * rd_thresh_block_size_factor[bsize];
+      const int thresh_max = INT_MAX / t;
 
-      for (i = 0; i < MAX_MODES; ++i) {
-        if (cpi->sf.thresh_mult[i] < thresh_max) {
-          cpi->rd_threshes[segment_id][bsize][i] =
-              cpi->sf.thresh_mult[i] * q *
-              rd_thresh_block_size_factor[bsize] / 4;
-        } else {
-          cpi->rd_threshes[segment_id][bsize][i] = INT_MAX;
-        }
-      }
+      for (i = 0; i < MAX_MODES; ++i)
+        cpi->rd_threshes[segment_id][bsize][i] =
+            sf->thresh_mult[i] < thresh_max ? sf->thresh_mult[i] * t / 4
+                                            : INT_MAX;
 
       for (i = 0; i < MAX_REFS; ++i) {
-        if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) {
-          cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
-              cpi->sf.thresh_mult_sub8x8[i] * q *
-              rd_thresh_block_size_factor[bsize] / 4;
-        } else {
-          cpi->rd_thresh_sub8x8[segment_id][bsize][i] = INT_MAX;
-        }
+        cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
+            sf->thresh_mult_sub8x8[i] < thresh_max
+                ? sf->thresh_mult_sub8x8[i] * t / 4
+                : INT_MAX;
       }
     }
   }
@@ -271,6 +255,7 @@
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
+  MACROBLOCK *x = &cpi->mb;
   int qindex, i;
 
   vp9_clear_system_state();  // __asm emms;
@@ -284,35 +269,34 @@
   cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
   cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
 
-  cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
-  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+  // Bump a zero result to 1; the test must see the value just computed.
+  x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
+  x->errorperbit += (x->errorperbit == 0);
 
   vp9_set_speed_features(cpi);
 
-  cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
-                              cm->frame_type != KEY_FRAME) ?
-                              0 : 1;
+  x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+                         cm->frame_type != KEY_FRAME) ? 0 : 1;
 
   set_block_thresholds(cpi);
 
-  fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
+  fill_token_costs(x->token_costs, cm->fc.coef_probs);
 
   for (i = 0; i < PARTITION_CONTEXTS; i++)
-    vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i),
+    vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
                     vp9_partition_tree);
 
-  /*rough estimate for costing*/
   fill_mode_costs(cpi);
 
   if (!frame_is_intra_only(cm)) {
-    vp9_build_nmv_cost_table(
-        cpi->mb.nmvjointcost,
-        cm->allow_high_precision_mv ? cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
-        &cm->fc.nmvc,
-        cm->allow_high_precision_mv, 1, 1);
+    vp9_build_nmv_cost_table(x->nmvjointcost,
+                             cm->allow_high_precision_mv ? x->nmvcost_hp
+                                                         : x->nmvcost,
+                             &cm->fc.nmvc,
+                             cm->allow_high_precision_mv, 1, 1);
 
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-      vp9_cost_tokens((int *)cpi->mb.inter_mode_cost[i],
+      vp9_cost_tokens((int *)x->inter_mode_cost[i],
                       cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
   }
 }
@@ -464,8 +446,8 @@
   BLOCK_SIZE bs;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
-  const int width = 4 << num_4x4_blocks_wide_lookup[bsize];
-  const int height = 4 << num_4x4_blocks_high_lookup[bsize];
+  const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int height = 4 * num_4x4_blocks_high_lookup[bsize];
   int rate_sum = 0;
   int64_t dist_sum = 0;
   const int t = 4 << tx_size;
@@ -640,7 +622,9 @@
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct encode_b_args encode_args = {x, NULL};
+  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct encode_b_args encode_args = {x, NULL, &mbmi->skip_coeff};
+
   int64_t rd1, rd2, rd;
 
   if (args->skip)
@@ -1418,9 +1402,10 @@
   }
 }
 
-void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
-  x->e_mbd.mi_8x8[0]->mbmi.mode = mb;
-  x->e_mbd.mi_8x8[0]->mbmi.mv[0].as_int = mv->as_int;
+void vp9_set_mbmode_and_mvs(MACROBLOCKD *xd, MB_PREDICTION_MODE mode,
+                            const MV *mv) {
+  xd->mi_8x8[0]->mbmi.mode = mode;
+  xd->mi_8x8[0]->mbmi.mv[0].as_mv = *mv;
 }
 
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2138,8 +2123,10 @@
     max_mv = MAX(max_mv,
                  MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
     // only need to check zero mv once
-    if (!this_mv.as_int && zero_seen)
+    if (!this_mv.as_int && zero_seen) {
+      x->mode_sad[ref_frame][i] = x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)];
       continue;
+    }
     zero_seen = zero_seen || !this_mv.as_int;
 
     row_offset = this_mv.as_mv.row >> 3;
@@ -2150,6 +2137,9 @@
     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
                                            ref_y_ptr, ref_y_stride,
                                            0x7fffffff);
+    x->mode_sad[ref_frame][i] = this_sad;
+    if (this_mv.as_int == 0)
+      x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)] = this_sad;
 
     // Note if it is the best so far.
     if (this_sad < best_sad) {
@@ -2158,6 +2148,12 @@
     }
   }
 
+  if (!zero_seen)
+    x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)] =
+        cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
+                                    ref_y_buffer, ref_y_stride,
+                                    0x7fffffff);
+
   // Note the index of the mv that worked best in the reference list.
   x->mv_best_ref_index[ref_frame] = best_index;
   x->max_mv_context[ref_frame] = max_mv;
@@ -2317,7 +2313,7 @@
             frame_type, block_size);
 }
 
-static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
   YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
   int fb = get_ref_frame_idx(cpi, ref_frame);
   int fb_scale = get_scale_ref_frame_idx(cpi, ref_frame);
@@ -2355,7 +2351,7 @@
   int tmp_row_min = x->mv_row_min;
   int tmp_row_max = x->mv_row_max;
 
-  YV12_BUFFER_CONFIG *scaled_ref_frame = get_scaled_ref_frame(cpi, ref);
+  YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref);
 
   int_mv pred_mv[3];
   pred_mv[0] = mbmi->ref_mvs[ref][0];
@@ -2503,8 +2499,8 @@
   struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
   int last_besterr[2] = {INT_MAX, INT_MAX};
   YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
-    get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
-    get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
+    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
   };
 
   for (ref = 0; ref < 2; ++ref) {
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 5732c2b..696cf6b 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -19,6 +19,16 @@
   (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
 #define QIDX_SKIP_THRESH     115
 
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC      1
+#define RD_THRESH_POW      1.25
+#define RD_MULT_EPB_RATIO  64
+
+#define MV_COST_WEIGHT      108
+#define MV_COST_WEIGHT_SUB  120
+
+#define INVALID_MV 0x80008000
+
 struct TileInfo;
 
 int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex);
@@ -36,6 +46,8 @@
                             int_mv frame_near_mv[MAX_REF_FRAMES],
                             struct buf_2d yv12_mb[4][MAX_MB_PLANE]);
 
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame);
+
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *r, int64_t *d, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
@@ -60,8 +72,8 @@
 
 void vp9_init_me_luts();
 
-void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
-                            MB_PREDICTION_MODE mb, int_mv *mv);
+void vp9_set_mbmode_and_mvs(MACROBLOCKD *xd, MB_PREDICTION_MODE mode,
+                            const MV *mv);
 
 void vp9_get_entropy_contexts(TX_SIZE tx_size,
     ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 970a27a..b04e3fe 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -175,6 +175,18 @@
   set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, aoff, loff);
 }
 
+static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree,
+                             int16_t extra, uint8_t token,
+                             uint8_t skip_eob_node,
+                             unsigned int *counts) {
+  (*t)->token = token;
+  (*t)->extra = extra;
+  (*t)->context_tree = context_tree;
+  (*t)->skip_eob_node = skip_eob_node;
+  (*t)++;
+  ++counts[token];
+}
+
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
@@ -186,9 +198,9 @@
   struct macroblockd_plane *pd = &xd->plane[plane];
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   int pt; /* near block/prev token context index */
-  int c = 0, rc = 0;
+  int c = 0;
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
-  const int eob = p->eobs[block];
+  int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
   const int segment_id = mbmi->segment_id;
@@ -197,51 +209,53 @@
   vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
   vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
   const int ref = is_inter_block(mbmi);
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const uint8_t *const band = get_band_translate(tx_size);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
 
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
 
-  assert((!type && !plane) || (type && plane));
-
   pt = get_entropy_context(tx_size, pd->above_context + aoff,
-                                    pd->left_context + loff);
+                           pd->left_context + loff);
   so = get_scan(xd, tx_size, type, block);
   scan = so->scan;
   nb = so->neighbors;
-
   c = 0;
-  do {
-    const int band = band_translate[c];
-    int token;
+  while (c < eob) {
     int v = 0;
-    rc = scan[c];
-    if (c)
+    int skip_eob = 0;
+    v = qcoeff_ptr[scan[c]];
+
+    while (!v) {
+      add_token(&t, coef_probs[type][ref][band[c]][pt], 0, ZERO_TOKEN, skip_eob,
+                counts[type][ref][band[c]][pt]);
+
+      cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt] +=
+          !skip_eob;
+
+      skip_eob = 1;
+      token_cache[scan[c]] = 0;
+      ++c;
       pt = get_coef_context(nb, token_cache, c);
-    if (c < eob) {
-      v = qcoeff_ptr[rc];
-      assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
-
-      t->extra = vp9_dct_value_tokens_ptr[v].extra;
-      token    = vp9_dct_value_tokens_ptr[v].token;
-    } else {
-      token = EOB_TOKEN;
+      v = qcoeff_ptr[scan[c]];
     }
+    add_token(&t, coef_probs[type][ref][band[c]][pt],
+              vp9_dct_value_tokens_ptr[v].extra,
+              vp9_dct_value_tokens_ptr[v].token, skip_eob,
+              counts[type][ref][band[c]][pt]);
 
-    t->token = token;
-    t->context_tree = coef_probs[type][ref][band][pt];
-    t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0);
+    cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt] += !skip_eob;
 
-    assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
-
-    ++counts[type][ref][band][pt][token];
-    if (!t->skip_eob_node)
-      ++cpi->common.counts.eob_branch[tx_size][type][ref][band][pt];
-
-    token_cache[rc] = vp9_pt_energy_class[token];
-    ++t;
-  } while (c < eob && ++c < seg_eob);
+    token_cache[scan[c]] =
+        vp9_pt_energy_class[vp9_dct_value_tokens_ptr[v].token];
+    ++c;
+    pt = get_coef_context(nb, token_cache, c);
+  }
+  if (c < seg_eob) {
+    add_token(&t, coef_probs[type][ref][band[c]][pt], 0, EOB_TOKEN, 0,
+              counts[type][ref][band[c]][pt]);
+    ++cpi->common.counts.eob_branch[tx_size][type][ref][band[c]][pt];
+  }
 
   *tp = t;
 
@@ -285,8 +299,6 @@
   const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
                                               SEG_LVL_SKIP);
   struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache};
-
-  mbmi->skip_coeff = sb_is_skippable(&cpi->mb, bsize);
   if (mbmi->skip_coeff) {
     if (!dry_run)
       cm->counts.mbskip[ctx][1] += skip_inc;
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 0d79346..9ea0f54 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -43,6 +43,7 @@
 VP9_CX_SRCS-yes += encoder/vp9_quantize.h
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.h
+VP9_CX_SRCS-yes += encoder/vp9_pickmode.h
 VP9_CX_SRCS-yes += encoder/vp9_sadmxn.h
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
@@ -55,6 +56,7 @@
 VP9_CX_SRCS-yes += encoder/vp9_quantize.c
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
+VP9_CX_SRCS-yes += encoder/vp9_pickmode.c
 VP9_CX_SRCS-yes += encoder/vp9_sad_c.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
diff --git a/vpxdec.c b/vpxdec.c
index 97ac4bb..731feed 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -40,13 +40,12 @@
   char const *name;
   const vpx_codec_iface_t *(*iface)(void);
   uint32_t fourcc;
-  uint32_t fourcc_mask;
 } ifaces[] = {
 #if CONFIG_VP8_DECODER
-  {"vp8",  vpx_codec_vp8_dx,   VP8_FOURCC_MASK, 0x00FFFFFF},
+  {"vp8",  vpx_codec_vp8_dx,   VP8_FOURCC},
 #endif
 #if CONFIG_VP9_DECODER
-  {"vp9",  vpx_codec_vp9_dx,   VP9_FOURCC_MASK, 0x00FFFFFF},
+  {"vp9",  vpx_codec_vp9_dx,   VP9_FOURCC},
 #endif
 };
 
@@ -132,6 +131,21 @@
 };
 #endif
 
+static int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
+                           FilterMode mode) {
+  assert(src->fmt == VPX_IMG_FMT_I420);
+  assert(dst->fmt == VPX_IMG_FMT_I420);
+  return I420Scale(src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y],
+                   src->planes[VPX_PLANE_U], src->stride[VPX_PLANE_U],
+                   src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V],
+                   src->d_w, src->d_h,
+                   dst->planes[VPX_PLANE_Y], dst->stride[VPX_PLANE_Y],
+                   dst->planes[VPX_PLANE_U], dst->stride[VPX_PLANE_U],
+                   dst->planes[VPX_PLANE_V], dst->stride[VPX_PLANE_V],
+                   dst->d_w, dst->d_h,
+                   mode);
+}
+
 void usage_exit() {
   int i;
 
@@ -167,11 +181,10 @@
   exit(EXIT_FAILURE);
 }
 
-static int raw_read_frame(struct VpxInputContext *input_ctx, uint8_t **buffer,
+static int raw_read_frame(FILE *infile, uint8_t **buffer,
                           size_t *bytes_read, size_t *buffer_size) {
   char raw_hdr[RAW_FRAME_HDR_SZ];
   size_t frame_size = 0;
-  FILE *infile = input_ctx->file;
 
   if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) {
     if (!feof(infile))
@@ -221,10 +234,10 @@
       return webm_read_frame(input->webm_ctx,
                              buf, bytes_in_buffer, buffer_size);
     case FILE_TYPE_RAW:
-      return raw_read_frame(input->vpx_input_ctx,
+      return raw_read_frame(input->vpx_input_ctx->file,
                             buf, bytes_in_buffer, buffer_size);
     case FILE_TYPE_IVF:
-      return ivf_read_frame(input->vpx_input_ctx,
+      return ivf_read_frame(input->vpx_input_ctx->file,
                             buf, bytes_in_buffer, buffer_size);
     default:
       return 1;
@@ -250,11 +263,51 @@
   return out;
 }
 
-void out_put(void *out, const uint8_t *buf, unsigned int len, int do_md5) {
-  if (do_md5) {
-    MD5Update(out, buf, len);
-  } else {
-    (void) fwrite(buf, 1, len, out);
+static int get_image_plane_width(int plane, const vpx_image_t *img) {
+  return (plane > 0 && img->x_chroma_shift > 0) ?
+             (img->d_w + 1) >> img->x_chroma_shift :
+             img->d_w;
+}
+
+static int get_image_plane_height(int plane, const vpx_image_t *img) {
+  return (plane > 0 &&  img->y_chroma_shift > 0) ?
+             (img->d_h + 1) >> img->y_chroma_shift :
+             img->d_h;
+}
+
+static void update_image_md5(const vpx_image_t *img, const int planes[3],
+                             MD5Context *md5) {
+  int i, y;
+
+  for (i = 0; i < 3; ++i) {
+    const int plane = planes[i];
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int w = get_image_plane_width(plane, img);
+    const int h = get_image_plane_height(plane, img);
+
+    for (y = 0; y < h; ++y) {
+      MD5Update(md5, buf, w);
+      buf += stride;
+    }
+  }
+}
+
+static void write_image_file(const vpx_image_t *img, const int planes[3],
+                             FILE *file) {
+  int i, y;
+
+  for (i = 0; i < 3; ++i) {
+    const int plane = planes[i];
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int w = get_image_plane_width(plane, img);
+    const int h = get_image_plane_height(plane, img);
+
+    for (y = 0; y < h; ++y) {
+      fwrite(buf, 1, w, file);
+      buf += stride;
+    }
   }
 }
 
@@ -415,7 +468,6 @@
   } while (*p);
 }
 
-
 int main_loop(int argc, const char **argv_) {
   vpx_codec_ctx_t       decoder;
   char                  *fn = NULL;
@@ -453,8 +505,6 @@
   int                     num_external_frame_buffers = 0;
   int                     fb_lru_cache = 0;
   vpx_codec_frame_buffer_t *frame_buffers = NULL;
-  int                     display_width = 0;
-  int                     display_height = 0;
 
   struct VpxDecInputContext input = {0};
   struct VpxInputContext vpx_input_ctx = {0};
@@ -665,13 +715,12 @@
              vpx_input_ctx.framerate.numerator,
              vpx_input_ctx.framerate.denominator,
              'p');
-    out_put(out, (unsigned char *)buffer,
-            (unsigned int)strlen(buffer), do_md5);
+    fwrite(buffer, 1, strlen(buffer), out);
   }
 
   /* Try to determine the codec from the fourcc. */
   for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
-    if ((vpx_input_ctx.fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc) {
+    if (vpx_input_ctx.fourcc == ifaces[i].fourcc) {
       vpx_codec_iface_t *vpx_iface = ifaces[i].iface();
 
       if (iface && iface != vpx_iface)
@@ -821,7 +870,7 @@
             img->fmt == VPX_IMG_FMT_I422 ? "C422\n" :
             "C420jpeg\n";
 
-        out_put(out, (const unsigned char*)color, strlen(color), do_md5);
+        fwrite(color, 1, strlen(color), out);
       }
 
       if (img && do_scale) {
@@ -830,8 +879,8 @@
           // use the width and height specified in the container. If either of
           // these is set to 0, use the display size set in the first frame
           // header.
-          display_width = vpx_input_ctx.width;
-          display_height = vpx_input_ctx.height;
+          int display_width = vpx_input_ctx.width;
+          int display_height = vpx_input_ctx.height;
           if (!display_width || !display_height) {
             int display_size[2];
             if (vpx_codec_control(&decoder, VP9D_GET_DISPLAY_SIZE,
@@ -848,64 +897,32 @@
                                      display_height, 16);
         }
 
-        if (img->d_w != display_width || img->d_h != display_height) {
-          assert(img->fmt == VPX_IMG_FMT_I420);
-          I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y],
-                    img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U],
-                    img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V],
-                    img->d_w, img->d_h,
-                    scaled_img->planes[VPX_PLANE_Y],
-                    scaled_img->stride[VPX_PLANE_Y],
-                    scaled_img->planes[VPX_PLANE_U],
-                    scaled_img->stride[VPX_PLANE_U],
-                    scaled_img->planes[VPX_PLANE_V],
-                    scaled_img->stride[VPX_PLANE_V],
-                    display_width, display_height,
-                    kFilterBox);
+        if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
+          vpx_image_scale(img, scaled_img, kFilterBox);
           img = scaled_img;
         }
       }
+
       if (img) {
-        unsigned int y;
+        const int PLANES_YUV[] = {VPX_PLANE_Y, VPX_PLANE_U, VPX_PLANE_V};
+        const int PLANES_YVU[] = {VPX_PLANE_Y, VPX_PLANE_V, VPX_PLANE_U};
+
+        const int *planes = flipuv ? PLANES_YVU : PLANES_YUV;
         char out_fn[PATH_MAX];
-        uint8_t *buf;
-        unsigned int c_w =
-            img->x_chroma_shift ? (1 + img->d_w) >> img->x_chroma_shift
-                                : img->d_w;
-        unsigned int c_h =
-            img->y_chroma_shift ? (1 + img->d_h) >> img->y_chroma_shift
-                                : img->d_h;
 
         if (!single_file) {
-          size_t len = sizeof(out_fn) - 1;
-
-          out_fn[len] = '\0';
-          generate_filename(outfile_pattern, out_fn, len - 1,
+          generate_filename(outfile_pattern, out_fn, PATH_MAX,
                             img->d_w, img->d_h, frame_in);
           out = out_open(out_fn, do_md5);
-        } else if (use_y4m)
-          out_put(out, (unsigned char *)"FRAME\n", 6, do_md5);
-
-        buf = img->planes[VPX_PLANE_Y];
-
-        for (y = 0; y < img->d_h; y++) {
-          out_put(out, buf, img->d_w, do_md5);
-          buf += img->stride[VPX_PLANE_Y];
+        } else {
+          if (use_y4m)
+            fwrite("FRAME\n", 1, 6, out);
         }
 
-        buf = img->planes[flipuv ? VPX_PLANE_V : VPX_PLANE_U];
-
-        for (y = 0; y < c_h; y++) {
-          out_put(out, buf, c_w, do_md5);
-          buf += img->stride[VPX_PLANE_U];
-        }
-
-        buf = img->planes[flipuv ? VPX_PLANE_U : VPX_PLANE_V];
-
-        for (y = 0; y < c_h; y++) {
-          out_put(out, buf, c_w, do_md5);
-          buf += img->stride[VPX_PLANE_V];
-        }
+        if (do_md5)
+          update_image_md5(img, planes, out);
+        else
+          write_image_file(img, planes, out);
 
         if (!single_file)
           out_close(out, out_fn, do_md5);
diff --git a/vpxenc.c b/vpxenc.c
index 4c933ce..396e43d 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -362,12 +362,6 @@
                                                "Motion detection threshold");
 static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
                                           "CPU Used (-16..16)");
-static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,
-                                     "Number of token partitions to use, log2");
-static const arg_def_t tile_cols = ARG_DEF(NULL, "tile-columns", 1,
-                                         "Number of tile columns to use, log2");
-static const arg_def_t tile_rows = ARG_DEF(NULL, "tile-rows", 1,
-                                           "Number of tile rows to use, log2");
 static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
                                              "Enable automatic alt reference frames");
 static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
@@ -387,16 +381,10 @@
                                           "Constant/Constrained Quality level");
 static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1,
                                                     "Max I-frame bitrate (pct)");
-static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");
-#if CONFIG_VP9_ENCODER
-static const arg_def_t frame_parallel_decoding  = ARG_DEF(
-    NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
-static const arg_def_t aq_mode  = ARG_DEF(
-    NULL, "aq-mode", 1,
-    "Adaptive q mode (0: off (by default), 1: variance 2: complexity)");
-#endif
 
 #if CONFIG_VP8_ENCODER
+static const arg_def_t token_parts =
+    ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2");
 static const arg_def_t *vp8_args[] = {
   &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
   &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
@@ -414,6 +402,17 @@
 #endif
 
 #if CONFIG_VP9_ENCODER
+static const arg_def_t tile_cols =
+    ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
+static const arg_def_t tile_rows =
+    ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");
+static const arg_def_t frame_parallel_decoding = ARG_DEF(
+    NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
+static const arg_def_t aq_mode = ARG_DEF(
+    NULL, "aq-mode", 1,
+    "Adaptive q mode (0: off (by default), 1: variance 2: complexity)");
+
 static const arg_def_t *vp9_args[] = {
   &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
   &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
@@ -1393,6 +1392,10 @@
 static void open_output_file(struct stream_state *stream,
                              struct VpxEncoderConfig *global) {
   const char *fn = stream->config.out_fn;
+  const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg;
+
+  if (cfg->g_pass == VPX_RC_FIRST_PASS)
+    return;
 
   stream->file = strcmp(fn, "-") ? fopen(fn, "wb") : set_binary_mode(stdout);
 
@@ -1404,18 +1407,23 @@
 
   if (stream->config.write_webm) {
     stream->ebml.stream = stream->file;
-    write_webm_file_header(&stream->ebml, &stream->config.cfg,
+    write_webm_file_header(&stream->ebml, cfg,
                            &global->framerate,
                            stream->config.stereo_fmt,
                            global->codec->fourcc);
-  } else
-    ivf_write_file_header(stream->file, &stream->config.cfg,
-                          global->codec->fourcc, 0);
+  } else {
+    ivf_write_file_header(stream->file, cfg, global->codec->fourcc, 0);
+  }
 }
 
 
 static void close_output_file(struct stream_state *stream,
-                              unsigned int         fourcc) {
+                              unsigned int fourcc) {
+  const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg;
+
+  if (cfg->g_pass == VPX_RC_FIRST_PASS)
+    return;
+
   if (stream->config.write_webm) {
     write_webm_file_footer(&stream->ebml, stream->hash);
     free(stream->ebml.cue_list);
diff --git a/warnings.c b/warnings.c
index a9c7d04..6defde9 100644
--- a/warnings.c
+++ b/warnings.c
@@ -78,9 +78,17 @@
     add_warning(quantizer_warning_string, warning_list);
 }
 
+static void check_lag_in_frames_realtime_deadline(
+    int lag_in_frames,
+    int deadline,
+    struct WarningList *warning_list) {
+  if (deadline == VPX_DL_REALTIME && lag_in_frames != 0)
+    add_warning(lag_in_frames_with_realtime, warning_list);
+}
+
 void check_encoder_config(int disable_prompt,
                           const struct VpxEncoderConfig *global_config,
-                          struct vpx_codec_enc_cfg *stream_config) {
+                          const struct vpx_codec_enc_cfg *stream_config) {
   int num_warnings = 0;
   struct WarningListNode *warning = NULL;
   struct WarningList warning_list = {0};
@@ -88,10 +96,9 @@
   check_quantizer(stream_config->rc_min_quantizer,
                   stream_config->rc_max_quantizer,
                   &warning_list);
-
-  if (global_config->deadline == VPX_DL_REALTIME)
-    stream_config->g_lag_in_frames = 0;
-
+  check_lag_in_frames_realtime_deadline(stream_config->g_lag_in_frames,
+                                        global_config->deadline,
+                                        &warning_list);
   /* Count and print warnings. */
   for (warning = warning_list.warning_node;
        warning != NULL;
diff --git a/warnings.h b/warnings.h
index 90b34c8..ac3a4b6 100644
--- a/warnings.h
+++ b/warnings.h
@@ -20,6 +20,6 @@
  */
 void check_encoder_config(int disable_prompt,
                           const struct VpxEncoderConfig *global_config,
-                          struct vpx_codec_enc_cfg *stream_config);
+                          const struct vpx_codec_enc_cfg *stream_config);
 
 #endif  // WARNINGS_H_
diff --git a/webmdec.c b/webmdec.c
index 0c75d7a..fdcf3a5 100644
--- a/webmdec.c
+++ b/webmdec.c
@@ -82,9 +82,9 @@
 
   codec_id = nestegg_track_codec_id(webm_ctx->nestegg_ctx, i);
   if (codec_id == NESTEGG_CODEC_VP8) {
-    vpx_ctx->fourcc = VP8_FOURCC_MASK;
+    vpx_ctx->fourcc = VP8_FOURCC;
   } else if (codec_id == NESTEGG_CODEC_VP9) {
-    vpx_ctx->fourcc = VP9_FOURCC_MASK;
+    vpx_ctx->fourcc = VP9_FOURCC;
   } else {
     fatal("Not VPx video, quitting.\n");
   }