Merge "General cleanup in vp9_encodeframe.c."
diff --git a/examples.mk b/examples.mk
index b60786a..5c6e42d 100644
--- a/examples.mk
+++ b/examples.mk
@@ -62,6 +62,7 @@
 vp9_spatial_scalable_encoder.SRCS += tools_common.c tools_common.h
 vp9_spatial_scalable_encoder.SRCS += video_common.h
 vp9_spatial_scalable_encoder.SRCS += video_writer.h video_writer.c
+vp9_spatial_scalable_encoder.SRCS += vpxstats.c vpxstats.h
 vp9_spatial_scalable_encoder.GUID   = 4A38598D-627D-4505-9C7B-D4020C84100D
 vp9_spatial_scalable_encoder.DESCRIPTION = Spatial Scalable Encoder
 
diff --git a/examples/vp9_spatial_scalable_encoder.c b/examples/vp9_spatial_scalable_encoder.c
index 98dc3f5..5333b11 100644
--- a/examples/vp9_spatial_scalable_encoder.c
+++ b/examples/vp9_spatial_scalable_encoder.c
@@ -26,6 +26,7 @@
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
+#include "./vpxstats.h"
 
 static const struct arg_enum_list encoding_mode_enum[] = {
   {"i", INTER_LAYER_PREDICTION_I},
@@ -60,12 +61,19 @@
 static const arg_def_t quantizers_keyframe_arg =
     ARG_DEF("qn", "quantizers-keyframe", 1, "quantizers for key frames (lowest "
         "to highest layer)");
+static const arg_def_t passes_arg =
+    ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
+static const arg_def_t pass_arg =
+    ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
+static const arg_def_t fpf_name_arg =
+    ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
 
 static const arg_def_t *svc_args[] = {
   &encoding_mode_arg, &frames_arg,        &width_arg,       &height_arg,
   &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &layers_arg,
   &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,
-  &quantizers_keyframe_arg, NULL
+  &quantizers_keyframe_arg,               &passes_arg,       &pass_arg,
+  &fpf_name_arg,      NULL
 };
 
 static const SVC_ENCODING_MODE default_encoding_mode =
@@ -85,6 +93,10 @@
   const char *output_filename;
   uint32_t frames_to_code;
   uint32_t frames_to_skip;
+  struct VpxInputContext input_ctx;
+  stats_io_t rc_stats;
+  int passes;
+  int pass;
 } AppInput;
 
 static const char *exec_name;
@@ -105,6 +117,9 @@
   char **argi = NULL;
   char **argj = NULL;
   vpx_codec_err_t res;
+  int passes = 0;
+  int pass = 0;
+  const char *fpf_file_name = NULL;
 
   // initialize SvcContext with parameters that will be passed to vpx_svc_init
   svc_ctx->log_level = SVC_LOG_DEBUG;
@@ -159,11 +174,53 @@
       vpx_svc_set_quantizers(svc_ctx, arg.val, 0);
     } else if (arg_match(&arg, &quantizers_keyframe_arg, argi)) {
       vpx_svc_set_quantizers(svc_ctx, arg.val, 1);
+    } else if (arg_match(&arg, &passes_arg, argi)) {
+      passes = arg_parse_uint(&arg);
+      if (passes < 1 || passes > 2) {
+        die("Error: Invalid number of passes (%d)\n", passes);
+      }
+    } else if (arg_match(&arg, &pass_arg, argi)) {
+      pass = arg_parse_uint(&arg);
+      if (pass < 1 || pass > 2) {
+        die("Error: Invalid pass selected (%d)\n", pass);
+      }
+    } else if (arg_match(&arg, &fpf_name_arg, argi)) {
+      fpf_file_name = arg.val;
     } else {
       ++argj;
     }
   }
 
+  if (passes == 0 || passes == 1) {
+    if (pass) {
+      fprintf(stderr, "pass is ignored since there's only one pass\n");
+    }
+    enc_cfg->g_pass = VPX_RC_ONE_PASS;
+  } else {
+    if (pass == 0) {
+      die("pass must be specified when passes is 2\n");
+    }
+
+    if (fpf_file_name == NULL) {
+      die("fpf must be specified when passes is 2\n");
+    }
+
+    if (pass == 1) {
+      enc_cfg->g_pass = VPX_RC_FIRST_PASS;
+      if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 0)) {
+        fatal("Failed to open statistics store");
+      }
+    } else {
+      enc_cfg->g_pass = VPX_RC_LAST_PASS;
+      if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 1)) {
+        fatal("Failed to open statistics store");
+      }
+      enc_cfg->rc_twopass_stats_in = stats_get(&app_input->rc_stats);
+    }
+    app_input->passes = passes;
+    app_input->pass = pass;
+  }
+
   // Check for unrecognized options
   for (argi = argv; *argi; ++argi)
     if (argi[0][0] == '-' && strlen(argi[0]) > 1)
@@ -234,10 +291,14 @@
       VPX_CODEC_OK) {
     die("Failed to get output resolution");
   }
-  writer = vpx_video_writer_open(app_input.output_filename, kContainerIVF,
-                                 &info);
-  if (!writer)
-    die("Failed to open %s for writing\n", app_input.output_filename);
+
+  if (!(app_input.passes == 2 && app_input.pass == 1)) {
+    // We don't save the bitstream for the 1st pass on two pass rate control
+    writer = vpx_video_writer_open(app_input.output_filename, kContainerIVF,
+                                   &info);
+    if (!writer)
+      die("Failed to open %s for writing\n", app_input.output_filename);
+  }
 
   // skip initial frames
   for (i = 0; i < app_input.frames_to_skip; ++i)
@@ -254,11 +315,18 @@
     if (res != VPX_CODEC_OK) {
       die_codec(&codec, "Failed to encode frame");
     }
-    if (vpx_svc_get_frame_size(&svc_ctx) > 0) {
-      vpx_video_writer_write_frame(writer,
-                                   vpx_svc_get_buffer(&svc_ctx),
-                                   vpx_svc_get_frame_size(&svc_ctx),
-                                   pts);
+    if (!(app_input.passes == 2 && app_input.pass == 1)) {
+      if (vpx_svc_get_frame_size(&svc_ctx) > 0) {
+        vpx_video_writer_write_frame(writer,
+                                     vpx_svc_get_buffer(&svc_ctx),
+                                     vpx_svc_get_frame_size(&svc_ctx),
+                                     pts);
+      }
+    }
+    if (vpx_svc_get_rc_stats_buffer_size(&svc_ctx) > 0) {
+      stats_write(&app_input.rc_stats,
+                  vpx_svc_get_rc_stats_buffer(&svc_ctx),
+                  vpx_svc_get_rc_stats_buffer_size(&svc_ctx));
     }
     ++frame_cnt;
     pts += frame_duration;
@@ -269,7 +337,12 @@
   fclose(infile);
   if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
 
-  vpx_video_writer_close(writer);
+  if (app_input.passes == 2)
+    stats_close(&app_input.rc_stats, 1);
+
+  if (writer) {
+    vpx_video_writer_close(writer);
+  }
 
   vpx_img_free(&raw);
 
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 7793cca..022223f 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -576,6 +576,11 @@
 
 VP8_INSTANTIATE_TEST_CASE(DatarateTest, ALL_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9,
-                          ::testing::Values(::libvpx_test::kOnePassGood),
-                          ::testing::Range(2, 6));
+                          ::testing::Values(::libvpx_test::kOnePassGood,
+                          ::libvpx_test::kRealTime),
+                          ::testing::Range(2, 7));
+// TODO(marpan): Speed 7 fails on one of these tests (likely just a threshold
+// needs to be changed), so for now test up to speed 6, and start at 2 (since
+// speed 0 and 1 are slow). Allow speed 7 (for real-time mode) after
+// looking into/fix failing issue.
 }  // namespace
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 868a66a..af8afed 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1213,7 +1213,7 @@
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
       int plane;
 
-      setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
+      vp9_setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
       if (use_420)
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 69452d3..52889f7 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -269,7 +269,7 @@
   xd->left_available  = (mi_col > tile->mi_col_start);
 }
 
-static void set_prev_mi(VP9_COMMON *cm) {
+static INLINE void set_prev_mi(VP9_COMMON *cm) {
   const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
                                        cm->height == cm->last_height &&
                                        !cm->intra_only &&
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index bdcfafa..e357b36 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -385,3 +385,38 @@
     }
   }
 }
+
+void vp9_setup_dst_planes(MACROBLOCKD *xd,
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col) {
+  uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                               src->alpha_buffer};
+  const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                          src->alpha_stride};
+  int i;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL,
+                     pd->subsampling_x, pd->subsampling_y);
+  }
+}
+
+void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *sf) {
+  if (src != NULL) {
+    int i;
+    uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                 src->alpha_buffer};
+    const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                            src->alpha_stride};
+
+    for (i = 0; i < MAX_MB_PLANE; ++i) {
+      struct macroblockd_plane *const pd = &xd->plane[i];
+      setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col,
+                       sf, pd->subsampling_x, pd->subsampling_y);
+    }
+  }
+}
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index dccd609..86f3158 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -57,41 +57,12 @@
   dst->stride = stride;
 }
 
-// TODO(jkoleszar): audit all uses of this that don't set mb_row, mb_col
-static void setup_dst_planes(MACROBLOCKD *xd,
-                             const YV12_BUFFER_CONFIG *src,
-                             int mi_row, int mi_col) {
-  uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
-                               src->alpha_buffer};
-  const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
-                          src->alpha_stride};
-  int i;
+void vp9_setup_dst_planes(MACROBLOCKD *xd, const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col);
 
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    struct macroblockd_plane *const pd = &xd->plane[i];
-    setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL,
-                     pd->subsampling_x, pd->subsampling_y);
-  }
-}
-
-static void setup_pre_planes(MACROBLOCKD *xd, int idx,
-                             const YV12_BUFFER_CONFIG *src,
-                             int mi_row, int mi_col,
-                             const struct scale_factors *sf) {
-  if (src != NULL) {
-    int i;
-    uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
-                                 src->alpha_buffer};
-    const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
-                            src->alpha_stride};
-
-    for (i = 0; i < MAX_MB_PLANE; ++i) {
-      struct macroblockd_plane *const pd = &xd->plane[i];
-      setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col,
-                       sf, pd->subsampling_x, pd->subsampling_y);
-    }
-  }
-}
+void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
+                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const struct scale_factors *sf);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index c04de5d..1cb741f 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -362,7 +362,7 @@
   // as they are always compared to values that are in 1/8th pel units
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
 
-  setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
+  vp9_setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
 }
 
 static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -373,7 +373,8 @@
   if (!vp9_is_valid_scale(&ref_buffer->sf))
     vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                        "Invalid scale factors");
-  setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col, &ref_buffer->sf);
+  vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
+                       &ref_buffer->sf);
   xd->corrupted |= ref_buffer->buf->corrupted;
 }
 
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index 2e00a8b..7d2179d 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -107,7 +107,7 @@
 
       sync_read(lf_sync, r, c);
 
-      setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
+      vp9_setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
       vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride,
                      &lfm);
 
diff --git a/vp9/decoder/vp9_read_bit_buffer.c b/vp9/decoder/vp9_read_bit_buffer.c
new file mode 100644
index 0000000..778a635
--- /dev/null
+++ b/vp9/decoder/vp9_read_bit_buffer.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vp9/decoder/vp9_read_bit_buffer.h"
+
+size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
+  return rb->bit_offset / CHAR_BIT + (rb->bit_offset % CHAR_BIT > 0);
+}
+
+int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
+  const size_t off = rb->bit_offset;
+  const size_t p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
+  if (rb->bit_buffer + p >= rb->bit_buffer_end) {
+    rb->error_handler(rb->error_handler_data);
+    return 0;
+  } else {
+    const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
+    rb->bit_offset = off + 1;
+    return bit;
+  }
+}
+
+int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
+  int value = 0, bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    value |= vp9_rb_read_bit(rb) << bit;
+  return value;
+}
+
+int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
+                               int bits) {
+  const int value = vp9_rb_read_literal(rb, bits);
+  return vp9_rb_read_bit(rb) ? -value : value;
+}
diff --git a/vp9/decoder/vp9_read_bit_buffer.h b/vp9/decoder/vp9_read_bit_buffer.h
index 8cb4247..fc88bd7 100644
--- a/vp9/decoder/vp9_read_bit_buffer.h
+++ b/vp9/decoder/vp9_read_bit_buffer.h
@@ -30,36 +30,13 @@
   vp9_rb_error_handler error_handler;
 };
 
-static size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
-  return rb->bit_offset / CHAR_BIT + (rb->bit_offset % CHAR_BIT > 0);
-}
+size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb);
 
-static int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
-  const size_t off = rb->bit_offset;
-  const size_t p = off / CHAR_BIT;
-  const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
-  if (rb->bit_buffer + p >= rb->bit_buffer_end) {
-    rb->error_handler(rb->error_handler_data);
-    return 0;
-  } else {
-    const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
-    rb->bit_offset = off + 1;
-    return bit;
-  }
-}
+int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb);
 
-static int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
-  int value = 0, bit;
-  for (bit = bits - 1; bit >= 0; bit--)
-    value |= vp9_rb_read_bit(rb) << bit;
-  return value;
-}
+int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits);
 
-static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
-                                      int bits) {
-  const int value = vp9_rb_read_literal(rb, bits);
-  return vp9_rb_read_bit(rb) ? -value : value;
-}
+int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, int bits);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 3826718..585b690 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -35,10 +35,6 @@
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_write_bit_buffer.h"
 
-#ifdef ENTROPY_STATS
-extern unsigned int active_section;
-#endif
-
 static struct vp9_token intra_mode_encodings[INTRA_MODES];
 static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
 static struct vp9_token partition_encodings[PARTITION_TYPES];
@@ -98,13 +94,13 @@
   }
 }
 
-static int write_skip(const VP9_COMP *cpi, int segment_id, MODE_INFO *m,
+static int write_skip(const VP9_COMP *cpi, int segment_id, const MODE_INFO *mi,
                       vp9_writer *w) {
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
-    const int skip = m->mbmi.skip;
+    const int skip = mi->mbmi.skip;
     vp9_write(w, skip, vp9_get_skip_prob(&cpi->common, xd));
     return skip;
   }
@@ -229,126 +225,107 @@
   }
 }
 
-static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
+static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
+                                vp9_writer *w) {
   VP9_COMMON *const cm = &cpi->common;
   const nmv_context *nmvc = &cm->fc.nmvc;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+  const MACROBLOCK *const x = &cpi->mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
   const struct segmentation *const seg = &cm->seg;
-  const MB_MODE_INFO *const mi = &m->mbmi;
-  const MV_REFERENCE_FRAME ref0 = mi->ref_frame[0];
-  const MV_REFERENCE_FRAME ref1 = mi->ref_frame[1];
-  const MB_PREDICTION_MODE mode = mi->mode;
-  const int segment_id = mi->segment_id;
-  const BLOCK_SIZE bsize = mi->sb_type;
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_PREDICTION_MODE mode = mbmi->mode;
+  const int segment_id = mbmi->segment_id;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
-  int skip;
-
-#ifdef ENTROPY_STATS
-  active_section = 9;
-#endif
+  const int is_inter = is_inter_block(mbmi);
+  const int is_compound = has_second_ref(mbmi);
+  int skip, ref;
 
   if (seg->update_map) {
     if (seg->temporal_update) {
-      const int pred_flag = mi->seg_id_predicted;
+      const int pred_flag = mbmi->seg_id_predicted;
       vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
-      vp9_write(bc, pred_flag, pred_prob);
+      vp9_write(w, pred_flag, pred_prob);
       if (!pred_flag)
-        write_segment_id(bc, seg, segment_id);
+        write_segment_id(w, seg, segment_id);
     } else {
-      write_segment_id(bc, seg, segment_id);
+      write_segment_id(w, seg, segment_id);
     }
   }
 
-  skip = write_skip(cpi, segment_id, m, bc);
+  skip = write_skip(cpi, segment_id, mi, w);
 
   if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-    vp9_write(bc, ref0 != INTRA_FRAME, vp9_get_intra_inter_prob(cm, xd));
+    vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
-      !(ref0 != INTRA_FRAME &&
+      !(is_inter &&
         (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
-    write_selected_tx_size(cpi, mi->tx_size, bsize, bc);
+    write_selected_tx_size(cpi, mbmi->tx_size, bsize, w);
   }
 
-  if (ref0 == INTRA_FRAME) {
-#ifdef ENTROPY_STATS
-    active_section = 6;
-#endif
-
+  if (!is_inter) {
     if (bsize >= BLOCK_8X8) {
-      write_intra_mode(bc, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
+      write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
     } else {
       int idx, idy;
-      const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
-      const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
-      for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
-        for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
-          const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode;
-          write_intra_mode(bc, bm, cm->fc.y_mode_prob[0]);
+      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+      const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+      for (idy = 0; idy < 2; idy += num_4x4_h) {
+        for (idx = 0; idx < 2; idx += num_4x4_w) {
+          const MB_PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
+          write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]);
         }
       }
     }
-    write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]);
+    write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]);
   } else {
-    vp9_prob *mv_ref_p;
-    write_ref_frames(cpi, bc);
-    mv_ref_p = cm->fc.inter_mode_probs[mi->mode_context[ref0]];
-
-#ifdef ENTROPY_STATS
-    active_section = 3;
-#endif
+    const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
+    const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx];
+    write_ref_frames(cpi, w);
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
-        write_inter_mode(bc, mode, mv_ref_p);
-        ++cm->counts.inter_mode[mi->mode_context[ref0]][INTER_OFFSET(mode)];
+        write_inter_mode(w, mode, inter_probs);
+        ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(mode)];
       }
     }
 
     if (cm->interp_filter == SWITCHABLE) {
       const int ctx = vp9_get_pred_context_switchable_interp(xd);
-      vp9_write_token(bc, vp9_switchable_interp_tree,
+      vp9_write_token(w, vp9_switchable_interp_tree,
                       cm->fc.switchable_interp_prob[ctx],
-                      &switchable_interp_encodings[mi->interp_filter]);
+                      &switchable_interp_encodings[mbmi->interp_filter]);
     } else {
-      assert(mi->interp_filter == cm->interp_filter);
+      assert(mbmi->interp_filter == cm->interp_filter);
     }
 
     if (bsize < BLOCK_8X8) {
-      const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
-      const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+      const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
       int idx, idy;
-      for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
-        for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+      for (idy = 0; idy < 2; idy += num_4x4_h) {
+        for (idx = 0; idx < 2; idx += num_4x4_w) {
           const int j = idy * 2 + idx;
-          const MB_PREDICTION_MODE b_mode = m->bmi[j].as_mode;
-          write_inter_mode(bc, b_mode, mv_ref_p);
-          ++cm->counts.inter_mode[mi->mode_context[ref0]][INTER_OFFSET(b_mode)];
+          const MB_PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+          write_inter_mode(w, b_mode, inter_probs);
+          ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
           if (b_mode == NEWMV) {
-#ifdef ENTROPY_STATS
-            active_section = 11;
-#endif
-            vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv,
-                          &mi->ref_mvs[ref0][0].as_mv, nmvc, allow_hp);
-
-            if (has_second_ref(mi))
-              vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv,
-                            &mi->ref_mvs[ref1][0].as_mv, nmvc, allow_hp);
+            for (ref = 0; ref < 1 + is_compound; ++ref)
+              vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
+                            &mbmi->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
+                            nmvc, allow_hp);
           }
         }
       }
-    } else if (mode == NEWMV) {
-#ifdef ENTROPY_STATS
-      active_section = 5;
-#endif
-      vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv,
-                    &mi->ref_mvs[ref0][0].as_mv, nmvc, allow_hp);
-
-      if (has_second_ref(mi))
-        vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv,
-                      &mi->ref_mvs[ref1][0].as_mv, nmvc, allow_hp);
+    } else {
+      if (mode == NEWMV) {
+        for (ref = 0; ref < 1 + is_compound; ++ref)
+          vp9_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
+                        &mbmi->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, nmvc,
+                        allow_hp);
+      }
     }
   }
 }
@@ -410,14 +387,8 @@
                  cm->mi_rows, cm->mi_cols);
   if (frame_is_intra_only(cm)) {
     write_mb_modes_kf(cpi, xd->mi_8x8, w);
-#ifdef ENTROPY_STATS
-    active_section = 8;
-#endif
   } else {
     pack_inter_mode_mvs(cpi, m, w);
-#ifdef ENTROPY_STATS
-    active_section = 1;
-#endif
   }
 
   assert(*tok < tok_end);
@@ -1151,18 +1122,10 @@
     encode_txfm_probs(cm, &header_bc);
 
   update_coef_probs(cpi, &header_bc);
-
-#ifdef ENTROPY_STATS
-  active_section = 2;
-#endif
-
   update_skip_probs(cm, &header_bc);
 
   if (!frame_is_intra_only(cm)) {
     int i;
-#ifdef ENTROPY_STATS
-    active_section = 1;
-#endif
 
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
       prob_diff_update(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i],
@@ -1236,13 +1199,6 @@
 
   vp9_compute_update_table();
 
-#ifdef ENTROPY_STATS
-  if (cm->frame_type == INTER_FRAME)
-    active_section = 0;
-  else
-    active_section = 7;
-#endif
-
   vp9_clear_system_state();
 
   first_part_size = write_compressed_header(cpi, data);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index c4ada8b..76ef270 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -601,7 +601,7 @@
   mbmi = &xd->mi_8x8[0]->mbmi;
 
   // Set up destination pointers
-  setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
+  vp9_setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
 
   // Set up limit values for MV components
   // mv beyond the range do not produce new/different prediction block
@@ -2076,8 +2076,9 @@
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
 
   // TODO(jkoleszar): are these initializations required?
-  setup_pre_planes(xd, 0, get_ref_frame_buffer(cpi, LAST_FRAME), 0, 0, NULL);
-  setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0);
+  vp9_setup_pre_planes(xd, 0, get_ref_frame_buffer(cpi, LAST_FRAME), 0, 0,
+                       NULL);
+  vp9_setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0);
 
   vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
@@ -2774,7 +2775,8 @@
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
                                                      mbmi->ref_frame[ref]);
-      setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf);
+      vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+                           &xd->block_refs[ref]->sf);
     }
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
 
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 19dee0e..8dbd1a4 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -107,7 +107,7 @@
 
 static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, MACROBLOCK *mb,
-                       struct optimize_ctx *ctx) {
+                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *p = &mb->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
@@ -133,11 +133,6 @@
   const scan_order *so = get_scan(xd, tx_size, type, block);
   const int16_t *scan = so->scan;
   const int16_t *nb = so->neighbors;
-  ENTROPY_CONTEXT *a, *l;
-  int tx_x, tx_y;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &tx_x, &tx_y);
-  a = &ctx->ta[plane][tx_x];
-  l = &ctx->tl[plane][tx_y];
 
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
@@ -380,15 +375,17 @@
   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   int i, j;
   uint8_t *dst;
+  ENTROPY_CONTEXT *a, *l;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
+  a = &ctx->ta[plane][i];
+  l = &ctx->tl[plane][j];
 
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
   if (x->zcoeff_blk[tx_size][block] && plane == 0) {
     p->eobs[block] = 0;
-    ctx->ta[plane][i] = 0;
-    ctx->tl[plane][j] = 0;
+    *a = *l = 0;
     return;
   }
 
@@ -396,10 +393,9 @@
     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
   if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
-    optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
+    optimize_b(plane, block, plane_bsize, tx_size, x, a, l);
   } else {
-    ctx->ta[plane][i] = p->eobs[block] > 0;
-    ctx->tl[plane][j] = p->eobs[block] > 0;
+    *a = *l = p->eobs[block] > 0;
   }
 
   if (p->eobs[block])
@@ -502,9 +498,6 @@
   src = &p->src.buf[4 * (j * src_stride + i)];
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
-  // if (x->optimize)
-  //   optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
-
   switch (tx_size) {
     case TX_32X32:
       scan_order = &vp9_default_scan_orders[TX_32X32];
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 3ba4e3f..703dde3 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -17,10 +17,6 @@
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encodemv.h"
 
-#ifdef ENTROPY_STATS
-extern unsigned int active_section;
-#endif
-
 static struct vp9_token mv_joint_encodings[MV_JOINTS];
 static struct vp9_token mv_class_encodings[MV_CLASSES];
 static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index d83c435..f548de5 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -20,7 +20,7 @@
 
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_reconinter.h"  // setup_dst_planes()
+#include "vp9/common/vp9_reconinter.h"  // vp9_setup_dst_planes()
 #include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/encoder/vp9_block.h"
@@ -95,7 +95,7 @@
 // Resets the first pass file to the given position using a relative seek from
 // the current position.
 static void reset_fpf_position(struct twopass_rc *p,
-                               FIRSTPASS_STATS *position) {
+                               const FIRSTPASS_STATS *position) {
   p->stats_in = position;
 }
 
@@ -504,8 +504,8 @@
   vp9_clear_system_state();
 
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
-  setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
-  setup_dst_planes(xd, new_yv12, 0, 0);
+  vp9_setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
+  vp9_setup_dst_planes(xd, new_yv12, 0, 0);
 
   xd->mi_8x8 = cm->mi_grid_visible;
   xd->mi_8x8[0] = cm->mi;
@@ -903,7 +903,7 @@
 
 void vp9_init_second_pass(VP9_COMP *cpi) {
   FIRSTPASS_STATS this_frame;
-  FIRSTPASS_STATS *start_pos;
+  const FIRSTPASS_STATS *start_pos;
   struct twopass_rc *const twopass = &cpi->twopass;
   const VP9_CONFIG *const oxcf = &cpi->oxcf;
 
@@ -1011,7 +1011,7 @@
       loop_decay_rate >= 0.999 &&
       last_decay_rate < 0.9) {
     int j;
-    FIRSTPASS_STATS *position = cpi->twopass.stats_in;
+    const FIRSTPASS_STATS *position = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_next_frame;
 
     // Look ahead a few frames to see if static condition persists...
@@ -1346,7 +1346,7 @@
 // Analyse and define a gf/arf group.
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   FIRSTPASS_STATS next_frame = { 0 };
-  FIRSTPASS_STATS *start_pos;
+  const FIRSTPASS_STATS *start_pos;
   struct twopass_rc *const twopass = &cpi->twopass;
   int i;
   double boost_score = 0.0;
@@ -1793,19 +1793,12 @@
          ((next_frame->intra_error /
            DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
     int i;
-    FIRSTPASS_STATS *start_pos;
-
-    FIRSTPASS_STATS local_next_frame;
-
+    const FIRSTPASS_STATS *start_pos = cpi->twopass.stats_in;
+    FIRSTPASS_STATS local_next_frame = *next_frame;
     double boost_score = 0.0;
     double old_boost_score = 0.0;
     double decay_accumulator = 1.0;
 
-    local_next_frame = *next_frame;
-
-    // Note the starting file position so we can reset to it.
-    start_pos = cpi->twopass.stats_in;
-
     // Examine how well the key frame predicts subsequent frames.
     for (i = 0; i < 16; ++i) {
       double next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
@@ -1861,7 +1854,7 @@
   FIRSTPASS_STATS last_frame;
   FIRSTPASS_STATS first_frame;
   FIRSTPASS_STATS next_frame;
-  FIRSTPASS_STATS *start_position;
+  const FIRSTPASS_STATS *start_position;
 
   double decay_accumulator = 1.0;
   double zero_motion_accumulator = 1.0;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 03c0e20..9f44a30 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -43,7 +43,9 @@
   unsigned int this_iiratio;
   FIRSTPASS_STATS total_stats;
   FIRSTPASS_STATS this_frame_stats;
-  FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
+  const FIRSTPASS_STATS *stats_in;
+  const FIRSTPASS_STATS *stats_in_start;
+  const FIRSTPASS_STATS *stats_in_end;
   FIRSTPASS_STATS total_left_stats;
   int first_pass_done;
   int64_t bits_left;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 158bb68..26f1a02 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -711,23 +711,21 @@
 }
 
 int vp9_get_mvpred_var(const MACROBLOCK *x,
-                       MV *best_mv,
-                       const MV *center_mv,
+                       const MV *best_mv, const MV *center_mv,
                        const vp9_variance_fn_ptr_t *vfp,
                        int use_mvcost) {
-  unsigned int bestsad;
-  MV this_mv;
+  unsigned int unused;
+
   const MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *what = x->plane[0].src.buf;
   const int what_stride = x->plane[0].src.stride;
   const int in_what_stride = xd->plane[0].pre[0].stride;
   const uint8_t *base_offset = xd->plane[0].pre[0].buf;
-  const uint8_t *this_offset = base_offset + (best_mv->row * in_what_stride) +
-      best_mv->col;
-  this_mv.row = best_mv->row * 8;
-  this_mv.col = best_mv->col * 8;
-  return vfp->vf(what, what_stride, this_offset, in_what_stride, &bestsad) +
-      (use_mvcost ?  mv_err_cost(&this_mv, center_mv, x->nmvjointcost,
+  const uint8_t *this_offset = &base_offset[best_mv->row * in_what_stride +
+                                            best_mv->col];
+  const MV mv = {best_mv->row * 8, best_mv->col * 8};
+  return vfp->vf(what, what_stride, this_offset, in_what_stride, &unused) +
+      (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
                                  x->mvcost, x->errorperbit) : 0);
 }
 
@@ -884,6 +882,20 @@
                         center_mv, best_mv);
 }
 
+int vp9_fast_dia_search(const MACROBLOCK *x,
+                        MV *ref_mv,
+                        int search_param,
+                        int sad_per_bit,
+                        int do_init_search,
+                        const vp9_variance_fn_ptr_t *vfp,
+                        int use_mvcost,
+                        const MV *center_mv,
+                        MV *best_mv) {
+  return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
+                           sad_per_bit, do_init_search, vfp, use_mvcost,
+                           center_mv, best_mv);
+}
+
 #undef CHECK_BETTER
 
 int vp9_full_range_search_c(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 39360f1..917de75 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -38,8 +38,7 @@
 
 // Utility to compute variance + MV rate cost for a given MV
 int vp9_get_mvpred_var(const MACROBLOCK *x,
-                       MV *best_mv,
-                       const MV *center_mv,
+                       const MV *best_mv, const MV *center_mv,
                        const vp9_variance_fn_ptr_t *vfp,
                        int use_mvcost);
 int vp9_get_mvpred_av_var(const MACROBLOCK *x,
@@ -76,6 +75,7 @@
 integer_mv_pattern_search_fn vp9_bigdia_search;
 integer_mv_pattern_search_fn vp9_square_search;
 integer_mv_pattern_search_fn vp9_fast_hex_search;
+integer_mv_pattern_search_fn vp9_fast_dia_search;
 
 typedef int (fractional_mv_step_fp) (
     const MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 61881d8..57d2c78 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -872,7 +872,7 @@
   if (speed >= 7) {
     sf->partition_search_type = VAR_BASED_FIXED_PARTITION;
     sf->use_nonrd_pick_mode = 1;
-    sf->search_method = FAST_HEX;
+    sf->search_method = FAST_DIAMOND;
   }
   if (speed >= 8) {
     int i;
@@ -1879,13 +1879,12 @@
   if (cpi->pass == 1) {
     vp9_init_first_pass(cpi);
   } else if (cpi->pass == 2) {
-    size_t packet_sz = sizeof(FIRSTPASS_STATS);
-    int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
 
     cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
     cpi->twopass.stats_in = cpi->twopass.stats_in_start;
-    cpi->twopass.stats_in_end = (void *)((char *)cpi->twopass.stats_in
-                                         + (packets - 1) * packet_sz);
+    cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
     vp9_init_second_pass(cpi);
   }
 
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index a8e7b69..4ee035b 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -137,7 +137,8 @@
   HEX = 2,
   BIGDIA = 3,
   SQUARE = 4,
-  FAST_HEX = 5
+  FAST_HEX = 5,
+  FAST_DIAMOND = 6
 } SEARCH_METHODS;
 
 typedef enum {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 92409a1..c10b4f3 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -59,7 +59,7 @@
     for (i = 0; i < MAX_MB_PLANE; i++)
       backup_yv12[i] = xd->plane[i].pre[0];
 
-    setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   }
 
   vp9_set_mv_search_range(x, &ref_mv);
@@ -86,7 +86,12 @@
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  if (cpi->sf.search_method == FAST_HEX) {
+  if (cpi->sf.search_method == FAST_DIAMOND) {
+    // NOTE: this returns SAD
+    vp9_fast_dia_search(x, &mvp_full, step_param, sadpb, 0,
+                        &cpi->fn_ptr[bsize], 1,
+                        &ref_mv, &tmp_mv->as_mv);
+  } else if (cpi->sf.search_method == FAST_HEX) {
     // NOTE: this returns SAD
     vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0,
                         &cpi->fn_ptr[bsize], 1,
@@ -165,7 +170,7 @@
     for (i = 0; i < MAX_MB_PLANE; i++)
       backup_yv12[i] = xd->plane[i].pre[0];
 
-    setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   }
 
   tmp_mv->col >>= 3;
@@ -193,9 +198,9 @@
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
-  int64_t rate_sum = 0;
-  int64_t dist_sum = 0;
   unsigned int sse;
+  int rate;
+  int64_t dist;
 
 
   struct macroblock_plane *const p = &x->plane[0];
@@ -205,18 +210,11 @@
   (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
                             pd->dst.buf, pd->dst.stride, &sse);
 
-  {
-    int rate;
-    int64_t dist;
-    vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
-                                 pd->dequant[1] >> 3, &rate, &dist);
-    rate_sum += rate;
-    dist_sum += dist;
-  }
+  vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
+                               pd->dequant[1] >> 3, &rate, &dist);
 
-
-  *out_rate_sum = (int)rate_sum;
-  *out_dist_sum = dist_sum << 4;
+  *out_rate_sum = rate;
+  *out_dist_sum = dist << 4;
 }
 
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
@@ -312,6 +310,11 @@
                                 &frame_mv[NEWMV][ref_frame].as_mv);
       }
 
+      if (this_mode != NEARESTMV)
+        if (frame_mv[this_mode][ref_frame].as_int ==
+            frame_mv[NEARESTMV][ref_frame].as_int)
+          continue;
+
       mbmi->mode = this_mode;
       mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
@@ -357,5 +360,6 @@
       }
     }
   }
+
   return INT64_MAX;
 }
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 7b14fdd..0542a34 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2401,7 +2401,7 @@
     for (i = 0; i < MAX_MB_PLANE; i++)
       backup_yv12[i] = xd->plane[i].pre[0];
 
-    setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   }
 
   vp9_set_mv_search_range(x, &ref_mv);
@@ -2457,7 +2457,14 @@
   // Further step/diamond searches as necessary
   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
-  if (cpi->sf.search_method == FAST_HEX) {
+  if (cpi->sf.search_method == FAST_DIAMOND) {
+    bestsme = vp9_fast_dia_search(x, &mvp_full, step_param, sadpb, 0,
+                                  &cpi->fn_ptr[bsize], 1,
+                                  &ref_mv, &tmp_mv->as_mv);
+    if (bestsme < INT_MAX)
+      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
+                                   &cpi->fn_ptr[bsize], 1);
+  } else if (cpi->sf.search_method == FAST_HEX) {
     bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0,
                                   &cpi->fn_ptr[bsize], 1,
                                   &ref_mv, &tmp_mv->as_mv);
@@ -2557,7 +2564,8 @@
       // motion search code to be used without additional modifications.
       for (i = 0; i < MAX_MB_PLANE; i++)
         backup_yv12[ref][i] = xd->plane[i].pre[ref];
-      setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, NULL);
+      vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+                           NULL);
     }
 
     frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
diff --git a/vp9/encoder/vp9_writer.c b/vp9/encoder/vp9_writer.c
index 0697373..8398fc0 100644
--- a/vp9/encoder/vp9_writer.c
+++ b/vp9/encoder/vp9_writer.c
@@ -12,10 +12,6 @@
 #include "vp9/encoder/vp9_writer.h"
 #include "vp9/common/vp9_entropy.h"
 
-#ifdef ENTROPY_STATS
-unsigned int active_section = 0;
-#endif
-
 void vp9_start_encode(vp9_writer *br, uint8_t *source) {
   br->lowvalue = 0;
   br->range    = 255;
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 4967baf..40fb575 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -25,6 +25,7 @@
 VP9_DX_SRCS-yes += decoder/vp9_dthread.h
 VP9_DX_SRCS-yes += decoder/vp9_reader.h
 VP9_DX_SRCS-yes += decoder/vp9_reader.c
+VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c
 VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
diff --git a/vpx/exports_enc b/vpx/exports_enc
index 99b1bfa..155faf6 100644
--- a/vpx/exports_enc
+++ b/vpx/exports_enc
@@ -21,3 +21,5 @@
 text vpx_svc_set_quantizers
 text vpx_svc_set_scale_factors
 text vpx_svc_get_layer_resolution
+text vpx_svc_get_rc_stats_buffer_size
+text vpx_svc_get_rc_stats_buffer
\ No newline at end of file
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index c783724..3e22fdf 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -81,6 +81,10 @@
   size_t buffer_size;
   void *buffer;
 
+  char *rc_stats_buf;
+  size_t rc_stats_buf_size;
+  size_t rc_stats_buf_used;
+
   char message_buffer[2048];
   vpx_codec_ctx_t *codec_ctx;
 } SvcInternal;
@@ -569,7 +573,6 @@
   enc_cfg->ss_number_layers = si->layers;
   enc_cfg->ts_number_layers = 1;  // Temporal layers not used in this encoder.
   enc_cfg->kf_mode = VPX_KF_DISABLED;
-  enc_cfg->g_pass = VPX_RC_ONE_PASS;
   // Lag in frames not currently supported
   enc_cfg->g_lag_in_frames = 0;
 
@@ -851,6 +854,7 @@
 
   memset(&superframe, 0, sizeof(superframe));
   svc_log_reset(svc_ctx);
+  si->rc_stats_buf_used = 0;
 
   si->layers = svc_ctx->spatial_layers;
   if (si->frame_within_gop >= si->kf_dist ||
@@ -923,6 +927,25 @@
           }
           break;
         }
+        case VPX_CODEC_STATS_PKT: {
+          size_t new_size = si->rc_stats_buf_used +
+              cx_pkt->data.twopass_stats.sz;
+
+          if (new_size > si->rc_stats_buf_size) {
+            char *p = (char*)realloc(si->rc_stats_buf, new_size);
+            if (p == NULL) {
+              svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating stats buf\n");
+              break;
+            }
+            si->rc_stats_buf = p;
+            si->rc_stats_buf_size = new_size;
+          }
+
+          memcpy(si->rc_stats_buf + si->rc_stats_buf_used,
+                 cx_pkt->data.twopass_stats.buf, cx_pkt->data.twopass_stats.sz);
+          si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz;
+          break;
+        }
         default: {
           break;
         }
@@ -1077,7 +1100,24 @@
   si = (SvcInternal *)svc_ctx->internal;
   if (si != NULL) {
     free(si->buffer);
+    if (si->rc_stats_buf) {
+      free(si->rc_stats_buf);
+    }
     free(si);
     svc_ctx->internal = NULL;
   }
 }
+
+size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx) {
+  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+  if (svc_ctx == NULL || si == NULL) return 0;
+  return si->rc_stats_buf_used;
+}
+
+char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx) {
+  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+  if (svc_ctx == NULL || si == NULL) return NULL;
+  return si->rc_stats_buf;
+}
+
+
diff --git a/vpx/svc_context.h b/vpx/svc_context.h
index 98474ca..5d0fbbd 100644
--- a/vpx/svc_context.h
+++ b/vpx/svc_context.h
@@ -114,6 +114,17 @@
 void *vpx_svc_get_buffer(const SvcContext *svc_ctx);
 
 /**
+ * return size of two pass rate control stats data to be returned by
+ * vpx_svc_get_rc_stats_buffer
+ */
+size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx);
+
+/**
+ * return buffer two pass of rate control stats data
+ */
+char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx);
+
+/**
  * return spatial resolution of the specified layer
  */
 vpx_codec_err_t vpx_svc_get_layer_resolution(const SvcContext *svc_ctx,
diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl
index 28e168e..8c92570 100644
--- a/vpx_scale/vpx_scale_rtcd.pl
+++ b/vpx_scale/vpx_scale_rtcd.pl
@@ -23,7 +23,6 @@
 specialize qw/vp8_yv12_copy_frame neon/;
 
 add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vpx_yv12_copy_y neon/;
 
 if (vpx_config("CONFIG_VP9") eq "yes") {
     add_proto qw/void vp9_extend_frame_borders/, "struct yv12_buffer_config *ybf";