Merge "Using local variable for token_cache."
diff --git a/examples/vp9_spatial_scalable_encoder.c b/examples/vp9_spatial_scalable_encoder.c
index 5c80d34..64e62ef 100644
--- a/examples/vp9_spatial_scalable_encoder.c
+++ b/examples/vp9_spatial_scalable_encoder.c
@@ -67,13 +67,22 @@
     ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
 static const arg_def_t fpf_name_arg =
     ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
+static const arg_def_t min_q_arg =
+    ARG_DEF(NULL, "min-q", 1, "Minimum quantizer");
+static const arg_def_t max_q_arg =
+    ARG_DEF(NULL, "max-q", 1, "Maximum quantizer");
+static const arg_def_t min_bitrate_arg =
+    ARG_DEF(NULL, "min-bitrate", 1, "Minimum bitrate");
+static const arg_def_t max_bitrate_arg =
+    ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate");
 
 static const arg_def_t *svc_args[] = {
   &encoding_mode_arg, &frames_arg,        &width_arg,       &height_arg,
   &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &layers_arg,
   &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,
-  &quantizers_keyframe_arg,               &passes_arg,       &pass_arg,
-  &fpf_name_arg,      NULL
+  &quantizers_keyframe_arg,               &passes_arg,      &pass_arg,
+  &fpf_name_arg,      &min_q_arg,         &max_q_arg,       &min_bitrate_arg,
+  &max_bitrate_arg,   NULL
 };
 
 static const SVC_ENCODING_MODE default_encoding_mode =
@@ -120,6 +129,8 @@
   int passes = 0;
   int pass = 0;
   const char *fpf_file_name = NULL;
+  unsigned int min_bitrate = 0;
+  unsigned int max_bitrate = 0;
 
   // initialize SvcContext with parameters that will be passed to vpx_svc_init
   svc_ctx->log_level = SVC_LOG_DEBUG;
@@ -186,6 +197,14 @@
       }
     } else if (arg_match(&arg, &fpf_name_arg, argi)) {
       fpf_file_name = arg.val;
+    } else if (arg_match(&arg, &min_q_arg, argi)) {
+      enc_cfg->rc_min_quantizer = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &max_q_arg, argi)) {
+      enc_cfg->rc_max_quantizer = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &min_bitrate_arg, argi)) {
+      min_bitrate = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &max_bitrate_arg, argi)) {
+      max_bitrate = arg_parse_uint(&arg);
     } else {
       ++argj;
     }
@@ -221,6 +240,17 @@
     app_input->pass = pass;
   }
 
+  if (enc_cfg->rc_target_bitrate > 0) {
+    if (min_bitrate > 0) {
+      enc_cfg->rc_2pass_vbr_minsection_pct =
+          min_bitrate * 100 / enc_cfg->rc_target_bitrate;
+    }
+    if (max_bitrate > 0) {
+      enc_cfg->rc_2pass_vbr_maxsection_pct =
+          max_bitrate * 100 / enc_cfg->rc_target_bitrate;
+    }
+  }
+
   // Check for unrecognized options
   for (argi = argv; *argi; ++argi)
     if (argi[0][0] == '-' && strlen(argi[0]) > 1)
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index ad11d1c..cb75870 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -17,16 +17,15 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) {
-  const int stride = cm->mode_info_stride;
+static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) {
   int i;
 
-  // Clear down top border row
-  vpx_memset(mi, 0, sizeof(MODE_INFO) * stride);
+  // Top border row
+  vpx_memset(mi, 0, sizeof(*mi) * cm->mode_info_stride);
 
-  // Clear left border column
-  for (i = 1; i < cm->mi_rows + 1; i++)
-    vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO));
+  // Left border column
+  for (i = 1; i < cm->mi_rows + 1; ++i)
+    vpx_memset(&mi[i * cm->mode_info_stride], 0, sizeof(*mi));
 }
 
 static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
@@ -52,7 +51,7 @@
              cm->mode_info_stride * (cm->mi_rows + 1) *
              sizeof(*cm->mi_grid_base));
 
-  vp9_update_mode_info_border(cm, cm->prev_mip);
+  clear_mi_border(cm, cm->prev_mip);
 }
 
 static int alloc_mi(VP9_COMMON *cm, int mi_size) {
diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index fca6935..429e929 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -20,8 +20,6 @@
 
 void vp9_initialize_common();
 
-void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);
-
 void vp9_remove_common(VP9_COMMON *cm);
 
 int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height);
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index af8afed..e48d417 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -228,6 +228,12 @@
   }
 }
 
+static uint8_t get_filter_level(const loop_filter_info_n *lfi_n,
+                                const MB_MODE_INFO *mbmi) {
+  return lfi_n->lvl[mbmi->segment_id][mbmi->ref_frame[0]]
+                   [mode_lf_lut[mbmi->mode]];
+}
+
 void vp9_loop_filter_init(VP9_COMMON *cm) {
   loop_filter_info_n *lfi = &cm->lf_info;
   struct loopfilter *lf = &cm->lf;
@@ -493,27 +499,25 @@
                         const MODE_INFO *mi, const int shift_y,
                         const int shift_uv,
                         LOOP_FILTER_MASK *lfm) {
-  const BLOCK_SIZE block_size = mi->mbmi.sb_type;
-  const TX_SIZE tx_size_y = mi->mbmi.tx_size;
-  const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi);
-  const int skip = mi->mbmi.skip;
-  const int seg = mi->mbmi.segment_id;
-  const int ref = mi->mbmi.ref_frame[0];
-  const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]];
-  uint64_t *left_y = &lfm->left_y[tx_size_y];
-  uint64_t *above_y = &lfm->above_y[tx_size_y];
-  uint64_t *int_4x4_y = &lfm->int_4x4_y;
-  uint16_t *left_uv = &lfm->left_uv[tx_size_uv];
-  uint16_t *above_uv = &lfm->above_uv[tx_size_uv];
-  uint16_t *int_4x4_uv = &lfm->int_4x4_uv;
+  const MB_MODE_INFO *mbmi = &mi->mbmi;
+  const BLOCK_SIZE block_size = mbmi->sb_type;
+  const TX_SIZE tx_size_y = mbmi->tx_size;
+  const TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
+  const int filter_level = get_filter_level(lfi_n, mbmi);
+  uint64_t *const left_y = &lfm->left_y[tx_size_y];
+  uint64_t *const above_y = &lfm->above_y[tx_size_y];
+  uint64_t *const int_4x4_y = &lfm->int_4x4_y;
+  uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
+  uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
+  uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
   int i;
-  int w = num_8x8_blocks_wide_lookup[block_size];
-  int h = num_8x8_blocks_high_lookup[block_size];
 
   // If filter level is 0 we don't loop filter.
   if (!filter_level) {
     return;
   } else {
+    const int w = num_8x8_blocks_wide_lookup[block_size];
+    const int h = num_8x8_blocks_high_lookup[block_size];
     int index = shift_y;
     for (i = 0; i < h; i++) {
       vpx_memset(&lfm->lfl_y[index], filter_level, w);
@@ -540,7 +544,7 @@
 
   // If the block has no coefficients and is not intra we skip applying
   // the loop filter on block edges.
-  if (skip && ref > INTRA_FRAME)
+  if (mbmi->skip && is_inter_block(mbmi))
     return;
 
   // Here we are adding a mask for the transform size.  The transform
@@ -561,12 +565,11 @@
   // boundaries.  These differ from the 4x4 boundaries on the outside edge of
   // an 8x8 in that the internal ones can be skipped and don't depend on
   // the prediction block size.
-  if (tx_size_y == TX_4X4) {
+  if (tx_size_y == TX_4X4)
     *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
-  }
-  if (tx_size_uv == TX_4X4) {
+
+  if (tx_size_uv == TX_4X4)
     *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
-  }
 }
 
 // This function does the same thing as the one above with the exception that
@@ -575,22 +578,20 @@
 static void build_y_mask(const loop_filter_info_n *const lfi_n,
                          const MODE_INFO *mi, const int shift_y,
                          LOOP_FILTER_MASK *lfm) {
-  const BLOCK_SIZE block_size = mi->mbmi.sb_type;
-  const TX_SIZE tx_size_y = mi->mbmi.tx_size;
-  const int skip = mi->mbmi.skip;
-  const int seg = mi->mbmi.segment_id;
-  const int ref = mi->mbmi.ref_frame[0];
-  const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]];
-  uint64_t *left_y = &lfm->left_y[tx_size_y];
-  uint64_t *above_y = &lfm->above_y[tx_size_y];
-  uint64_t *int_4x4_y = &lfm->int_4x4_y;
+  const MB_MODE_INFO *mbmi = &mi->mbmi;
+  const BLOCK_SIZE block_size = mbmi->sb_type;
+  const TX_SIZE tx_size_y = mbmi->tx_size;
+  const int filter_level = get_filter_level(lfi_n, mbmi);
+  uint64_t *const left_y = &lfm->left_y[tx_size_y];
+  uint64_t *const above_y = &lfm->above_y[tx_size_y];
+  uint64_t *const int_4x4_y = &lfm->int_4x4_y;
   int i;
-  int w = num_8x8_blocks_wide_lookup[block_size];
-  int h = num_8x8_blocks_high_lookup[block_size];
 
   if (!filter_level) {
     return;
   } else {
+    const int w = num_8x8_blocks_wide_lookup[block_size];
+    const int h = num_8x8_blocks_high_lookup[block_size];
     int index = shift_y;
     for (i = 0; i < h; i++) {
       vpx_memset(&lfm->lfl_y[index], filter_level, w);
@@ -601,7 +602,7 @@
   *above_y |= above_prediction_mask[block_size] << shift_y;
   *left_y |= left_prediction_mask[block_size] << shift_y;
 
-  if (skip && ref > INTRA_FRAME)
+  if (mbmi->skip && is_inter_block(mbmi))
     return;
 
   *above_y |= (size_mask[block_size] &
@@ -610,9 +611,8 @@
   *left_y |= (size_mask[block_size] &
               left_64x64_txform_mask[tx_size_y]) << shift_y;
 
-  if (tx_size_y == TX_4X4) {
+  if (tx_size_y == TX_4X4)
     *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
-  }
 }
 
 // This function sets up the bit masks for the entire 64x64 region represented
@@ -868,13 +868,6 @@
   assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
 }
 
-static uint8_t build_lfi(const loop_filter_info_n *lfi_n,
-                     const MB_MODE_INFO *mbmi) {
-  const int seg = mbmi->segment_id;
-  const int ref = mbmi->ref_frame[0];
-  return lfi_n->lvl[seg][ref][mode_lf_lut[mbmi->mode]];
-}
-
 static void filter_selectively_vert(uint8_t *s, int pitch,
                                     unsigned int mask_16x16,
                                     unsigned int mask_8x8,
@@ -953,7 +946,7 @@
 
       // Filter level can vary per MI
       if (!(lfl[(r << 3) + (c >> ss_x)] =
-          build_lfi(&cm->lf_info, &mi[0].mbmi)))
+            get_filter_level(&cm->lf_info, &mi[0].mbmi)))
         continue;
 
       // Build masks based on the transform size of each block
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index a049db1..3cc12cf 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -286,12 +286,12 @@
   return cm->frame_type == KEY_FRAME || cm->intra_only;
 }
 
-static INLINE void update_partition_context(
-    PARTITION_CONTEXT *above_seg_context,
-    PARTITION_CONTEXT left_seg_context[8],
-    int mi_row, int mi_col, BLOCK_SIZE subsize, BLOCK_SIZE bsize) {
-  PARTITION_CONTEXT *const above_ctx = above_seg_context + mi_col;
-  PARTITION_CONTEXT *const left_ctx = left_seg_context + (mi_row & MI_MASK);
+static INLINE void update_partition_context(MACROBLOCKD *xd,
+                                            int mi_row, int mi_col,
+                                            BLOCK_SIZE subsize,
+                                            BLOCK_SIZE bsize) {
+  PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
+  PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
 
   // num_4x4_blocks_wide_lookup[bsize] / 2
   const int bs = num_8x8_blocks_wide_lookup[bsize];
@@ -303,12 +303,11 @@
   vpx_memset(left_ctx, partition_context_lookup[subsize].left, bs);
 }
 
-static INLINE int partition_plane_context(
-    const PARTITION_CONTEXT *above_seg_context,
-    const PARTITION_CONTEXT left_seg_context[8],
-    int mi_row, int mi_col, BLOCK_SIZE bsize) {
-  const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
-  const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
+static INLINE int partition_plane_context(const MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          BLOCK_SIZE bsize) {
+  const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
+  const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
 
   const int bsl = mi_width_log2(bsize);
   const int bs = 1 << bsl;
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 915c1c1..44951b5 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -347,6 +347,8 @@
   x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
   y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
 
+  vpx_memset(left_col, 129, 64);
+
   // left
   if (left_available) {
     if (xd->mb_to_bottom_edge < 0) {
@@ -366,8 +368,6 @@
       for (i = 0; i < bs; ++i)
         left_col[i] = ref[i * ref_stride - 1];
     }
-  } else {
-    vpx_memset(left_col, 129, bs);
   }
 
   // TODO(hkuang) do not extend 2*bs pixels for all modes.
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index e4cd9d4..b874ef3 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -636,7 +636,7 @@
 specialize qw/vp9_sad4x4x8 sse4/;
 
 add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2/;
+specialize qw/vp9_sad64x64x4d sse2 avx2/;
 
 add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad32x64x4d sse2/;
@@ -651,7 +651,7 @@
 specialize qw/vp9_sad16x32x4d sse2/;
 
 add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2/;
+specialize qw/vp9_sad32x32x4d sse2 avx2/;
 
 add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad16x16x4d sse2/;
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 30fa94c..5e1ed5c 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -407,9 +407,7 @@
 static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
                                      int mi_row, int mi_col, BLOCK_SIZE bsize,
                                      vp9_reader *r) {
-  const int ctx = partition_plane_context(xd->above_seg_context,
-                                          xd->left_seg_context,
-                                          mi_row, mi_col, bsize);
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
   const vp9_prob *const probs = get_partition_probs(cm, ctx);
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
@@ -474,8 +472,7 @@
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    update_partition_context(xd->above_seg_context, xd->left_seg_context,
-                             mi_row, mi_col, subsize, bsize);
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
 static void setup_token_decoder(const uint8_t *data,
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index a9458c4..e99b21a 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -31,10 +31,8 @@
   int width;
   int height;
   int version;
-  int postprocess;
   int max_threads;
   int inv_tile_order;
-  int input_partition;
 } VP9D_CONFIG;
 
 typedef struct VP9Decompressor {
diff --git a/vp9/encoder/vp9_craq.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
similarity index 76%
rename from vp9/encoder/vp9_craq.c
rename to vp9/encoder/vp9_aq_cyclicrefresh.c
index 40437c7..231276b 100644
--- a/vp9/encoder/vp9_craq.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -11,7 +11,7 @@
 #include <limits.h>
 #include <math.h>
 
-#include "vp9/encoder/vp9_craq.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 
 #include "vp9/common/vp9_seg_common.h"
 
@@ -21,31 +21,27 @@
 
 
 // Check if we should turn off cyclic refresh based on bitrate condition.
-static int apply_cyclic_refresh_bitrate(VP9_COMP *const cpi) {
+static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm,
+                                        const RATE_CONTROL *rc) {
   // Turn off cyclic refresh if bits available per frame is not sufficiently
   // larger than bit cost of segmentation. Segment map bit cost should scale
   // with number of seg blocks, so compare available bits to number of blocks.
   // Average bits available per frame = av_per_frame_bandwidth
   // Number of (8x8) blocks in frame = mi_rows * mi_cols;
-  float factor  = 0.5;
-  int number_blocks = cpi->common.mi_rows  * cpi->common.mi_cols;
+  const float factor  = 0.5;
+  const int number_blocks = cm->mi_rows  * cm->mi_cols;
   // The condition below corresponds to turning off at target bitrates:
   // ~24kbps for CIF, 72kbps for VGA (at 30fps).
-  if (cpi->rc.av_per_frame_bandwidth < factor * number_blocks)
-    return 0;
-  else
-    return 1;
+  return rc->av_per_frame_bandwidth >= factor * number_blocks;
 }
 
 // Check if this coding block, of size bsize, should be considered for refresh
 // (lower-qp coding). Decision can be based on various factors, such as
 // size of the coding block (i.e., below min_block size rejected), coding
 // mode, and rate/distortion.
-static int candidate_refresh_aq(VP9_COMP *const cpi,
-                                MODE_INFO *const mi,
-                                int bsize,
-                                int use_rd) {
-  CYCLIC_REFRESH *const cr = &cpi->cyclic_refresh;
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+                                const MB_MODE_INFO *mbmi,
+                                BLOCK_SIZE bsize, int use_rd) {
   if (use_rd) {
     // If projected rate is below the thresh_rate (well below target,
     // so undershoot expected), accept it for lower-qp coding.
@@ -56,18 +52,18 @@
     // 2) mode is non-zero mv and projected distortion is above thresh_dist
     // 3) mode is an intra-mode (we may want to allow some of this under
     // another thresh_dist)
-    else if ((bsize < cr->min_block_size) ||
-        (mi->mbmi.mv[0].as_int != 0 &&
-            cr->projected_dist_sb > cr->thresh_dist_sb) ||
-            !is_inter_block(&mi->mbmi))
+    else if (bsize < cr->min_block_size ||
+             (mbmi->mv[0].as_int != 0 &&
+              cr->projected_dist_sb > cr->thresh_dist_sb) ||
+             !is_inter_block(mbmi))
       return 0;
     else
       return 1;
   } else {
     // Rate/distortion not used for update.
-    if ((bsize < cr->min_block_size) ||
-      (mi->mbmi.mv[0].as_int != 0) ||
-      !is_inter_block(&mi->mbmi))
+    if (bsize < cr->min_block_size ||
+        mbmi->mv[0].as_int != 0 ||
+        !is_inter_block(mbmi))
       return 0;
     else
       return 1;
@@ -78,32 +74,32 @@
 // check if we should reset the segment_id, and update the cyclic_refresh map
 // and segmentation map.
 void vp9_update_segment_aq(VP9_COMP *const cpi,
-                           MODE_INFO *const mi,
+                           MB_MODE_INFO *const mbmi,
                            int mi_row,
                            int mi_col,
-                           int bsize,
+                           BLOCK_SIZE bsize,
                            int use_rd) {
+  const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = &cpi->cyclic_refresh;
-  VP9_COMMON *const cm = &cpi->common;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
   const int bh = num_8x8_blocks_high_lookup[bsize];
   const int xmis = MIN(cm->mi_cols - mi_col, bw);
   const int ymis = MIN(cm->mi_rows - mi_row, bh);
   const int block_index = mi_row * cm->mi_cols + mi_col;
+  const int refresh_this_block = candidate_refresh_aq(cr, mbmi, bsize, use_rd);
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
   int x = 0; int y = 0;
-  int current_segment = mi->mbmi.segment_id;
-  int refresh_this_block = candidate_refresh_aq(cpi, mi, bsize, use_rd);
+
   // Check if we should reset the segment_id for this block.
-  if (current_segment && !refresh_this_block)
-    mi->mbmi.segment_id = 0;
+  if (mbmi->segment_id > 0 && !refresh_this_block)
+    mbmi->segment_id = 0;
 
   // Update the cyclic refresh map, to be used for setting segmentation map
   // for the next frame. If the block  will be refreshed this frame, mark it
   // as clean. The magnitude of the -ve influences how long before we consider
   // it for refresh again.
-  if (mi->mbmi.segment_id == 1) {
+  if (mbmi->segment_id == 1) {
     new_map_value = -cr->time_for_refresh;
   } else if (refresh_this_block) {
     // Else if it is accepted as candidate for refresh, and has not already
@@ -121,39 +117,40 @@
     for (x = 0; x < xmis; x++) {
       cr->map[block_index + y * cm->mi_cols + x] = new_map_value;
       cpi->segmentation_map[block_index + y * cm->mi_cols + x] =
-          mi->mbmi.segment_id;
+          mbmi->segment_id;
     }
   // Keep track of actual number (in units of 8x8) of blocks in segment 1 used
   // for encoding this frame.
-  if (mi->mbmi.segment_id)
+  if (mbmi->segment_id)
     cr->num_seg_blocks += xmis * ymis;
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
 void vp9_setup_cyclic_refresh_aq(VP9_COMP *const cpi) {
   VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = &cpi->cyclic_refresh;
   struct segmentation *const seg = &cm->seg;
-  unsigned char *seg_map = cpi->segmentation_map;
-  int apply_cyclic_refresh  = apply_cyclic_refresh_bitrate(cpi);
+  unsigned char *const seg_map = cpi->segmentation_map;
+  const int apply_cyclic_refresh  = apply_cyclic_refresh_bitrate(cm, rc);
   // Don't apply refresh on key frame or enhancement layer frames.
   if (!apply_cyclic_refresh ||
-      (cpi->common.frame_type == KEY_FRAME) ||
+      (cm->frame_type == KEY_FRAME) ||
       (cpi->svc.temporal_layer_id > 0)) {
     // Set segmentation map to 0 and disable.
     vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
     vp9_disable_segmentation(&cm->seg);
-    if (cpi->common.frame_type == KEY_FRAME)
+    if (cm->frame_type == KEY_FRAME)
       cr->mb_index = 0;
     return;
   } else {
+    const int mbs_in_frame = cm->mi_rows * cm->mi_cols;
     int qindex_delta = 0;
-    int mbs_in_frame = cm->mi_rows * cm->mi_cols;
-    int i, x, y, block_count, bl_index, bl_index2;
-    int sum_map, new_value, mi_row, mi_col, xmis, ymis, qindex2;
+    int i, x, y, block_count;
+    int mi_row, mi_col, qindex2;
 
     // Rate target ratio to set q delta.
-    float rate_ratio_qdelta = 2.0;
+    const float rate_ratio_qdelta = 2.0;
     vp9_clear_system_state();
     // Some of these parameters may be set via codec-control function later.
     cr->max_mbs_perframe = 10;
@@ -161,14 +158,14 @@
     cr->min_block_size = BLOCK_16X16;
     cr->time_for_refresh = 1;
     // Set rate threshold to some fraction of target (and scaled by 256).
-    cr->thresh_rate_sb = (cpi->rc.sb64_target_rate * 256) >> 2;
+    cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 2;
     // Distortion threshold, quadratic in Q, scale factor to be adjusted.
     cr->thresh_dist_sb = 8 * (int)(vp9_convert_qindex_to_q(cm->base_qindex) *
         vp9_convert_qindex_to_q(cm->base_qindex));
     if (cpi->sf.use_nonrd_pick_mode) {
       // May want to be more conservative with thresholds in non-rd mode for now
       // as rate/distortion are derived from model based on prediction residual.
-      cr->thresh_rate_sb = (cpi->rc.sb64_target_rate * 256) >> 3;
+      cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 3;
       cr->thresh_dist_sb = 4 * (int)(vp9_convert_qindex_to_q(cm->base_qindex) *
           vp9_convert_qindex_to_q(cm->base_qindex));
     }
@@ -200,9 +197,8 @@
                                               rate_ratio_qdelta);
     // TODO(marpan): Incorporate the actual-vs-target rate over/undershoot from
     // previous encoded frame.
-    if ((-qindex_delta) > cr->max_qdelta_perc * cm->base_qindex / 100) {
+    if (-qindex_delta > cr->max_qdelta_perc * cm->base_qindex / 100)
       qindex_delta = -cr->max_qdelta_perc * cm->base_qindex / 100;
-    }
 
     // Compute rd-mult for segment 1.
     qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
@@ -238,29 +234,21 @@
     // Enforce constant segment map over superblock.
     for (mi_row = 0; mi_row < cm->mi_rows; mi_row +=  MI_BLOCK_SIZE)
       for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
-        bl_index = mi_row * cm->mi_cols + mi_col;
-        xmis = num_8x8_blocks_wide_lookup[BLOCK_64X64];
-        ymis = num_8x8_blocks_high_lookup[BLOCK_64X64];
-        xmis = MIN(cm->mi_cols - mi_col, xmis);
-        ymis = MIN(cm->mi_rows - mi_row, ymis);
-        sum_map = 0;
+        const int bl_index = mi_row * cm->mi_cols + mi_col;
+        const int xmis = MIN(cm->mi_cols - mi_col,
+                             num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+        const int ymis = MIN(cm->mi_rows - mi_row,
+                             num_8x8_blocks_high_lookup[BLOCK_64X64]);
+        int sum_map = 0;
         for (y = 0; y < ymis; y++)
-          for (x = 0; x < xmis; x++) {
-            bl_index2 = bl_index + y * cm->mi_cols + x;
-               sum_map += seg_map[bl_index2];
-          }
-        new_value = 0;
+          for (x = 0; x < xmis; x++)
+            sum_map += seg_map[bl_index + y * cm->mi_cols + x];
         // If segment is partial over superblock, reset.
         if (sum_map > 0 && sum_map < xmis * ymis) {
-          if (sum_map < xmis * ymis / 2)
-            new_value = 0;
-          else
-            new_value = 1;
+          const int new_value = (sum_map >= xmis * ymis / 2);
           for (y = 0; y < ymis; y++)
-            for (x = 0; x < xmis; x++) {
-              bl_index2 = bl_index + y * cm->mi_cols + x;
-              seg_map[bl_index2] = new_value;
-            }
+            for (x = 0; x < xmis; x++)
+              seg_map[bl_index + y * cm->mi_cols + x] = new_value;
         }
       }
   }
diff --git a/vp9/encoder/vp9_craq.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
similarity index 89%
rename from vp9/encoder/vp9_craq.h
rename to vp9/encoder/vp9_aq_cyclicrefresh.h
index fec7748..14dc2cd 100644
--- a/vp9/encoder/vp9_craq.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -9,8 +9,8 @@
  */
 
 
-#ifndef VP9_ENCODER_VP9_CRAQ_H_
-#define VP9_ENCODER_VP9_CRAQ_H_
+#ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+#define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
 
 #include "vp9/common/vp9_blockd.h"
 
@@ -51,10 +51,10 @@
 // check if we should reset the segment_id, and update the cyclic_refresh map
 // and segmentation map.
 void vp9_update_segment_aq(struct VP9_COMP *const cpi,
-                           MODE_INFO *const mi,
+                           MB_MODE_INFO *const mbmi,
                            int mi_row,
                            int mi_col,
-                           int bsize,
+                           BLOCK_SIZE bsize,
                            int use_rd);
 
 // Setup cyclic background refresh: set delta q and segmentation map.
@@ -64,4 +64,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_CRAQ_H_
+#endif  // VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_aq_variance.c
similarity index 98%
rename from vp9/encoder/vp9_vaq.c
rename to vp9/encoder/vp9_aq_variance.c
index c71c171..c25eb95 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_aq_variance.c
@@ -10,7 +10,7 @@
 
 #include <math.h>
 
-#include "vp9/encoder/vp9_vaq.h"
+#include "vp9/encoder/vp9_aq_variance.h"
 
 #include "vp9/common/vp9_seg_common.h"
 
diff --git a/vp9/encoder/vp9_vaq.h b/vp9/encoder/vp9_aq_variance.h
similarity index 86%
rename from vp9/encoder/vp9_vaq.h
rename to vp9/encoder/vp9_aq_variance.h
index c73114a..381fe50 100644
--- a/vp9/encoder/vp9_vaq.h
+++ b/vp9/encoder/vp9_aq_variance.h
@@ -9,8 +9,8 @@
  */
 
 
-#ifndef VP9_ENCODER_VP9_VAQ_H_
-#define VP9_ENCODER_VP9_VAQ_H_
+#ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#define VP9_ENCODER_VP9_AQ_VARIANCE_H_
 
 #include "vp9/encoder/vp9_onyx_int.h"
 
@@ -31,4 +31,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_VAQ_H_
+#endif  // VP9_ENCODER_VP9_AQ_VARIANCE_H_
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index d2df46b..a1db097 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -395,9 +395,7 @@
 static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int hbs, int mi_row, int mi_col,
                             PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
-  const int ctx = partition_plane_context(xd->above_seg_context,
-                                          xd->left_seg_context,
-                                          mi_row, mi_col, bsize);
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
   const vp9_prob *const probs = get_partition_probs(cm, ctx);
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
@@ -468,8 +466,7 @@
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    update_partition_context(xd->above_seg_context, xd->left_seg_context,
-                             mi_row, mi_col, subsize, bsize);
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
 static void write_modes(VP9_COMP *cpi,
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 1d13a08..7729d84 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -196,7 +196,8 @@
 // TODO(jingning): the variables used here are little complicated. need further
 // refactoring on organizing the temporary buffers, when recursive
 // partition down to 4x4 block size is enabled.
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
+static INLINE PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
+                                                   BLOCK_SIZE bsize) {
   switch (bsize) {
     case BLOCK_64X64:
       return &x->sb64_context;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 0a958c4..d8ecb30 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -30,6 +30,8 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_aq_variance.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
@@ -38,8 +40,6 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_vaq.h"
-#include "vp9/encoder/vp9_craq.h"
 
 #define GF_ZEROMV_ZBIN_BOOST 0
 #define LF_ZEROMV_ZBIN_BOOST 0
@@ -903,7 +903,8 @@
       output_enabled) {
     // Check for reseting segment_id and update cyclic map.
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && seg->enabled) {
-      vp9_update_segment_aq(cpi, xd->mi_8x8[0], mi_row, mi_col, bsize, 1);
+      vp9_update_segment_aq(cpi, &xd->mi_8x8[0]->mbmi,
+                            mi_row, mi_col, bsize, 1);
       vp9_init_plane_quantizers(cpi, x);
     }
     mi->mbmi.segment_id = xd->mi_8x8[0]->mbmi.segment_id;
@@ -1295,8 +1296,7 @@
     return;
 
   if (bsize >= BLOCK_8X8) {
-    ctx = partition_plane_context(xd->above_seg_context, xd->left_seg_context,
-                                 mi_row, mi_col, bsize);
+    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
     subsize = *get_sb_partitioning(x, bsize);
   } else {
     ctx = 0;
@@ -1351,8 +1351,7 @@
   }
 
   if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
-    update_partition_context(xd->above_seg_context, xd->left_seg_context,
-                             mi_row, mi_col, subsize, bsize);
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
 // Check to see if the given partition size is allowed for a specified number
@@ -1472,7 +1471,7 @@
 
   // Check for reseting segment_id and update cyclic map.
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && seg->enabled) {
-    vp9_update_segment_aq(cpi, xd->mi_8x8[0], mi_row, mi_col, bsize, 1);
+    vp9_update_segment_aq(cpi, &xd->mi_8x8[0]->mbmi, mi_row, mi_col, bsize, 1);
     vp9_init_plane_quantizers(cpi, x);
   }
 
@@ -1526,8 +1525,7 @@
     MACROBLOCKD *const xd = &cpi->mb.e_mbd;
     const int idx_str = xd->mode_info_stride * mi_row + mi_col;
     MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str;
-    ctx = partition_plane_context(xd->above_seg_context, xd->left_seg_context,
-                                 mi_row, mi_col, bsize);
+    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
     subsize = mi_8x8[0]->mbmi.sb_type;
   } else {
     ctx = 0;
@@ -1586,8 +1584,7 @@
   }
 
   if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
-    update_partition_context(xd->above_seg_context, xd->left_seg_context,
-                             mi_row, mi_col, subsize, bsize);
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
 static void rd_use_partition(VP9_COMP *cpi,
@@ -1672,9 +1669,7 @@
       rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
                        get_block_context(x, bsize), INT64_MAX);
 
-      pl = partition_plane_context(xd->above_seg_context,
-                                   xd->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
 
       if (none_rate < INT_MAX) {
         none_rate += x->partition_cost[pl][PARTITION_NONE];
@@ -1774,8 +1769,7 @@
       assert(0);
   }
 
-  pl = partition_plane_context(xd->above_seg_context, xd->left_seg_context,
-                               mi_row, mi_col, bsize);
+  pl = partition_plane_context(xd, mi_row, mi_col, bsize);
   if (last_part_rate < INT_MAX) {
     last_part_rate += x->partition_cost[pl][partition];
     last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist);
@@ -1828,14 +1822,11 @@
         encode_sb(cpi, tile, tp,  mi_row + y_idx, mi_col + x_idx, 0,
                   split_subsize);
 
-      pl = partition_plane_context(xd->above_seg_context,
-                                   xd->left_seg_context,
-                                   mi_row + y_idx, mi_col + x_idx,
+      pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
                                    split_subsize);
       chosen_rate += x->partition_cost[pl][PARTITION_NONE];
     }
-    pl = partition_plane_context(xd->above_seg_context, xd->left_seg_context,
-                                 mi_row, mi_col, bsize);
+    pl = partition_plane_context(xd, mi_row, mi_col, bsize);
     if (chosen_rate < INT_MAX) {
       chosen_rate += x->partition_cost[pl][PARTITION_SPLIT];
       chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist);
@@ -2110,9 +2101,7 @@
                      ctx, best_rd);
     if (this_rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
-        pl = partition_plane_context(xd->above_seg_context,
-                                     xd->left_seg_context,
-                                     mi_row, mi_col, bsize);
+        pl = partition_plane_context(xd, mi_row, mi_col, bsize);
         this_rate += x->partition_cost[pl][PARTITION_NONE];
       }
       sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
@@ -2182,9 +2171,7 @@
       }
     }
     if (sum_rd < best_rd && i == 4) {
-      pl = partition_plane_context(xd->above_seg_context,
-                                   xd->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
       sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
       if (sum_rd < best_rd) {
@@ -2240,9 +2227,7 @@
       }
     }
     if (sum_rd < best_rd) {
-      pl = partition_plane_context(xd->above_seg_context,
-                                   xd->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rate += x->partition_cost[pl][PARTITION_HORZ];
       sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
       if (sum_rd < best_rd) {
@@ -2293,9 +2278,7 @@
       }
     }
     if (sum_rd < best_rd) {
-      pl = partition_plane_context(xd->above_seg_context,
-                                   xd->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rate += x->partition_cost[pl][PARTITION_VERT];
       sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
       if (sum_rd < best_rd) {
@@ -2522,87 +2505,15 @@
   return 1;
 }
 
-static void set_txfm_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs,
-                          TX_SIZE tx_size) {
-  int x, y;
-
-  for (y = 0; y < ymbs; y++) {
-    for (x = 0; x < xmbs; x++)
-      mi_8x8[y * mis + x]->mbmi.tx_size = tx_size;
-  }
-}
-
-static void reset_skip_txfm_size_b(const VP9_COMMON *cm, int mis,
-                                   TX_SIZE max_tx_size, int bw, int bh,
-                                   int mi_row, int mi_col,
-                                   MODE_INFO **mi_8x8) {
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
-    return;
-  } else {
-    const MB_MODE_INFO *const mbmi = &mi_8x8[0]->mbmi;
-    if (mbmi->tx_size > max_tx_size) {
-      const int ymbs = MIN(bh, cm->mi_rows - mi_row);
-      const int xmbs = MIN(bw, cm->mi_cols - mi_col);
-
-      assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
-             get_skip_flag(mi_8x8, mis, ymbs, xmbs));
-      set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size);
-    }
-  }
-}
-
-static void reset_skip_txfm_size_sb(VP9_COMMON *cm, MODE_INFO **mi_8x8,
-                                    TX_SIZE max_tx_size, int mi_row, int mi_col,
-                                    BLOCK_SIZE bsize) {
-  const int mis = cm->mode_info_stride;
-  int bw, bh;
-  const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
-    return;
-
-  bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type];
-  bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
-
-  if (bw == bs && bh == bs) {
-    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, bs, mi_row, mi_col,
-                           mi_8x8);
-  } else if (bw == bs && bh < bs) {
-    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, hbs, mi_row, mi_col,
-                           mi_8x8);
-    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, hbs, mi_row + hbs,
-                           mi_col, mi_8x8 + hbs * mis);
-  } else if (bw < bs && bh == bs) {
-    reset_skip_txfm_size_b(cm, mis, max_tx_size, hbs, bs, mi_row, mi_col,
-                           mi_8x8);
-    reset_skip_txfm_size_b(cm, mis, max_tx_size, hbs, bs, mi_row,
-                           mi_col + hbs, mi_8x8 + hbs);
-  } else {
-    const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
-    int n;
-
-    assert(bw < bs && bh < bs);
-
-    for (n = 0; n < 4; n++) {
-      const int mi_dc = hbs * (n & 1);
-      const int mi_dr = hbs * (n >> 1);
-
-      reset_skip_txfm_size_sb(cm, &mi_8x8[mi_dr * mis + mi_dc], max_tx_size,
-                              mi_row + mi_dr, mi_col + mi_dc, subsize);
-    }
-  }
-}
-
 static void reset_skip_txfm_size(VP9_COMMON *cm, TX_SIZE txfm_max) {
   int mi_row, mi_col;
   const int mis = cm->mode_info_stride;
-  MODE_INFO **mi_8x8, **mi_ptr = cm->mi_grid_visible;
+  MODE_INFO **mi_ptr = cm->mi_grid_visible;
 
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) {
-    mi_8x8 = mi_ptr;
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi_8x8 += 8) {
-      reset_skip_txfm_size_sb(cm, mi_8x8, txfm_max, mi_row, mi_col,
-                              BLOCK_64X64);
+  for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
+    for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+      if (mi_ptr[mi_col]->mbmi.tx_size > txfm_max)
+        mi_ptr[mi_col]->mbmi.tx_size = txfm_max;
     }
   }
 }
@@ -2824,54 +2735,65 @@
     cpi->mb.source_variance = UINT_MAX;
 
     // Set the partition type of the 64X64 block
-    if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
-      choose_partitioning(cpi, tile, mi_row, mi_col);
-    else if (cpi->sf.partition_search_type == REFERENCE_PARTITION) {
-      if (cpi->sf.partition_check) {
-        MACROBLOCK *x = &cpi->mb;
-        int rate1 = 0, rate2 = 0, rate3 = 0;
-        int64_t dist1 = 0, dist2 = 0, dist3 = 0;
-        set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, BLOCK_8X8);
+    switch (cpi->sf.partition_search_type) {
+      case VAR_BASED_PARTITION:
+        choose_partitioning(cpi, tile, mi_row, mi_col);
         nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                            0, &rate1, &dist1);
-        set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, BLOCK_16X16);
+                            1, &dummy_rate, &dummy_dist);
+        break;
+      case VAR_BASED_FIXED_PARTITION:
+      case FIXED_PARTITION:
+        set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
         nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                            0, &rate2, &dist2);
-        set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, BLOCK_32X32);
-        nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                            0, &rate3, &dist3);
+                            1, &dummy_rate, &dummy_dist);
+        break;
+      case REFERENCE_PARTITION:
+        if (cpi->sf.partition_check) {
+          MACROBLOCK *x = &cpi->mb;
+          int rate1 = 0, rate2 = 0, rate3 = 0;
+          int64_t dist1 = 0, dist2 = 0, dist3 = 0;
+          set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, BLOCK_8X8);
+          nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col,
+                              BLOCK_64X64, 0, &rate1, &dist1);
+          set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
+                                 BLOCK_16X16);
+          nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col,
+                              BLOCK_64X64, 0, &rate2, &dist2);
+          set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
+                                 BLOCK_32X32);
+          nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col,
+                              BLOCK_64X64, 0, &rate3, &dist3);
 
-        if (RDCOST(x->rdmult, x->rddiv, rate1, dist1) <
-            RDCOST(x->rdmult, x->rddiv, rate2, dist2)) {
           if (RDCOST(x->rdmult, x->rddiv, rate1, dist1) <
-              RDCOST(x->rdmult, x->rddiv, rate3, dist3))
-            set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
-                                   BLOCK_8X8);
-          else
-            set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
-                                   BLOCK_32X32);
+              RDCOST(x->rdmult, x->rddiv, rate2, dist2)) {
+            if (RDCOST(x->rdmult, x->rddiv, rate1, dist1) <
+                RDCOST(x->rdmult, x->rddiv, rate3, dist3))
+              set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
+                                     BLOCK_8X8);
+            else
+              set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
+                                     BLOCK_32X32);
+          } else {
+            if (RDCOST(x->rdmult, x->rddiv, rate2, dist2) <
+                RDCOST(x->rdmult, x->rddiv, rate3, dist3))
+              set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
+                                     BLOCK_16X16);
+            else
+              set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
+                                     BLOCK_32X32);
+          }
         } else {
-          if (RDCOST(x->rdmult, x->rddiv, rate2, dist2) <
-              RDCOST(x->rdmult, x->rddiv, rate3, dist3))
-            set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
-                                   BLOCK_16X16);
+          if (!sb_has_motion(cm, prev_mi_8x8))
+            copy_partitioning(cm, mi_8x8, prev_mi_8x8);
           else
-            set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
-                                   BLOCK_32X32);
+            set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
         }
-
-      } else {
-        if (!sb_has_motion(cm, prev_mi_8x8))
-          copy_partitioning(cm, mi_8x8, prev_mi_8x8);
-        else
-          set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
-      }
+        nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+                            1, &dummy_rate, &dummy_dist);
+        break;
+      default:
+        assert(0);
     }
-    else
-      set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
-
-    nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, 1,
-                        &dummy_rate, &dummy_dist);
   }
 }
 // end RTC play code
@@ -3227,7 +3149,10 @@
   x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
                    (cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
                     cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ) &&
-                   !cpi->sf.use_nonrd_pick_mode;
+                   !cpi->sf.use_nonrd_pick_mode &&
+                   !cpi->sf.use_uv_intra_rd_estimate &&
+                   !cpi->sf.skip_encode_sb;
+
   x->skip_optimize = ctx->is_coded;
   ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index c4c219b..61ca996 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -23,6 +23,7 @@
 #include "vp9/common/vp9_reconinter.h"  // vp9_setup_dst_planes()
 #include "vp9/common/vp9_systemdependent.h"
 
+#include "vp9/encoder/vp9_aq_variance.h"
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
@@ -34,7 +35,6 @@
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_vaq.h"
 #include "vp9/encoder/vp9_variance.h"
 
 #define OUTPUT_FPF 0
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 2ae8a2a..e0299f0 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -23,6 +23,11 @@
 
 // #define NEW_DIAMOND_SEARCH
 
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV *mv) {
+  return &buf->buf[mv->row * buf->stride + mv->col];
+}
+
 void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
   int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
   int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
@@ -370,9 +375,9 @@
   unsigned int sse;
   unsigned int whichdir;
   int thismse;
-  unsigned int halfiters = iters_per_step;
-  unsigned int quarteriters = iters_per_step;
-  unsigned int eighthiters = iters_per_step;
+  const unsigned int halfiters = iters_per_step;
+  const unsigned int quarteriters = iters_per_step;
+  const unsigned int eighthiters = iters_per_step;
 
   DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
   const int y_stride = xd->plane[0].pre[0].stride;
@@ -399,7 +404,7 @@
   // calculate central point error
   // TODO(yunqingwang): central pointer error was already calculated in full-
   // pixel search, and can be passed in this function.
-  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+  vp9_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
   besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
@@ -736,7 +741,6 @@
                           const vp9_variance_fn_ptr_t *vfp,
                           int use_mvcost) {
   unsigned int bestsad;
-  MV this_mv;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *what = x->plane[0].src.buf;
   const int what_stride = x->plane[0].src.stride;
@@ -744,8 +748,7 @@
   const uint8_t *base_offset = xd->plane[0].pre[0].buf;
   const uint8_t *this_offset = base_offset + (best_mv->row * in_what_stride) +
       best_mv->col;
-  this_mv.row = best_mv->row * 8;
-  this_mv.col = best_mv->col * 8;
+  const MV this_mv = {best_mv->row * 8, best_mv->col * 8};
   return vfp->svaf(this_offset, in_what_stride, 0, 0, what, what_stride,
                    &bestsad, second_pred) +
       (use_mvcost ?  mv_err_cost(&this_mv, center_mv, x->nmvjointcost,
@@ -908,7 +911,6 @@
   const int what_stride = x->plane[0].src.stride;
   const uint8_t *in_what;
   const int in_what_stride = xd->plane[0].pre[0].stride;
-  MV this_mv;
 
   unsigned int bestsad = INT_MAX;
   int ref_row, ref_col;
@@ -960,8 +962,7 @@
 
         for (i = 0; i < 4; ++i) {
           if (sad_array[i] < bestsad) {
-            this_mv.row = ref_row + tr;
-            this_mv.col = ref_col + tc + i;
+            const MV this_mv = {ref_row + tr, ref_col + tc + i};
             thissad = sad_array[i] +
                       mvsad_err_cost(&this_mv, &fcenter_mv,
                                       mvjsadcost, mvsadcost, sad_per_bit);
@@ -979,8 +980,7 @@
                                 bestsad);
 
           if (thissad < bestsad) {
-            this_mv.row = ref_row + tr;
-            this_mv.col = ref_col + tc + i;
+            const MV this_mv = {ref_row + tr, ref_col + tc + i};
             thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
                                       mvjsadcost, mvsadcost, sad_per_bit);
 
@@ -1331,10 +1331,8 @@
                           const MV *center_mv, MV *best_mv) {
   int r, c;
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *const what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *const in_what = xd->plane[0].pre[0].buf;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
   const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
   const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
@@ -1342,25 +1340,22 @@
   const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-  const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
-                                         ref_mv->col];
-  int best_sad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride,
-                             0x7fffffff) +
+  int best_sad = fn_ptr->sdf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit);
   *best_mv = *ref_mv;
 
   for (r = row_min; r < row_max; ++r) {
     for (c = col_min; c < col_max; ++c) {
-      const MV this_mv = {r, c};
-      const uint8_t *check_here = &in_what[r * in_what_stride + c];
-      const int sad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                                  best_sad) +
-          mvsad_err_cost(&this_mv, &fcenter_mv,
-                         mvjsadcost, mvsadcost, sad_per_bit);
+      const MV mv = {r, c};
+      const int sad = fn_ptr->sdf(what->buf, what->stride,
+          get_buf_from_mv(in_what, &mv), in_what->stride, best_sad) +
+          mvsad_err_cost(&mv, &fcenter_mv, mvjsadcost, mvsadcost,
+                         sad_per_bit);
 
       if (sad < best_sad) {
         best_sad = sad;
-        *best_mv = this_mv;
+        *best_mv = mv;
       }
     }
   }
@@ -1472,7 +1467,6 @@
   MV this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
-  unsigned int thissad;
   int ref_row = ref_mv->row;
   int ref_col = ref_mv->col;
 
@@ -1512,7 +1506,7 @@
       fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
 
       for (i = 0; i < 8; i++) {
-        thissad = (unsigned int)sad_array8[i];
+        unsigned int thissad = (unsigned int)sad_array8[i];
 
         if (thissad < bestsad) {
           this_mv.col = c;
@@ -1537,12 +1531,12 @@
       fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
 
       for (i = 0; i < 3; i++) {
-        thissad = sad_array[i];
+        unsigned int thissad = sad_array[i];
 
         if (thissad < bestsad) {
           this_mv.col = c;
-          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                     mvjsadcost, mvsadcost, sad_per_bit);
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                    mvjsadcost, mvsadcost, sad_per_bit);
 
           if (thissad < bestsad) {
             bestsad = thissad;
@@ -1557,8 +1551,8 @@
     }
 
     while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                            bestsad);
+      unsigned int thissad = fn_ptr->sdf(what, what_stride,
+                                         check_here, in_what_stride, bestsad);
 
       if (thissad < bestsad) {
         this_mv.col = c;
@@ -1585,41 +1579,34 @@
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               int *mvjcost, int *mvcost[2],
                               const MV *center_mv) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
   const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
-  int i, j;
-
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *const what = x->plane[0].src.buf;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  const uint8_t *const in_what = xd->plane[0].pre[0].buf;
-  const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
-                                             ref_mv->col];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  unsigned int bestsad = fn_ptr->sdf(what, what_stride, best_address,
-                                     in_what_stride, 0x7fffffff) +
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
+                                     get_buf_from_mv(in_what, ref_mv),
+                                     in_what->stride, 0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+  int i, j;
 
   for (i = 0; i < search_range; i++) {
     int best_site = -1;
 
     for (j = 0; j < 4; j++) {
-      const MV this_mv = {ref_mv->row + neighbors[j].row,
-                          ref_mv->col + neighbors[j].col};
-      if (is_mv_in(x, &this_mv)) {
-        const uint8_t *check_here = &in_what[this_mv.row * in_what_stride +
-                                                this_mv.col];
-        unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                           in_what_stride, bestsad);
-        if (thissad < bestsad) {
-          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                    mvjsadcost, mvsadcost, error_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
+      const MV mv = {ref_mv->row + neighbors[j].row,
+                     ref_mv->col + neighbors[j].col};
+      if (is_mv_in(x, &mv)) {
+        unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+            get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(&mv, &fcenter_mv, mvjsadcost, mvsadcost,
+                                error_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
             best_site = j;
           }
         }
@@ -1633,7 +1620,7 @@
       ref_mv->col += neighbors[best_site].col;
     }
   }
-  return bestsad;
+  return best_sad;
 }
 
 int vp9_refining_search_sadx4(const MACROBLOCK *x,
@@ -1702,8 +1689,9 @@
         if (is_mv_in(x, &this_mv)) {
           const uint8_t *check_here = neighbors[j].row * in_what_stride +
                                       neighbors[j].col + best_address;
-          unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                             in_what_stride, bestsad);
+          unsigned int thissad = fn_ptr->sdf(what, what_stride,
+                                             check_here, in_what_stride,
+                                             bestsad);
 
           if (thissad < bestsad) {
             thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
@@ -1740,48 +1728,36 @@
                              int *mvjcost, int *mvcost[2],
                              const MV *center_mv,
                              const uint8_t *second_pred, int w, int h) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
   const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
                            {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
-  int i, j;
-
-  const uint8_t *what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *in_what = xd->plane[0].pre[0].buf;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
-                                             ref_mv->col];
-  unsigned int thissad;
-  MV this_mv;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-
   const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  /* Get compound pred by averaging two pred blocks. */
-  unsigned int bestsad = fn_ptr->sdaf(what, what_stride,
-                                      best_address, in_what_stride,
-                                      second_pred, 0x7fffffff) +
+  unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride,
+      second_pred, 0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+  int i, j;
 
   for (i = 0; i < search_range; ++i) {
     int best_site = -1;
 
-    for (j = 0; j < 8; j++) {
-      this_mv.row = ref_mv->row + neighbors[j].row;
-      this_mv.col = ref_mv->col + neighbors[j].col;
+    for (j = 0; j < 8; ++j) {
+      const MV mv = {ref_mv->row + neighbors[j].row,
+                     ref_mv->col + neighbors[j].col};
 
-      if (is_mv_in(x, &this_mv)) {
-        const uint8_t *check_here = &in_what[this_mv.row * in_what_stride +
-                                                this_mv.col];
-
-        thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
-                               second_pred, bestsad);
-        if (thissad < bestsad) {
-          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+      if (is_mv_in(x, &mv)) {
+        unsigned int sad = fn_ptr->sdaf(what->buf, what->stride,
+            get_buf_from_mv(in_what, &mv), in_what->stride,
+            second_pred, best_sad);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(&mv, &fcenter_mv,
                                     mvjsadcost, mvsadcost, error_per_bit);
-          if (thissad < bestsad) {
-            bestsad = thissad;
+          if (sad < best_sad) {
+            best_sad = sad;
             best_site = j;
           }
         }
@@ -1795,5 +1771,5 @@
       ref_mv->col += neighbors[best_site].col;
     }
   }
-  return bestsad;
+  return best_sad;
 }
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 02ba216..4eabc2f 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -27,8 +27,9 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_aq_variance.h"
 #include "vp9/encoder/vp9_bitstream.h"
-#include "vp9/encoder/vp9_craq.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_firstpass.h"
@@ -39,7 +40,6 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_temporal_filter.h"
-#include "vp9/encoder/vp9_vaq.h"
 #include "vp9/encoder/vp9_resize.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 
@@ -840,7 +840,7 @@
   if (speed >= 4) {
     sf->optimize_coefficients = 0;
     sf->disable_split_mask = DISABLE_ALL_SPLIT;
-    sf->use_fast_lpf_pick = 2;
+    sf->lpf_pick = LPF_PICK_FROM_Q;
     sf->encode_breakout_thresh = 700;
   }
   if (speed >= 5) {
@@ -939,7 +939,7 @@
   sf->use_rd_breakout = 0;
   sf->skip_encode_sb = 0;
   sf->use_uv_intra_rd_estimate = 0;
-  sf->use_fast_lpf_pick = 0;
+  sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
   sf->use_fast_coef_updates = 0;
   sf->use_fast_coef_costing = 0;
   sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
@@ -1202,9 +1202,39 @@
   cm->log2_tile_rows = cpi->oxcf.tile_rows;
 }
 
+static void init_rate_control(const VP9_CONFIG *oxcf, int pass,
+                              RATE_CONTROL *rc) {
+  if (pass == 0 && oxcf->end_usage == USAGE_STREAM_FROM_SERVER) {
+    rc->avg_frame_qindex[0] = oxcf->worst_allowed_q;
+    rc->avg_frame_qindex[1] = oxcf->worst_allowed_q;
+    rc->avg_frame_qindex[2] = oxcf->worst_allowed_q;
+  } else {
+    rc->avg_frame_qindex[0] = (oxcf->worst_allowed_q +
+                                   oxcf->best_allowed_q) / 2;
+    rc->avg_frame_qindex[1] = (oxcf->worst_allowed_q +
+                                   oxcf->best_allowed_q) / 2;
+    rc->avg_frame_qindex[2] = (oxcf->worst_allowed_q +
+                                   oxcf->best_allowed_q) / 2;
+  }
+
+  rc->last_q[0] = oxcf->best_allowed_q;
+  rc->last_q[1] = oxcf->best_allowed_q;
+  rc->last_q[2] = oxcf->best_allowed_q;
+
+  rc->buffer_level =    oxcf->starting_buffer_level;
+  rc->bits_off_target = oxcf->starting_buffer_level;
+
+  rc->rolling_target_bits      = rc->av_per_frame_bandwidth;
+  rc->rolling_actual_bits      = rc->av_per_frame_bandwidth;
+  rc->long_rolling_target_bits = rc->av_per_frame_bandwidth;
+  rc->long_rolling_actual_bits = rc->av_per_frame_bandwidth;
+
+  rc->total_actual_bits = 0;
+  rc->total_target_vs_actual = 0;
+}
+
 static void init_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
-  RATE_CONTROL *const rc = &cpi->rc;
   int i;
 
   cpi->oxcf = *oxcf;
@@ -1230,35 +1260,6 @@
   // change includes all joint functionality
   vp9_change_config(cpi, oxcf);
 
-  // Initialize active best and worst q and average q values.
-  if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
-    rc->avg_frame_qindex[0] = cpi->oxcf.worst_allowed_q;
-    rc->avg_frame_qindex[1] = cpi->oxcf.worst_allowed_q;
-    rc->avg_frame_qindex[2] = cpi->oxcf.worst_allowed_q;
-  } else {
-    rc->avg_frame_qindex[0] = (cpi->oxcf.worst_allowed_q +
-                                  cpi->oxcf.best_allowed_q) / 2;
-    rc->avg_frame_qindex[1] = (cpi->oxcf.worst_allowed_q +
-                                  cpi->oxcf.best_allowed_q) / 2;
-    rc->avg_frame_qindex[2] = (cpi->oxcf.worst_allowed_q +
-                                  cpi->oxcf.best_allowed_q) / 2;
-  }
-  rc->last_q[0] = cpi->oxcf.best_allowed_q;
-  rc->last_q[1] = cpi->oxcf.best_allowed_q;
-  rc->last_q[2] = cpi->oxcf.best_allowed_q;
-
-  // Initialise the starting buffer levels
-  rc->buffer_level    = cpi->oxcf.starting_buffer_level;
-  rc->bits_off_target = cpi->oxcf.starting_buffer_level;
-
-  rc->rolling_target_bits      = rc->av_per_frame_bandwidth;
-  rc->rolling_actual_bits      = rc->av_per_frame_bandwidth;
-  rc->long_rolling_target_bits = rc->av_per_frame_bandwidth;
-  rc->long_rolling_actual_bits = rc->av_per_frame_bandwidth;
-
-  rc->total_actual_bits = 0;
-  rc->total_target_vs_actual = 0;
-
   cpi->static_mb_pct = 0;
 
   cpi->lst_fb_idx = 0;
@@ -1272,15 +1273,11 @@
     cpi->fixed_divide[i] = 0x80000 / i;
 }
 
-void vp9_change_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
+void vp9_change_config(struct VP9_COMP *cpi, const VP9_CONFIG *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
 
-  if (!cpi || !oxcf)
-    return;
-
-  if (cm->version != oxcf->version) {
+  if (cm->version != oxcf->version)
     cm->version = oxcf->version;
-  }
 
   cpi->oxcf = *oxcf;
 
@@ -1631,6 +1628,7 @@
   cpi->use_svc = 0;
 
   init_config(cpi, oxcf);
+  init_rate_control(&cpi->oxcf, cpi->pass, &cpi->rc);
   init_pick_mode_context(cpi);
 
   cm->current_video_frame = 0;
@@ -2518,7 +2516,7 @@
 
     vpx_usec_timer_start(&timer);
 
-    vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.use_fast_lpf_pick);
+    vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick);
 
     vpx_usec_timer_mark(&timer);
     cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 2aa67ae..022a63f 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -23,7 +23,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-#include "vp9/encoder/vp9_craq.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_lookahead.h"
@@ -210,6 +210,15 @@
   VAR_BASED_PARTITION
 } PARTITION_SEARCH_TYPE;
 
+typedef enum {
+  // Try the full image with different values.
+  LPF_PICK_FROM_FULL_IMAGE,
+  // Try a small portion of the image with different values.
+  LPF_PICK_FROM_SUBIMAGE,
+  // Estimate the level based on quantizer and frame type
+  LPF_PICK_FROM_Q,
+} LPF_PICK_METHOD;
+
 typedef struct {
   // Frame level coding parameter update
   int frame_parameter_update;
@@ -380,11 +389,8 @@
   // final encode.
   int use_uv_intra_rd_estimate;
 
-  // This feature controls how the loop filter level is determined:
-  // 0: Try the full image with different values.
-  // 1: Try a small portion of the image with different values.
-  // 2: Estimate the level based on quantizer and frame type
-  int use_fast_lpf_pick;
+  // This feature controls how the loop filter level is determined.
+  LPF_PICK_METHOD lpf_pick;
 
   // This feature limits the number of coefficients updates we actually do
   // by only looking at counts from 1/2 the bands.
@@ -774,7 +780,6 @@
   unsigned int activity_avg;
   unsigned int *mb_activity_map;
   int *mb_norm_activity_map;
-  int output_partition;
 
   // Force next frame to intra when kf_auto says so.
   int force_next_frame_intra;
@@ -820,7 +825,7 @@
 struct VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf);
 void vp9_remove_compressor(VP9_COMP *cpi);
 
-void vp9_change_config(VP9_COMP *cpi, VP9_CONFIG *oxcf);
+void vp9_change_config(VP9_COMP *cpi, const VP9_CONFIG *oxcf);
 
   // receive a frames worth of data. caller can assume that a copy of this
   // frame is made and not just a copy of the pointer..
@@ -869,8 +874,8 @@
 
 int vp9_get_quantizer(struct VP9_COMP *cpi);
 
-static int get_ref_frame_idx(const VP9_COMP *cpi,
-                             MV_REFERENCE_FRAME ref_frame) {
+static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
+                                    MV_REFERENCE_FRAME ref_frame) {
   if (ref_frame == LAST_FRAME) {
     return cpi->lst_fb_idx;
   } else if (ref_frame == GOLDEN_FRAME) {
@@ -880,15 +885,25 @@
   }
 }
 
-static YV12_BUFFER_CONFIG *get_ref_frame_buffer(VP9_COMP *cpi,
-                                                MV_REFERENCE_FRAME ref_frame) {
-  VP9_COMMON *const cm = &cpi->common;
-  return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi,
-                                                             ref_frame)]].buf;
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+    VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  VP9_COMMON * const cm = &cpi->common;
+  return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
+      .buf;
 }
 
 void vp9_set_speed_features(VP9_COMP *cpi);
 
+static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
+  // TODO(JBB): make this work for alpha channel and double check we can't
+  // exceed this token count if we have a 32x32 transform crossing a boundary
+  // at a multiple of 16.
+  // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full
+  // resolution. We assume up to 1 token per pixel, and then allow
+  // a head room of 4.
+  return mb_rows * mb_cols * (16 * 16 * 3 + 4);
+}
+
 int vp9_calc_ss_err(const YV12_BUFFER_CONFIG *source,
                     const YV12_BUFFER_CONFIG *reference);
 
@@ -903,16 +918,13 @@
 
 void vp9_update_reference_frames(VP9_COMP *cpi);
 
-static int get_token_alloc(int mb_rows, int mb_cols) {
-  return mb_rows * mb_cols * (48 * 16 + 4);
-}
-
 extern const int q_trans[];
 
 int64_t vp9_rescale(int64_t val, int64_t num, int denom);
 
-static void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
-                         MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) {
+static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
   xd->block_refs[0] = &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME
                                                          : 0];
   xd->block_refs[1] = &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index b5f4901..92ad1e7 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -10,16 +10,18 @@
 
 #include <assert.h>
 #include <limits.h>
+
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_quant_common.h"
+
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "./vpx_scale_rtcd.h"
 
 static int get_max_filter_level(VP9_COMP *cpi) {
   return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
@@ -28,11 +30,11 @@
 
 
 static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
-                            MACROBLOCKD *const xd, VP9_COMMON *const cm,
                             int filt_level, int partial_frame) {
+  VP9_COMMON *const cm = &cpi->common;
   int filt_err;
 
-  vp9_loop_filter_frame(cm, xd, filt_level, 1, partial_frame);
+  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_level, 1, partial_frame);
   filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
   // Re-instate the unfiltered frame
@@ -43,7 +45,6 @@
 
 static void search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
                                 int partial_frame) {
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   VP9_COMMON *const cm = &cpi->common;
   struct loopfilter *const lf = &cm->lf;
   const int min_filter_level = 0;
@@ -64,7 +65,7 @@
   //  Make a copy of the unfiltered / processed recon buffer
   vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
 
-  best_err = try_filter_frame(sd, cpi, xd, cm, filt_mid, partial_frame);
+  best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame);
   filt_best = filt_mid;
   ss_err[filt_mid] = best_err;
 
@@ -86,7 +87,7 @@
     if (filt_direction <= 0 && filt_low != filt_mid) {
       // Get Low filter error score
       if (ss_err[filt_low] < 0) {
-        filt_err = try_filter_frame(sd, cpi, xd, cm, filt_low, partial_frame);
+        filt_err = try_filter_frame(sd, cpi, filt_low, partial_frame);
         ss_err[filt_low] = filt_err;
       } else {
         filt_err = ss_err[filt_low];
@@ -105,7 +106,7 @@
     // Now look at filt_high
     if (filt_direction >= 0 && filt_high != filt_mid) {
       if (ss_err[filt_high] < 0) {
-        filt_err = try_filter_frame(sd, cpi, xd, cm, filt_high, partial_frame);
+        filt_err = try_filter_frame(sd, cpi, filt_high, partial_frame);
         ss_err[filt_high] = filt_err;
       } else {
         filt_err = ss_err[filt_high];
@@ -119,7 +120,7 @@
 
     // Half the step distance if the best filter value was the same as last time
     if (filt_best == filt_mid) {
-      filter_step = filter_step / 2;
+      filter_step /= 2;
       filt_direction = 0;
     } else {
       filt_direction = (filt_best < filt_mid) ? -1 : 1;
@@ -131,25 +132,24 @@
 }
 
 void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
-                           int method) {
+                           LPF_PICK_METHOD method) {
   VP9_COMMON *const cm = &cpi->common;
   struct loopfilter *const lf = &cm->lf;
 
   lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
                                                     : cpi->oxcf.sharpness;
 
-  if (method == 2) {
+  if (method == LPF_PICK_FROM_Q) {
     const int min_filter_level = 0;
     const int max_filter_level = get_max_filter_level(cpi);
     const int q = vp9_ac_quant(cm->base_qindex, 0);
     // These values were determined by linear fitting the result of the
-    // searched level
-    // filt_guess = q * 0.316206 + 3.87252
-    int filt_guess = (q * 20723 + 1015158 + (1 << 17)) >> 18;
+    // searched level, filt_guess = q * 0.316206 + 3.87252
+    int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
     if (cm->frame_type == KEY_FRAME)
       filt_guess -= 4;
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
   } else {
-    search_filter_level(sd, cpi, method == 1);
+    search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE);
   }
 }
diff --git a/vp9/encoder/vp9_picklpf.h b/vp9/encoder/vp9_picklpf.h
index 203ef87..7d08ddb 100644
--- a/vp9/encoder/vp9_picklpf.h
+++ b/vp9/encoder/vp9_picklpf.h
@@ -16,11 +16,13 @@
 extern "C" {
 #endif
 
+#include "vp9/encoder/vp9_onyx_int.h"
+
 struct yv12_buffer_config;
 struct VP9_COMP;
 
 void vp9_pick_filter_level(const struct yv12_buffer_config *sd,
-                           struct VP9_COMP *cpi, int method);
+                           struct VP9_COMP *cpi, LPF_PICK_METHOD method);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 8b2a894..01bcbca 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -230,6 +230,12 @@
                                            intra_cost_penalty, 0);
   const int64_t intra_mode_cost = 50;
 
+  unsigned char segment_id = mbmi->segment_id;
+  const int *const rd_threshes = cpi->rd_threshes[segment_id][bsize];
+  const int *const rd_thresh_freq_fact = cpi->rd_thresh_freq_fact[bsize];
+  // Mode index conversion form THR_MODES to MB_PREDICTION_MODE for a ref frame.
+  int mode_idx[MB_MODE_COUNT] = {0};
+
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
   x->skip = 0;
@@ -273,6 +279,14 @@
 
     mbmi->ref_frame[0] = ref_frame;
 
+    // Set conversion index for LAST_FRAME.
+    if (ref_frame == LAST_FRAME) {
+      mode_idx[NEARESTMV] = THR_NEARESTMV;   // LAST_FRAME, NEARESTMV
+      mode_idx[NEARMV] = THR_NEARMV;         // LAST_FRAME, NEARMV
+      mode_idx[ZEROMV] = THR_ZEROMV;         // LAST_FRAME, ZEROMV
+      mode_idx[NEWMV] = THR_NEWMV;           // LAST_FRAME, NEWMV
+    }
+
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
       int rate_mv = 0;
 
@@ -280,6 +294,11 @@
           (1 << INTER_OFFSET(this_mode)))
         continue;
 
+      if (best_rd < ((int64_t)rd_threshes[mode_idx[this_mode]] *
+          rd_thresh_freq_fact[this_mode] >> 5) ||
+          rd_threshes[mode_idx[this_mode]] == INT_MAX)
+        continue;
+
       if (this_mode == NEWMV) {
         if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c
index 58c5df4..9d8da0d 100644
--- a/vp9/encoder/vp9_sad.c
+++ b/vp9/encoder/vp9_sad.c
@@ -44,7 +44,7 @@
                                       const uint8_t *second_pred, \
                                       unsigned int max_sad) { \
   uint8_t comp_pred[m * n]; \
-  comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
+  vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
   return sad(src_ptr, src_stride, comp_pred, m, m, n); \
 }
 
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index eba7bc6..4ccc2bd 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -15,13 +15,14 @@
 
 void vp9_init_layer_context(VP9_COMP *const cpi) {
   const VP9_CONFIG *const oxcf = &cpi->oxcf;
-  int temporal_layer = 0;
+  int layer;
+
   cpi->svc.spatial_layer_id = 0;
   cpi->svc.temporal_layer_id = 0;
-  for (temporal_layer = 0; temporal_layer < cpi->svc.number_temporal_layers;
-      ++temporal_layer) {
-    LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer];
+  for (layer = 0; layer < cpi->svc.number_temporal_layers; ++layer) {
+    LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer];
     RATE_CONTROL *const lrc = &lc->rc;
+
     lrc->avg_frame_qindex[INTER_FRAME] = q_trans[oxcf->worst_allowed_q];
     lrc->last_q[INTER_FRAME] = q_trans[oxcf->worst_allowed_q];
     lrc->ni_av_qi = q_trans[oxcf->worst_allowed_q];
@@ -35,11 +36,9 @@
     lrc->decimation_factor = 0;
     lrc->rate_correction_factor = 1.0;
     lrc->key_frame_rate_correction_factor = 1.0;
-    lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] *
-        1000;
-    lrc->buffer_level =
-        vp9_rescale((int)(oxcf->starting_buffer_level),
-                    lc->target_bandwidth, 1000);
+    lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000;
+    lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level),
+                                    lc->target_bandwidth, 1000);
     lrc->bits_off_target = lrc->buffer_level;
   }
 }
@@ -49,14 +48,14 @@
                                             const int target_bandwidth) {
   const VP9_CONFIG *const oxcf = &cpi->oxcf;
   const RATE_CONTROL *const rc = &cpi->rc;
-  int temporal_layer = 0;
+  int layer;
   float bitrate_alloc = 1.0;
-  for (temporal_layer = 0; temporal_layer < cpi->svc.number_temporal_layers;
-      ++temporal_layer) {
-    LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer];
+
+  for (layer = 0; layer < cpi->svc.number_temporal_layers; ++layer) {
+    LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer];
     RATE_CONTROL *const lrc = &lc->rc;
-    lc->target_bandwidth = oxcf->ts_target_bitrate[temporal_layer] * 1000;
-    bitrate_alloc = (float)lc->target_bandwidth / (float)target_bandwidth;
+    lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000;
+    bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
     // Update buffer-related quantities.
     lc->starting_buffer_level =
         (int64_t)(oxcf->starting_buffer_level * bitrate_alloc);
@@ -67,7 +66,7 @@
     lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
     lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size);
     // Update framerate-related quantities.
-    lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[temporal_layer];
+    lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
     lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
     lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
     // Update qp-related quantities.
@@ -76,22 +75,27 @@
   }
 }
 
+static LAYER_CONTEXT *get_temporal_layer_context(SVC *svc) {
+  return &svc->layer_context[svc->temporal_layer_id];
+}
+
 void vp9_update_layer_framerate(VP9_COMP *const cpi) {
-  int temporal_layer = cpi->svc.temporal_layer_id;
+  const int layer = cpi->svc.temporal_layer_id;
   const VP9_CONFIG *const oxcf = &cpi->oxcf;
-  LAYER_CONTEXT *const lc = &cpi->svc.layer_context[temporal_layer];
+  LAYER_CONTEXT *const lc = get_temporal_layer_context(&cpi->svc);
   RATE_CONTROL *const lrc = &lc->rc;
-  lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[temporal_layer];
+
+  lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
   lrc->av_per_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
   lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
   // Update the average layer frame size (non-cumulative per-frame-bw).
-  if (temporal_layer == 0) {
+  if (layer == 0) {
     lc->avg_frame_size = lrc->av_per_frame_bandwidth;
   } else {
-    double prev_layer_framerate = oxcf->framerate /
-        oxcf->ts_rate_decimator[temporal_layer - 1];
-    int prev_layer_target_bandwidth =
-        oxcf->ts_target_bitrate[temporal_layer - 1] * 1000;
+    const double prev_layer_framerate =
+        oxcf->framerate / oxcf->ts_rate_decimator[layer - 1];
+    const int prev_layer_target_bandwidth =
+        oxcf->ts_target_bitrate[layer - 1] * 1000;
     lc->avg_frame_size =
         (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
               (lc->framerate - prev_layer_framerate));
@@ -99,10 +103,10 @@
 }
 
 void vp9_restore_layer_context(VP9_COMP *const cpi) {
-  int temporal_layer = cpi->svc.temporal_layer_id;
-  LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer];
-  int frame_since_key = cpi->rc.frames_since_key;
-  int frame_to_key = cpi->rc.frames_to_key;
+  LAYER_CONTEXT *const lc = get_temporal_layer_context(&cpi->svc);
+  const int old_frame_since_key = cpi->rc.frames_since_key;
+  const int old_frame_to_key = cpi->rc.frames_to_key;
+
   cpi->rc = lc->rc;
   cpi->oxcf.target_bandwidth = lc->target_bandwidth;
   cpi->oxcf.starting_buffer_level = lc->starting_buffer_level;
@@ -111,17 +115,18 @@
   cpi->output_framerate = lc->framerate;
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
-  cpi->rc.frames_since_key = frame_since_key;
-  cpi->rc.frames_to_key = frame_to_key;
+  cpi->rc.frames_since_key = old_frame_since_key;
+  cpi->rc.frames_to_key = old_frame_to_key;
 }
 
 void vp9_save_layer_context(VP9_COMP *const cpi) {
-  int temporal_layer = cpi->svc.temporal_layer_id;
-  LAYER_CONTEXT *lc = &cpi->svc.layer_context[temporal_layer];
+  const VP9_CONFIG *const oxcf = &cpi->oxcf;
+  LAYER_CONTEXT *const lc = get_temporal_layer_context(&cpi->svc);
+
   lc->rc = cpi->rc;
-  lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth;
-  lc->starting_buffer_level = cpi->oxcf.starting_buffer_level;
-  lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level;
-  lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size;
+  lc->target_bandwidth = (int)oxcf->target_bandwidth;
+  lc->starting_buffer_level = oxcf->starting_buffer_level;
+  lc->optimal_buffer_level = oxcf->optimal_buffer_level;
+  lc->maximum_buffer_size = oxcf->maximum_buffer_size;
   lc->framerate = cpi->output_framerate;
 }
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index 8bc3850..996f730 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -216,7 +216,7 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 33, 64, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
-  comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
+  vp9_comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
   return vp9_variance64x32(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -273,7 +273,7 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 65, 32, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
-  comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
+  vp9_comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
   return vp9_variance32x64(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -330,7 +330,7 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 17, 32, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
-  comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
+  vp9_comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
   return vp9_variance32x16(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -387,7 +387,7 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 33, 16, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
-  comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
+  vp9_comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
   return vp9_variance16x32(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -614,7 +614,7 @@
 
   // Now filter Verticaly
   var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
-  comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
+  vp9_comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
   return vp9_variance4x4(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -658,7 +658,7 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 9, 8, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
-  comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
+  vp9_comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
   return vp9_variance8x8(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -703,7 +703,7 @@
                                     1, 17, 16, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
 
-  comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
+  vp9_comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
   return vp9_variance16x16(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -747,7 +747,7 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 65, 64, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
-  comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
+  vp9_comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
   return vp9_variance64x64(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -791,7 +791,7 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 33, 32, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
-  comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
+  vp9_comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
   return vp9_variance32x32(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -955,7 +955,7 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 9, 16, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
-  comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
+  vp9_comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
   return vp9_variance16x8(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -999,7 +999,7 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 17, 8, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
-  comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
+  vp9_comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
   return vp9_variance8x16(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -1043,7 +1043,7 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 5, 8, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
-  comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8);
+  vp9_comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8);
   return vp9_variance8x4(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
@@ -1089,6 +1089,23 @@
   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
                                     1, 9, 4, hfilter);
   var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
-  comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4);
+  vp9_comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4);
   return vp9_variance4x8(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
 }
+
+
+void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+                       int height, const uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      int tmp;
+      tmp = pred[j] + ref[j];
+      comp_pred[j] = (tmp + 1) >> 1;
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 3bc2091..62e20dc 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -100,21 +100,9 @@
   vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
-static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                          int height, const uint8_t *ref, int ref_stride) {
-  int i, j;
+void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+                       int height, const uint8_t *ref, int ref_stride);
 
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      int tmp;
-      tmp = pred[j] + ref[j];
-      comp_pred[j] = (tmp + 1) >> 1;
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_write_bit_buffer.c b/vp9/encoder/vp9_write_bit_buffer.c
new file mode 100644
index 0000000..962d0ca
--- /dev/null
+++ b/vp9/encoder/vp9_write_bit_buffer.c
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_write_bit_buffer.h"
+
+size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {
+  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
+}
+
+void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
+  const int off = (int)wb->bit_offset;
+  const int p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+  if (q == CHAR_BIT -1) {
+    wb->bit_buffer[p] = bit << q;
+  } else {
+    wb->bit_buffer[p] &= ~(1 << q);
+    wb->bit_buffer[p] |= bit << q;
+  }
+  wb->bit_offset = off + 1;
+}
+
+void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits) {
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    vp9_wb_write_bit(wb, (data >> bit) & 1);
+}
diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vp9/encoder/vp9_write_bit_buffer.h
index 1795e05..073608d 100644
--- a/vp9/encoder/vp9_write_bit_buffer.h
+++ b/vp9/encoder/vp9_write_bit_buffer.h
@@ -24,29 +24,11 @@
   size_t bit_offset;
 };
 
-static size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {
-  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
-}
+size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb);
 
-static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
-  const int off = (int)wb->bit_offset;
-  const int p = off / CHAR_BIT;
-  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
-  if (q == CHAR_BIT -1) {
-    wb->bit_buffer[p] = bit << q;
-  } else {
-    wb->bit_buffer[p] &= ~(1 << q);
-    wb->bit_buffer[p] |= bit << q;
-  }
-  wb->bit_offset = off + 1;
-}
+void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit);
 
-static void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb,
-                              int data, int bits) {
-  int bit;
-  for (bit = bits - 1; bit >= 0; bit--)
-    vp9_wb_write_bit(wb, (data >> bit) & 1);
-}
+void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits);
 
 
 #ifdef __cplusplus
diff --git a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
new file mode 100644
index 0000000..f31b176
--- /dev/null
+++ b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
@@ -0,0 +1,167 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>  // AVX2
+#include "vpx/vpx_integer.h"
+
+void vp9_sad32x32x4d_avx2(uint8_t *src,
+                          int src_stride,
+                          uint8_t *ref[4],
+                          int ref_stride,
+                          unsigned int res[4]) {
+  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+  __m256i sum_mlow, sum_mhigh;
+  int i;
+  uint8_t *ref0, *ref1, *ref2, *ref3;
+
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+  sum_ref0 = _mm256_set1_epi16(0);
+  sum_ref1 = _mm256_set1_epi16(0);
+  sum_ref2 = _mm256_set1_epi16(0);
+  sum_ref3 = _mm256_set1_epi16(0);
+  for (i = 0; i < 32 ; i++) {
+    // load src and all refs
+    src_reg = _mm256_load_si256((__m256i *)(src));
+    ref0_reg = _mm256_loadu_si256((__m256i *) (ref0));
+    ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));
+    ref2_reg = _mm256_loadu_si256((__m256i *) (ref2));
+    ref3_reg = _mm256_loadu_si256((__m256i *) (ref3));
+    // sum of the absolute differences between every ref-i to src
+    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+    // sum every ref-i
+    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+
+    src+= src_stride;
+    ref0+= ref_stride;
+    ref1+= ref_stride;
+    ref2+= ref_stride;
+    ref3+= ref_stride;
+  }
+  {
+    __m128i sum;
+    // in sum_ref-i the result is saved in the first 4 bytes
+    // the other 4 bytes are zeroed.
+    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
+    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
+    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+
+    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
+    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+
+    // merge every 64 bit from each sum_ref-i
+    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
+    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
+
+    // add the low 64 bit to the high 64 bit
+    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
+
+    // add the low 128 bit to the high 128 bit
+    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
+                        _mm256_extractf128_si256(sum_mlow, 1));
+
+    _mm_storeu_si128((__m128i *)(res), sum);
+  }
+}
+
+void vp9_sad64x64x4d_avx2(uint8_t *src,
+                          int src_stride,
+                          uint8_t *ref[4],
+                          int ref_stride,
+                          unsigned int res[4]) {
+  __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
+  __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
+  __m256i ref3_reg, ref3next_reg;
+  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+  __m256i sum_mlow, sum_mhigh;
+  int i;
+  uint8_t *ref0, *ref1, *ref2, *ref3;
+
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+  sum_ref0 = _mm256_set1_epi16(0);
+  sum_ref1 = _mm256_set1_epi16(0);
+  sum_ref2 = _mm256_set1_epi16(0);
+  sum_ref3 = _mm256_set1_epi16(0);
+  for (i = 0; i < 64 ; i++) {
+    // load 64 bytes from src and all refs
+    src_reg = _mm256_load_si256((__m256i *)(src));
+    srcnext_reg = _mm256_load_si256((__m256i *)(src + 32));
+    ref0_reg = _mm256_loadu_si256((__m256i *) (ref0));
+    ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32));
+    ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));
+    ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32));
+    ref2_reg = _mm256_loadu_si256((__m256i *) (ref2));
+    ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32));
+    ref3_reg = _mm256_loadu_si256((__m256i *) (ref3));
+    ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32));
+    // sum of the absolute differences between every ref-i to src
+    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+    ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
+    ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
+    ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
+    ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
+
+    // sum every ref-i
+    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
+    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
+    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
+    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
+    src+= src_stride;
+    ref0+= ref_stride;
+    ref1+= ref_stride;
+    ref2+= ref_stride;
+    ref3+= ref_stride;
+  }
+  {
+    __m128i sum;
+
+    // in sum_ref-i the result is saved in the first 4 bytes
+    // the other 4 bytes are zeroed.
+    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
+    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
+    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+
+    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
+    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+
+    // merge every 64 bit from each sum_ref-i
+    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
+    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
+
+    // add the low 64 bit to the high 64 bit
+    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
+
+    // add the low 128 bit to the high 128 bit
+    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
+                        _mm256_extractf128_si256(sum_mlow, 1));
+
+    _mm_storeu_si128((__m128i *)(res), sum);
+  }
+}
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 650d9ba..30c2c49 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -803,48 +803,25 @@
         if (cpi->droppable)
           pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
 
-        /*if (cpi->output_partition)
-        {
-            int i;
-            const int num_partitions = 1;
-
-            pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT;
-
-            for (i = 0; i < num_partitions; ++i)
-            {
-                pkt.data.frame.buf = cx_data;
-                pkt.data.frame.sz = cpi->partition_sz[i];
-                pkt.data.frame.partition_id = i;
-                // don't set the fragment bit for the last partition
-                if (i == (num_partitions - 1))
-                    pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
-                vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
-                cx_data += cpi->partition_sz[i];
-                cx_data_sz -= cpi->partition_sz[i];
-            }
+        if (ctx->pending_cx_data) {
+          ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+          ctx->pending_frame_magnitude |= size;
+          ctx->pending_cx_data_sz += size;
+          size += write_superframe_index(ctx);
+          pkt.data.frame.buf = ctx->pending_cx_data;
+          pkt.data.frame.sz  = ctx->pending_cx_data_sz;
+          ctx->pending_cx_data = NULL;
+          ctx->pending_cx_data_sz = 0;
+          ctx->pending_frame_count = 0;
+          ctx->pending_frame_magnitude = 0;
+        } else {
+          pkt.data.frame.buf = cx_data;
+          pkt.data.frame.sz  = size;
         }
-        else*/
-        {
-          if (ctx->pending_cx_data) {
-            ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
-            ctx->pending_frame_magnitude |= size;
-            ctx->pending_cx_data_sz += size;
-            size += write_superframe_index(ctx);
-            pkt.data.frame.buf = ctx->pending_cx_data;
-            pkt.data.frame.sz  = ctx->pending_cx_data_sz;
-            ctx->pending_cx_data = NULL;
-            ctx->pending_cx_data_sz = 0;
-            ctx->pending_frame_count = 0;
-            ctx->pending_frame_magnitude = 0;
-          } else {
-            pkt.data.frame.buf = cx_data;
-            pkt.data.frame.sz  = size;
-          }
-          pkt.data.frame.partition_id = -1;
-          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
-          cx_data += size;
-          cx_data_sz -= size;
-        }
+        pkt.data.frame.partition_id = -1;
+        vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+        cx_data += size;
+        cx_data_sz -= size;
       }
     }
   }
@@ -1172,8 +1149,7 @@
 CODEC_INTERFACE(vpx_codec_vp9_cx) = {
   "WebM Project VP9 Encoder" VERSION_STRING,
   VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
-  VPX_CODEC_CAP_OUTPUT_PARTITION,
+  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,
   /* vpx_codec_caps_t          caps; */
   vp9e_init,          /* vpx_codec_init_fn_t       init; */
   vp9e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 72701d9..4c9350e 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -280,7 +280,6 @@
       oxcf.width = ctx->si.w;
       oxcf.height = ctx->si.h;
       oxcf.version = 9;
-      oxcf.postprocess = 0;
       oxcf.max_threads = ctx->cfg.threads;
       oxcf.inv_tile_order = ctx->invert_tile_order;
       optr = vp9_create_decompressor(&oxcf);
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index b14e7e5..025e126 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -30,6 +30,7 @@
 VP9_CX_SRCS-yes += encoder/vp9_block.h
 VP9_CX_SRCS-yes += encoder/vp9_writer.h
 VP9_CX_SRCS-yes += encoder/vp9_writer.c
+VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.c
 VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.h
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
@@ -68,10 +69,10 @@
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
 VP9_CX_SRCS-yes += encoder/vp9_variance.c
-VP9_CX_SRCS-yes += encoder/vp9_vaq.c
-VP9_CX_SRCS-yes += encoder/vp9_vaq.h
-VP9_CX_SRCS-yes += encoder/vp9_craq.c
-VP9_CX_SRCS-yes += encoder/vp9_craq.h
+VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c
+VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h
+VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
+VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.h
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
@@ -89,6 +90,7 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index d48a761..d4f4e9f 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -583,8 +583,12 @@
   enc_cfg->rc_dropframe_thresh = 0;
   enc_cfg->rc_end_usage = VPX_CBR;
   enc_cfg->rc_resize_allowed = 0;
-  enc_cfg->rc_min_quantizer = 33;
-  enc_cfg->rc_max_quantizer = 33;
+
+  if (enc_cfg->g_pass == VPX_RC_ONE_PASS) {
+    enc_cfg->rc_min_quantizer = 33;
+    enc_cfg->rc_max_quantizer = 33;
+  }
+
   enc_cfg->rc_undershoot_pct = 100;
   enc_cfg->rc_overshoot_pct = 15;
   enc_cfg->rc_buf_initial_sz = 500;
@@ -784,12 +788,17 @@
   }
   layer_index = layer + VPX_SS_MAX_LAYERS - si->layers;
 
-  if (vpx_svc_is_keyframe(svc_ctx)) {
-    svc_params.min_quantizer = si->quantizer_keyframe[layer_index];
-    svc_params.max_quantizer = si->quantizer_keyframe[layer_index];
+  if (codec_ctx->config.enc->g_pass == VPX_RC_ONE_PASS) {
+    if (vpx_svc_is_keyframe(svc_ctx)) {
+      svc_params.min_quantizer = si->quantizer_keyframe[layer_index];
+      svc_params.max_quantizer = si->quantizer_keyframe[layer_index];
+    } else {
+      svc_params.min_quantizer = si->quantizer[layer_index];
+      svc_params.max_quantizer = si->quantizer[layer_index];
+    }
   } else {
-    svc_params.min_quantizer = si->quantizer[layer_index];
-    svc_params.max_quantizer = si->quantizer[layer_index];
+    svc_params.min_quantizer = codec_ctx->config.enc->rc_min_quantizer;
+    svc_params.max_quantizer = codec_ctx->config.enc->rc_max_quantizer;
   }
 
   svc_params.distance_from_i_frame = si->frame_within_gop;