Merge "mb_lpf_horizontal_edge AVX2 optimization"
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 6d50644..85f4bb6 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -176,14 +176,32 @@
   }
 }
 
-class DatarateTestVP9 : public DatarateTest {
+class DatarateTestVP9 : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+  DatarateTestVP9() : EncoderTest(GET_PARAM(0)) {}
+
  protected:
   virtual ~DatarateTestVP9() {}
 
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+    ResetModel();
+  }
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    frame_number_ = 0;
+    bits_total_ = 0;
+    duration_ = 0.0;
+  }
+
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                     ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 1) {
-      encoder->Control(VP8E_SET_CPUUSED, 2);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
     }
     const vpx_rational_t tb = video->timebase();
     timebase_ = static_cast<double>(tb.num) / tb.den;
@@ -205,6 +223,14 @@
       effective_datarate_ = ((bits_total_) / 1000.0) / duration_;
     }
   }
+
+  vpx_codec_pts_t last_pts_;
+  double timebase_;
+  int frame_number_;
+  int64_t bits_total_;
+  double duration_;
+  double effective_datarate_;
+  int set_cpu_used_;
 };
 
 // There is no buffer model/frame dropper in VP9 currently, so for now we
@@ -218,7 +244,7 @@
 
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 140);
-  for (int i = 200; i < 800; i += 200) {
+  for (int i = 150; i < 800; i += 200) {
     cfg_.rc_target_bitrate = i;
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
@@ -231,5 +257,6 @@
 
 VP8_INSTANTIATE_TEST_CASE(DatarateTest, ALL_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9,
-                          ::testing::Values(::libvpx_test::kOnePassGood));
+                          ::testing::Values(::libvpx_test::kOnePassGood),
+                          ::testing::Range(1, 5));
 }  // namespace
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 3347b35..21c91d6 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -309,8 +309,8 @@
   192, 128, 64
 };
 
-static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1]
-                                                  [SWITCHABLE_FILTERS-1] = {
+static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                                    [SWITCHABLE_FILTERS - 1] = {
   { 235, 162, },
   { 36, 255, },
   { 34, 3, },
@@ -416,7 +416,7 @@
                       fc->partition_prob[INTER_FRAME][i], 0);
 
   if (cm->mcomp_filter_type == SWITCHABLE) {
-    for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
       update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
                         counts->switchable_interp[i],
                         pre_fc->switchable_interp_prob[i],
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index ab37b75..ea96555 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -16,6 +16,7 @@
 
 #define TX_SIZE_CONTEXTS 2
 #define SWITCHABLE_FILTERS 3   // number of switchable filters
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
 
 // #define MODE_STATS
 
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index a823de8..ba2e9d8 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -43,7 +43,7 @@
   vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
   vp9_prob partition_prob[FRAME_TYPES][PARTITION_CONTEXTS][PARTITION_TYPES - 1];
   vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
-  vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
+  vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                  [SWITCHABLE_FILTERS - 1];
   vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
   vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
@@ -62,7 +62,7 @@
   vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
   unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
                          [COEF_BANDS][PREV_COEF_CONTEXTS];
-  unsigned int switchable_interp[SWITCHABLE_FILTERS + 1]
+  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
                                 [SWITCHABLE_FILTERS];
   unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
   unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 53b9003..1c96788 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -161,13 +161,12 @@
     // scaling case. It needs to be done on the scaled MV, not the pre-scaling
     // MV. Note however that it performs the subsampling aware scaling so
     // that the result is always q4.
-    const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
-                                                pd->subsampling_x,
-                                                pd->subsampling_y);
+    // mv_precision precision is MV_PRECISION_Q4.
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+                                               pd->subsampling_x,
+                                               pd->subsampling_y);
 
     uint8_t *pre;
-    // mv_precision precision is MV_PRECISION_Q4.
-    const MV mv_q4 = {res_mv.row, res_mv.col };
     MV32 scaled_mv;
     int xs, ys;
 
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 30d5b6d..bf3a101 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -109,7 +109,7 @@
 
 static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   int i, j;
-  for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j)
+  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
       vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
 }
@@ -382,15 +382,11 @@
 static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                            const TileInfo *const tile,
                            int mi_row, int mi_col,
-                           vp9_reader *r, BLOCK_SIZE bsize, int index) {
+                           vp9_reader *r, BLOCK_SIZE bsize) {
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi;
   int eobtotal;
 
-  if (less8x8)
-    if (index > 0)
-      return;
-
   set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
   vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
 
@@ -448,54 +444,50 @@
 static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                             const TileInfo *const tile,
                             int mi_row, int mi_col,
-                            vp9_reader* r, BLOCK_SIZE bsize, int index) {
+                            vp9_reader* r, BLOCK_SIZE bsize) {
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
-  PARTITION_TYPE partition = PARTITION_NONE;
+  PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
+  int ctx;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  if (bsize < BLOCK_8X8) {
-    if (index > 0)
-      return;
-  } else {
-    const int ctx = partition_plane_context(xd->above_seg_context,
-                                            xd->left_seg_context,
-                                            mi_row, mi_col, bsize);
-    partition = read_partition(hbs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
-                               cm->fc.partition_prob[cm->frame_type][ctx], r);
+  ctx = partition_plane_context(xd->above_seg_context, xd->left_seg_context,
+                                mi_row, mi_col, bsize);
+  partition = read_partition(hbs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
+                             cm->fc.partition_prob[cm->frame_type][ctx], r);
 
-    if (!cm->frame_parallel_decoding_mode)
-      ++cm->counts.partition[ctx][partition];
-  }
+  if (!cm->frame_parallel_decoding_mode)
+    ++cm->counts.partition[ctx][partition];
 
   subsize = get_subsize(bsize, partition);
-
-  switch (partition) {
-    case PARTITION_NONE:
-      decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, 0);
-      break;
-    case PARTITION_HORZ:
-      decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, 0);
-      if (mi_row + hbs < cm->mi_rows)
-        decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, 1);
-      break;
-    case PARTITION_VERT:
-      decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, 0);
-      if (mi_col + hbs < cm->mi_cols)
-        decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, 1);
-      break;
-    case PARTITION_SPLIT: {
-      int n;
-      for (n = 0; n < 4; n++) {
-        const int j = n >> 1, i = n & 1;
-        decode_modes_sb(cm, xd, tile, mi_row + j * hbs, mi_col + i * hbs,
-                        r, subsize, n);
-      }
-    } break;
-    default:
-      assert(!"Invalid partition type");
+  if (subsize < BLOCK_8X8) {
+    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+        break;
+      case PARTITION_HORZ:
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+        if (mi_row + hbs < cm->mi_rows)
+          decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+        break;
+      case PARTITION_VERT:
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+        if (mi_col + hbs < cm->mi_cols)
+          decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+        break;
+      case PARTITION_SPLIT:
+        decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
+        break;
+      default:
+        assert(!"Invalid partition type");
+    }
   }
 
   // update partition context
@@ -780,7 +772,7 @@
     vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
-      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64, 0);
+      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64);
 
     if (pbi->do_loopfilter_inline) {
       const int lf_start = mi_row - MI_BLOCK_SIZE;
@@ -935,7 +927,7 @@
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
       decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
-                      mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64, 0);
+                      mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
   }
   return !tile_data->xd.corrupted;
 }
@@ -963,7 +955,8 @@
 
       vp9_worker_init(worker);
       worker->hook = (VP9WorkerHook)tile_worker_hook;
-      CHECK_MEM_ERROR(cm, worker->data1, vpx_malloc(sizeof(TileWorkerData)));
+      CHECK_MEM_ERROR(cm, worker->data1,
+                      vpx_memalign(32, sizeof(TileWorkerData)));
       CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
       if (i < num_workers - 1 && !vp9_worker_reset(worker)) {
         vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index c677907..a996e0e 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -53,8 +53,7 @@
 int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES];
 int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1];
 int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1]
-                               [SWITCHABLE_FILTERS];
+int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
 
 void init_tx_count_stats() {
   vp9_zero(tx_count_32x32p_stats);
@@ -87,10 +86,9 @@
 
 static void update_switchable_interp_stats(VP9_COMMON *cm) {
   int i, j;
-  for (i = 0; i < SWITCHABLE_FILTERS+1; ++i)
-    for (j = 0; j < SWITCHABLE_FILTERS; ++j) {
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    for (j = 0; j < SWITCHABLE_FILTERS; ++j)
       switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
-    }
 }
 
 void write_tx_count_stats() {
@@ -140,9 +138,9 @@
   fclose(fp);
 
   printf(
-      "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]"
+      "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]"
       "[SWITCHABLE_FILTERS] = {\n");
-  for (i = 0; i < SWITCHABLE_FILTERS+1; i++) {
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
     printf("  { ");
     for (j = 0; j < SWITCHABLE_FILTERS; j++) {
       printf("%"PRId64", ", switchable_interp_stats[i][j]);
@@ -236,17 +234,16 @@
 static void update_switchable_interp_probs(VP9_COMP *const cpi,
                                            vp9_writer* const bc) {
   VP9_COMMON *const cm = &cpi->common;
-  unsigned int branch_ct[SWITCHABLE_FILTERS + 1]
-                        [SWITCHABLE_FILTERS - 1][2];
-  vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1];
+  unsigned int branch_ct[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1][2];
+  vp9_prob new_prob[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1];
   int i, j;
-  for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
+  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
     vp9_tree_probs_from_distribution(
         vp9_switchable_interp_tree,
         new_prob[j], branch_ct[j],
         cm->counts.switchable_interp[j], 0);
   }
-  for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
+  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
       vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
                                 branch_ct[j][i]);
@@ -1142,7 +1139,7 @@
     int i, j, c = 0;
     for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
       count[i] = 0;
-      for (j = 0; j <= SWITCHABLE_FILTERS; ++j)
+      for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
         count[i] += cm->counts.switchable_interp[j][i];
       c += (count[i] > 0);
     }
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index db2564b..583c6c8 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -42,7 +42,7 @@
   int comp_pred_diff;
   int single_pred_diff;
   int64_t tx_rd_diff[TX_MODES];
-  int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
 
   // motion vector cache for adaptive motion search control in partition
   // search loop
@@ -118,8 +118,7 @@
   unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
-  int switchable_interp_costs[SWITCHABLE_FILTERS + 1]
-                             [SWITCHABLE_FILTERS];
+  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
 
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 9408e54..44ade18 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -465,7 +465,7 @@
     cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
     cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
 
-    for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
       cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
   }
 }
@@ -2279,7 +2279,7 @@
       cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
     }
 
-    for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
       const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs;
       cpi->rd_filter_threshes[frame_type][i] =
           (cpi->rd_filter_threshes[frame_type][i] + diff) / 2;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 91546e8..e52e8ec 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -418,6 +418,7 @@
   struct encode_b_args *const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx *const ctx = args->ctx;
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
                                                        block);
@@ -429,14 +430,18 @@
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
   if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+    int x, y;
     pd->eobs[block] = 0;
+    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+    ctx->ta[plane][x] = 0;
+    ctx->tl[plane][y] = 0;
     return;
   }
 
   vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
 
   if (x->optimize)
-    vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
+    vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
 
   if (x->skip_encode || pd->eobs[block] == 0)
     return;
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index b867d8b..7eb6592 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -36,7 +36,7 @@
                   vp9_kf_uv_mode_prob[INTRA_MODES - 1],
                   vp9_intra_mode_tree);
 
-  for (i = 0; i <= SWITCHABLE_FILTERS; ++i)
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
     vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
                     cm->fc.switchable_interp_prob[i],
                     vp9_switchable_interp_tree);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index ad214c7..b664f1e 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -2866,7 +2866,7 @@
       cpi->active_best_quality = inter_minq[q];
       // 1-pass: for now, use the average Q for the active_best, if its lower
       // than active_worst.
-      if (cpi->pass == 0 && (cpi->avg_frame_qindex < cpi->active_worst_quality))
+      if (cpi->pass == 0 && (cpi->avg_frame_qindex < q))
         cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex];
 #endif
 
@@ -2902,7 +2902,14 @@
   if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
     *top_index =
       (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4;
+    // If this is the first (key) frame in 1-pass, active best is the user
+    // best-allowed, and leave the top_index to active_worst.
+    if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
+      cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+      *top_index = cpi->oxcf.worst_allowed_q;
+    }
   } else if (!cpi->is_src_frame_alt_ref &&
+             (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     *top_index =
       (cpi->active_worst_quality + cpi->active_best_quality) / 2;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 20831be..0498043 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -396,9 +396,9 @@
   // FIXME(rbultje) can this overflow?
   int rd_tx_select_threshes[4][TX_MODES];
 
-  int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1];
-  int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1];
-  int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1];
+  int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS];
+  int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS];
 
   int RDMULT;
   int RDDIV;
@@ -641,7 +641,7 @@
 
   int dummy_packing;    /* flag to indicate if packing is dummy */
 
-  unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1]
+  unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS]
                                       [SWITCHABLE_FILTERS];
 
   unsigned int tx_stepdown_count[TX_SIZES];
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index c134208..f9de78b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -611,7 +611,7 @@
   // TODO(jingning): temporarily enabled only for luma component
   rd = MIN(rd1, rd2);
   if (plane == 0)
-    x->zcoeff_blk[tx_size][block] = rd1 > rd2;
+    x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
 
   args->this_rate += args->rate;
   args->this_dist += args->dist;
@@ -1654,6 +1654,7 @@
   MB_PREDICTION_MODE this_mode;
   MODE_INFO *mi = x->e_mbd.mi_8x8[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
+  struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
   const int label_count = 4;
   int64_t this_segment_rd = 0;
   int label_mv_thresh;
@@ -1668,8 +1669,8 @@
   int subpelmv = 1, have_ref = 0;
   const int has_second_rf = has_second_ref(mbmi);
 
-  vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
+  vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
+  vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
 
   v_fn_ptr = &cpi->fn_ptr[bsize];
 
@@ -1747,7 +1748,7 @@
           }
         }
 
-        vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
+        vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
                    sizeof(bsi->rdstat[i][mode_idx].ta));
         vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
@@ -1951,6 +1952,13 @@
               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
             vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
                        sizeof(SEG_RDSTAT));
+            if (num_4x4_blocks_wide > 1)
+              bsi->rdstat[i + 1][mode_idx].eobs =
+                  ref_bsi->rdstat[i + 1][mode_idx].eobs;
+            if (num_4x4_blocks_high > 1)
+              bsi->rdstat[i + 2][mode_idx].eobs =
+                  ref_bsi->rdstat[i + 2][mode_idx].eobs;
+
             if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
               mode_selected = this_mode;
               best_rd = bsi->rdstat[i][mode_idx].brdcost;
@@ -1971,7 +1979,11 @@
           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
                                             bsi->rdstat[i][mode_idx].brate, 0);
           bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
-          bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i];
+          bsi->rdstat[i][mode_idx].eobs = pd->eobs[i];
+          if (num_4x4_blocks_wide > 1)
+            bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1];
+          if (num_4x4_blocks_high > 1)
+            bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2];
         }
 
         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
@@ -2207,7 +2219,7 @@
                          int_mv *second_ref_mv,
                          int64_t comp_pred_diff[NB_PREDICTION_TYPES],
                          int64_t tx_size_diff[TX_MODES],
-                         int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) {
+                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   // Take a snapshot of the coding context so it can be
@@ -2225,7 +2237,7 @@
 
   vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
   vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
-             sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1));
+             sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
 }
 
 static void setup_pred_block(const MACROBLOCKD *xd,
@@ -2271,12 +2283,8 @@
   // set up scaling factors
   scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
 
-  scale[frame_type].x_offset_q4 =
-      ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].sfc->x_scale_fp,
-       REF_SCALE_SHIFT) & 0xf;
-  scale[frame_type].y_offset_q4 =
-      ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].sfc->y_scale_fp,
-       REF_SCALE_SHIFT) & 0xf;
+  scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type],
+                                            mi_row * MI_SIZE, mi_col * MI_SIZE);
 
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
@@ -3119,8 +3127,8 @@
   int64_t best_tx_diff[TX_MODES];
   int64_t best_pred_diff[NB_PREDICTION_TYPES];
   int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
-  int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode = { 0 };
   int j;
   int mode_index, best_mode_index = 0;
@@ -3158,7 +3166,7 @@
     best_pred_rd[i] = INT64_MAX;
   for (i = 0; i < TX_MODES; i++)
     best_tx_rd[i] = INT64_MAX;
-  for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
     best_filter_rd[i] = INT64_MAX;
   for (i = 0; i < TX_SIZES; i++)
     rate_uv_intra[i] = INT_MAX;
@@ -3547,7 +3555,7 @@
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-      for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
     }
 
@@ -3630,7 +3638,7 @@
         cm->mcomp_filter_type != BILINEAR) {
       int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
                               SWITCHABLE_FILTERS : cm->mcomp_filter_type];
-      for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
         int64_t adj_rd;
         // In cases of poor prediction, filter_cache[] can contain really big
         // values, which actually are bigger than this_rd itself. This can
@@ -3752,7 +3760,7 @@
   }
 
   if (!x->skip) {
-    for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
       if (best_filter_rd[i] == INT64_MAX)
         best_filter_diff[i] = 0;
       else
@@ -3817,8 +3825,8 @@
   int64_t best_tx_diff[TX_MODES];
   int64_t best_pred_diff[NB_PREDICTION_TYPES];
   int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
-  int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode = { 0 };
   int mode_index, best_mode_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
@@ -3840,7 +3848,7 @@
   int best_skip2 = 0;
 
   x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
-  vp9_zero(x->zcoeff_blk);
+  vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
 
   for (i = 0; i < 4; i++) {
     int j;
@@ -3855,7 +3863,7 @@
     best_pred_rd[i] = INT64_MAX;
   for (i = 0; i < TX_MODES; i++)
     best_tx_rd[i] = INT64_MAX;
-  for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
     best_filter_rd[i] = INT64_MAX;
   for (i = 0; i < TX_SIZES; i++)
     rate_uv_intra[i] = INT_MAX;
@@ -4141,8 +4149,10 @@
               tmp_best_sse = total_sse;
               tmp_best_skippable = skippable;
               tmp_best_mbmode = *mbmi;
-              for (i = 0; i < 4; i++)
+              for (i = 0; i < 4; i++) {
                 tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
+                x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i];
+              }
               pred_exists = 1;
               if (switchable_filter_index == 0 &&
                   cpi->sf.use_rd_breakout &&
@@ -4297,7 +4307,7 @@
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-      for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
     }
 
@@ -4375,7 +4385,7 @@
         cm->mcomp_filter_type != BILINEAR) {
       int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
                               SWITCHABLE_FILTERS : cm->mcomp_filter_type];
-      for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
         int64_t adj_rd;
         // In cases of poor prediction, filter_cache[] can contain really big
         // values, which actually are bigger than this_rd itself. This can
@@ -4491,7 +4501,7 @@
   }
 
   if (!x->skip) {
-    for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
       if (best_filter_rd[i] == INT64_MAX)
         best_filter_diff[i] = 0;
       else