Merge "Remove unused 16x3/3x16 sad SSE2 functions."
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 16244e0..385dcc1 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -468,8 +468,8 @@
   }
 #if CONFIG_ALPHA
   // TODO(jkoleszar): Using the Y w/h for now
-  mb->plane[3].subsampling_x = 0;
-  mb->plane[3].subsampling_y = 0;
+  xd->plane[3].subsampling_x = 0;
+  xd->plane[3].subsampling_y = 0;
 #endif
 }
 
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 31cf69a..37bdad2 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -55,32 +55,30 @@
   return data > max ? max : data;
 }
 
-static void setup_txfm_mode(VP9_COMMON *pc, int lossless, vp9_reader *r) {
-  if (lossless) {
-    pc->txfm_mode = ONLY_4X4;
-  } else {
-    pc->txfm_mode = vp9_read_literal(r, 2);
-    if (pc->txfm_mode == ALLOW_32X32)
-      pc->txfm_mode += vp9_read_bit(r);
+static TXFM_MODE read_tx_mode(vp9_reader *r) {
+  TXFM_MODE txfm_mode = vp9_read_literal(r, 2);
+  if (txfm_mode == ALLOW_32X32)
+    txfm_mode += vp9_read_bit(r);  // extra bit can raise it past ALLOW_32X32
+  return txfm_mode;
+}
 
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      int i, j;
-      for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
-        for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j)
-          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-            vp9_diff_update_prob(r, &pc->fc.tx_probs_8x8p[i][j]);
+static void read_tx_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+  int i, j;
 
-      for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
-        for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j)
-          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-            vp9_diff_update_prob(r, &pc->fc.tx_probs_16x16p[i][j]);
+  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+    for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j)
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &fc->tx_probs_8x8p[i][j]);
 
-      for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
-        for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j)
-          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-            vp9_diff_update_prob(r, &pc->fc.tx_probs_32x32p[i][j]);
-    }
-  }
+  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+    for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j)
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &fc->tx_probs_16x16p[i][j]);
+
+  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+    for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j)
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &fc->tx_probs_32x32p[i][j]);
 }
 
 static void mb_init_dequantizer(VP9_COMMON *pc, MACROBLOCKD *xd) {
@@ -370,33 +368,21 @@
 static void read_coef_probs_common(FRAME_CONTEXT *fc, TX_SIZE tx_size,
                                    vp9_reader *r) {
   vp9_coeff_probs_model *coef_probs = fc->coef_probs[tx_size];
+  int i, j, k, l, m;
 
-  if (vp9_read_bit(r)) {
-    int i, j, k, l, m;
-    for (i = 0; i < BLOCK_TYPES; i++) {
-      for (j = 0; j < REF_TYPES; j++) {
-        for (k = 0; k < COEF_BANDS; k++) {
-          for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
-            if (l >= 3 && k == 0)
-              continue;
-
-            for (m = 0; m < UNCONSTRAINED_NODES; m++) {
-              vp9_prob *const p = coef_probs[i][j][k][l] + m;
-
-              if (vp9_read(r, VP9_COEF_UPDATE_PROB))
-                vp9_diff_update_prob(r, p);
-            }
-          }
-        }
-      }
-    }
-  }
+  if (vp9_read_bit(r))
+    for (i = 0; i < BLOCK_TYPES; i++)
+      for (j = 0; j < REF_TYPES; j++)
+        for (k = 0; k < COEF_BANDS; k++)
+          for (l = 0; l < PREV_COEF_CONTEXTS; l++)
+            if (k > 0 || l < 3)
+              for (m = 0; m < UNCONSTRAINED_NODES; m++)
+                if (vp9_read(r, VP9_COEF_UPDATE_PROB))
+                  vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
 }
 
-static void read_coef_probs(VP9D_COMP *pbi, vp9_reader *r) {
-  const TXFM_MODE txfm_mode = pbi->common.txfm_mode;
-  FRAME_CONTEXT *const fc = &pbi->common.fc;
-
+static void read_coef_probs(FRAME_CONTEXT *fc, TXFM_MODE txfm_mode,
+                            vp9_reader *r) {
   read_coef_probs_common(fc, TX_4X4, r);
 
   if (txfm_mode > ONLY_4X4)
@@ -923,6 +909,26 @@
   return vp9_rb_read_literal(rb, 16);
 }
 
+static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
+                                  size_t partition_size) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  vp9_reader r;
+
+  if (vp9_reader_init(&r, data, partition_size))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder 0");
+  // When xd->lossless is set, no tx mode is read; it is fixed to ONLY_4X4.
+  cm->txfm_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r);
+  if (cm->txfm_mode == TX_MODE_SELECT)
+    read_tx_probs(&cm->fc, &r);
+  read_coef_probs(&cm->fc, cm->txfm_mode, &r);
+
+  vp9_prepare_read_mode_info(pbi, &r);
+  // Report the reader's error status; vp9_decode_frame ORs it into corrupted.
+  return vp9_reader_has_error(&r);
+}
+
 void vp9_init_dequantizer(VP9_COMMON *pc) {
   int q;
 
@@ -939,7 +945,7 @@
 
 int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
   int i;
-  vp9_reader header_bc, residual_bc;
+  vp9_reader residual_bc;
   VP9_COMMON *const pc = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
 
@@ -973,10 +979,6 @@
   xd->frame_type = pc->frame_type;
   xd->mode_info_stride = pc->mode_info_stride;
 
-  if (vp9_reader_init(&header_bc, data, first_partition_size))
-    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate bool decoder 0");
-
   mb_init_dequantizer(pc, &pbi->mb);  // MB level dequantizer setup
 
   if (!keyframe)
@@ -986,15 +988,13 @@
 
   update_frame_context(&pc->fc);
 
-  setup_txfm_mode(pc, xd->lossless, &header_bc);
-
-  read_coef_probs(pbi, &header_bc);
-
   // Initialize xd pointers. Any reference should do for xd->pre, so use 0.
   setup_pre_planes(xd, 0, &pc->yv12_fb[pc->active_ref_idx[0]], 0, 0,
                    NULL, NULL);
   setup_dst_planes(xd, new_fb, 0, 0);
 
+  new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size);
+
   // Create the segmentation map structure and set to 0
   if (!pc->last_frame_seg_map)
     CHECK_MEM_ERROR(pc, pc->last_frame_seg_map,
@@ -1008,14 +1008,12 @@
 
   set_prev_mi(pc);
 
-  vp9_prepare_read_mode_info(pbi, &header_bc);
-
   decode_tiles(pbi, data, first_partition_size, &residual_bc);
 
   pc->last_width = pc->width;
   pc->last_height = pc->height;
 
-  new_fb->corrupted = vp9_reader_has_error(&header_bc) | xd->corrupted;
+  new_fb->corrupted |= xd->corrupted;
 
   if (!pbi->decoded_key_frame) {
     if (keyframe && !new_fb->corrupted)
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 9c04d8a..adaf667 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -389,7 +389,7 @@
       THR_TM /*TM_PRED*/,
       THR_B_PRED /*I4X4_PRED*/,
     };
-    cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
+    cpi->mode_chosen_counts[kf_mode_index[mi->mbmi.mode]]++;
 #endif
   } else {
     // Note how often each mode chosen as best
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index cc3c5c0..67d0c4c 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -720,6 +720,7 @@
   sf->disable_splitmv = 0;
   sf->mode_search_skip_flags = 0;
   sf->last_chroma_intra_mode = TM_PRED;
+  sf->use_rd_breakout = 0;
 
   // Skip any mode not chosen at size < X for all sizes > X
   // Hence BLOCK_SIZE_SB64X64 (skip is off)
@@ -767,6 +768,7 @@
                                      FLAG_SKIP_INTRA_BESTINTER |
                                      FLAG_SKIP_COMP_BESTINTRA;
         sf->last_chroma_intra_mode = H_PRED;
+        sf->use_rd_breakout = 1;
       }
       if (speed == 2) {
         sf->adjust_thresholds_by_speed = 1;
@@ -790,6 +792,7 @@
                                      FLAG_SKIP_COMP_BESTINTRA |
                                      FLAG_SKIP_COMP_REFMISMATCH;
         sf->last_chroma_intra_mode = DC_PRED;
+        sf->use_rd_breakout = 1;
       }
       if (speed == 3) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -804,6 +807,7 @@
                                      FLAG_SKIP_INTRA_BESTINTER |
                                      FLAG_SKIP_COMP_BESTINTRA |
                                      FLAG_SKIP_COMP_REFMISMATCH;
+        sf->use_rd_breakout = 1;
       }
       if (speed == 4) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -818,6 +822,7 @@
                                      FLAG_SKIP_INTRA_BESTINTER |
                                      FLAG_SKIP_COMP_BESTINTRA |
                                      FLAG_SKIP_COMP_REFMISMATCH;
+        sf->use_rd_breakout = 1;
       }
       /*
       if (speed == 2) {
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 0ecbf35..08f6bb6 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -275,6 +275,7 @@
   // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
   unsigned int mode_search_skip_flags;
   MB_PREDICTION_MODE last_chroma_intra_mode;
+  int use_rd_breakout;
 } SPEED_FEATURES;
 
 enum BlockSize {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 6116f3f..4cb38f7 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -473,6 +473,31 @@
   *out_dist_sum = dist_sum << 4;
 }
 
+static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                              MACROBLOCK *x, MACROBLOCKD *xd,
+                              int *out_rate_sum, int64_t *out_dist_sum) {
+  // Our transform coeffs are 8 times an orthogonal transform, so the
+  // quantizer step is also 8 times larger. The effective quantizer passed
+  // to the modeling function below is therefore dequant[1] >> 3.
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+
+  // TODO(dkovalev): this duplicates the code in get_plane_block_size
+  const int bw = plane_block_width(bsize, pd);
+  const int bh = plane_block_height(bsize, pd);
+  const enum BlockSize bs = get_block_size(bw, bh);
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+  (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                            pd->dst.buf, pd->dst.stride, &sse);
+  // sse works better than the returned var, since no dc prediction is used
+  model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
+
+  *out_rate_sum = rate;
+  *out_dist_sum = dist << 4;
+}
+
 static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
                                  TX_SIZE tx_size,
                                  MACROBLOCK *x, MACROBLOCKD *xd,
@@ -1643,14 +1668,16 @@
   return cost;
 }
 
-static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
+static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                                        MACROBLOCK *x,
+                                       int64_t best_yrd,
                                        int i,
                                        int *labelyrate,
                                        int64_t *distortion,
                                        ENTROPY_CONTEXT *ta,
                                        ENTROPY_CONTEXT *tl) {
   int k;
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
   const int bw = plane_block_width(bsize, &xd->plane[0]);
@@ -1672,9 +1699,6 @@
   int64_t thisdistortion = 0;
   int thisrate = 0;
 
-  *labelyrate = 0;
-  *distortion = 0;
-
   vp9_build_inter_predictor(pre,
                             xd->plane[0].pre[0].stride,
                             dst,
@@ -1684,9 +1708,6 @@
                             bw, bh, 0 /* no avg */, &xd->subpix,
                             MV_PRECISION_Q3);
 
-  // TODO(debargha): Make this work properly with the
-  // implicit-compoundinter-weight experiment when implicit
-  // weighting for splitmv modes is turned on.
   if (xd->mode_info_context->mbmi.ref_frame[1] > 0) {
     uint8_t* const second_pre =
     raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
@@ -1699,10 +1720,28 @@
                               &xd->subpix, MV_PRECISION_Q3);
   }
 
+  // Turned off for now: it hurts quality and does not improve speed much.
+  // NOTE(review): appears to be the only caller of model_rd_for_sb_y().
+  /*
+  if (cpi->sf.use_rd_breakout &&
+      best_yrd < INT64_MAX) {
+    int64_t thisrd;
+    model_rd_for_sb_y(cpi, bsize, x, xd, &thisrate, &thisdistortion);
+    thisrd = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion);
+    if (thisrd / 2 > best_yrd) {
+      *distortion = thisdistortion;
+      *labelyrate = thisrate;
+      return thisrd;
+    }
+  }
+  */
+
   vp9_subtract_block(bh, bw, src_diff, 8,
                      src, src_stride,
                      dst, xd->plane[0].dst.stride);
 
+  *labelyrate = 0;
+  *distortion = 0;
   k = i;
   for (idy = 0; idy < bh / 4; ++idy) {
     for (idx = 0; idx < bw / 4; ++idx) {
@@ -1788,7 +1827,7 @@
   MB_PREDICTION_MODE this_mode;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   const int label_count = 4;
-  int64_t this_segment_rd = 0, other_segment_rd;
+  int64_t this_segment_rd = 0;
   int label_mv_thresh;
   int segmentyrate = 0;
   int best_eobs[4] = { 0 };
@@ -1811,15 +1850,13 @@
   label_mv_thresh = 1 * bsi->mvthresh / label_count;
 
   // Segmentation method overheads
-  other_segment_rd = this_segment_rd;
-
   for (idy = 0; idy < 2; idy += bh) {
     for (idx = 0; idx < 2; idx += bw) {
       // TODO(jingning,rbultje): rewrite the rate-distortion optimization
       // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
       int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT];
       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-      int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
+      int64_t best_label_rd = INT64_MAX;
       MB_PREDICTION_MODE mode_selected = ZEROMV;
       int bestlabelyrate = 0;
       i = idy * 2 + idx;
@@ -1960,8 +1997,9 @@
             mv_check_bounds(x, &second_mode_mv[this_mode]))
           continue;
 
-        this_rd = encode_inter_mb_segment(&cpi->common,
-                                          x, i, &labelyrate,
+        this_rd = encode_inter_mb_segment(cpi, x,
+                                          bsi->segment_rd - this_segment_rd,
+                                          i, &labelyrate,
                                           &distortion, t_above_s, t_left_s);
         this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
         rate += labelyrate;
@@ -1990,7 +2028,11 @@
       bd += sbd;
       segmentyrate += bestlabelyrate;
       this_segment_rd += best_label_rd;
-      other_segment_rd += best_other_rd;
+
+      if (this_segment_rd > bsi->segment_rd) {
+        bsi->segment_rd = INT64_MAX;
+        return;
+      }
 
       for (j = 1; j < bh; ++j)
         vpx_memcpy(&x->partition_info->bmi[i + j * 2],
@@ -2003,33 +2045,31 @@
     }
   } /* for each label */
 
-  if (this_segment_rd < bsi->segment_rd) {
-    bsi->r = br;
-    bsi->d = bd;
-    bsi->segment_yrate = segmentyrate;
-    bsi->segment_rd = this_segment_rd;
+  bsi->r = br;
+  bsi->d = bd;
+  bsi->segment_yrate = segmentyrate;
+  bsi->segment_rd = this_segment_rd;
 
-    // store everything needed to come back to this!!
-    for (i = 0; i < 4; i++) {
-      bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
-      if (mbmi->ref_frame[1] > 0)
-        bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
-      bsi->modes[i] = x->partition_info->bmi[i].mode;
-      bsi->eobs[i] = best_eobs[i];
-    }
+  // store everything needed to come back to this!!
+  for (i = 0; i < 4; i++) {
+    bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
+    if (mbmi->ref_frame[1] > 0)
+      bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
+    bsi->modes[i] = x->partition_info->bmi[i].mode;
+    bsi->eobs[i] = best_eobs[i];
   }
 }
 
-static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
-                                       int_mv *best_ref_mv,
-                                       int_mv *second_best_ref_mv,
-                                       int64_t best_rd,
-                                       int *returntotrate,
-                                       int *returnyrate,
-                                       int64_t *returndistortion,
-                                       int *skippable, int mvthresh,
-                                       int_mv seg_mvs[4][MAX_REF_FRAMES],
-                                       int mi_row, int mi_col) {
+static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
+                                           int_mv *best_ref_mv,
+                                           int_mv *second_best_ref_mv,
+                                           int64_t best_rd,
+                                           int *returntotrate,
+                                           int *returnyrate,
+                                           int64_t *returndistortion,
+                                           int *skippable, int mvthresh,
+                                           int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                           int mi_row, int mi_col) {
   int i;
   BEST_SEG_INFO bsi;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
@@ -2078,7 +2118,7 @@
   *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
   mbmi->mode = bsi.modes[3];
 
-  return (int)(bsi.segment_rd);
+  return bsi.segment_rd;
 }
 
 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2585,6 +2625,7 @@
   int best_needs_copy = 0;
   uint8_t *orig_dst[MAX_MB_PLANE];
   int orig_dst_stride[MAX_MB_PLANE];
+  int rs = 0;
 
   switch (this_mode) {
     int rate_mv;
@@ -2658,6 +2699,14 @@
   *rate2 += cost_mv_ref(cpi, this_mode,
                         mbmi->mb_mode_context[mbmi->ref_frame[0]]);
 
+  if (!(*mode_excluded)) {
+    if (is_comp_pred) {
+      *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
+    } else {
+      *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
+    }
+  }
+
   pred_exists = 0;
   interpolating_intpel_seen = 0;
   // Are all MVs integer pel for Y and UV
@@ -2668,6 +2717,7 @@
         (mbmi->mv[1].as_mv.col & 15) == 0;
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
+  *best_filter = EIGHTTAP;
   if (cpi->sf.use_8tap_always) {
     *best_filter = EIGHTTAP;
     vp9_zero(cpi->rd_filter_cache);
@@ -2678,7 +2728,7 @@
 
     cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
     for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-      int rs, j;
+      int j;
       int64_t rs_rd;
       const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
       const int is_intpel_interp = intpel_mv &&
@@ -2730,6 +2780,15 @@
           tmp_dist_sum = dist_sum;
         }
       }
+      if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+        if (rd / 2 > ref_best_rd) {
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = orig_dst[i];
+            xd->plane[i].dst.stride = orig_dst_stride[i];
+          }
+          return INT64_MAX;
+        }
+      }
       newbest = i == 0 || rd < best_rd;
 
       if (newbest) {
@@ -2753,11 +2812,11 @@
       xd->plane[i].dst.stride = orig_dst_stride[i];
     }
   }
-
   // Set the appripriate filter
   mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
       cm->mcomp_filter_type : *best_filter;
   vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+  rs = (cm->mcomp_filter_type == SWITCHABLE ? get_switchable_rate(cm, x) : 0);
 
   if (pred_exists) {
     if (best_needs_copy) {
@@ -2773,6 +2832,23 @@
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   }
 
+
+  if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+    int tmp_rate;
+    int64_t tmp_dist;
+    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
+    rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+    // if current pred_error modeled rd is substantially more than the best
+    // so far, do not bother doing full rd
+    if (rd / 2 > ref_best_rd) {
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = orig_dst[i];
+        xd->plane[i].dst.stride = orig_dst_stride[i];
+      }
+      return INT64_MAX;
+    }
+  }
+
   if (cpi->common.mcomp_filter_type == SWITCHABLE)
     *rate2 += get_switchable_rate(cm, x);
 
@@ -2817,7 +2893,7 @@
           *distortion = sse + sse2;
           *rate2 = 500;
 
-          // for best_yrd calculation
+          // for best yrd calculation
           *rate_uv = 0;
           *distortion_uv = sse2;
 
@@ -2858,14 +2934,6 @@
     *skippable = skippable_y && skippable_uv;
   }
 
-  if (!(*mode_excluded)) {
-    if (is_comp_pred) {
-      *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
-    } else {
-      *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
-    }
-  }
-
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].dst.buf = orig_dst[i];
     xd->plane[i].dst.stride = orig_dst_stride[i];
@@ -2962,6 +3030,7 @@
                      cpi->gld_fb_idx,
                      cpi->alt_fb_idx};
   int64_t best_rd = INT64_MAX;
+  int64_t best_yrd = INT64_MAX;
   int64_t best_txfm_rd[NB_TXFM_MODES];
   int64_t best_txfm_diff[NB_TXFM_MODES];
   int64_t best_pred_diff[NB_PREDICTION_TYPES];
@@ -3357,16 +3426,20 @@
         int newbest, rs;
         int64_t rs_rd;
         mbmi->interp_filter =
-        vp9_switchable_interp[switchable_filter_index];
+            vp9_switchable_interp[switchable_filter_index];
         vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
                      &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
-                     second_ref, INT64_MAX,
+                     second_ref,
+                     best_yrd,
                      &rate, &rate_y, &distortion,
                      &skippable,
                      (int)this_rd_thresh, seg_mvs,
                      mi_row, mi_col);
+        if (tmp_rd == INT64_MAX) {
+          continue;
+        }
         cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
         rs = get_switchable_rate(cm, x);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
@@ -3374,6 +3447,7 @@
             MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], tmp_rd + rs_rd);
         if (cm->mcomp_filter_type == SWITCHABLE)
           tmp_rd += rs_rd;
+
         newbest = (tmp_rd < tmp_best_rd);
         if (newbest) {
           tmp_best_filter = mbmi->interp_filter;
@@ -3392,8 +3466,21 @@
               for (i = 0; i < 4; i++)
                 tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
               pred_exists = 1;
+              if (switchable_filter_index == 0 &&
+                  cpi->sf.use_rd_breakout &&
+                  best_rd < INT64_MAX) {
+                if (tmp_best_rdu / 2 > best_rd) {
+                  // skip searching the other filters if the first is
+                  // already substantially larger than the best so far
+                  tmp_best_filter = mbmi->interp_filter;
+                  tmp_best_rdu = INT64_MAX;
+                  break;
+                }
+              }
             }
       }  // switchable_filter_index loop
+      if (tmp_best_rdu == INT64_MAX)
+        continue;
 
       mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
                              tmp_best_filter : cm->mcomp_filter_type);
@@ -3403,11 +3490,14 @@
         // switchable list (bilinear, 6-tap) is indicated at the frame level
         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
                      &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
-                     second_ref, INT64_MAX,
+                     second_ref,
+                     best_yrd,
                      &rate, &rate_y, &distortion,
                      &skippable,
                      (int)this_rd_thresh, seg_mvs,
                      mi_row, mi_col);
+        if (tmp_rd == INT64_MAX)
+          continue;
       } else {
         if (cpi->common.mcomp_filter_type == SWITCHABLE) {
           int rs = get_switchable_rate(cm, x);
@@ -3430,29 +3520,32 @@
       if (cpi->common.mcomp_filter_type == SWITCHABLE)
         rate2 += get_switchable_rate(cm, x);
 
-      // If even the 'Y' rd value of split is higher than best so far
-      // then dont bother looking at UV
-      vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
-                                      BLOCK_SIZE_SB8X8);
-      vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
-      super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                                &uv_skippable, NULL, BLOCK_SIZE_SB8X8, TX_4X4);
-      rate2 += rate_uv;
-      distortion2 += distortion_uv;
-      skippable = skippable && uv_skippable;
-
-      txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-      for (i = 0; i < NB_TXFM_MODES; ++i)
-        txfm_cache[i] = txfm_cache[ONLY_4X4];
-
       if (!mode_excluded) {
         if (is_comp_pred)
           mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
         else
           mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
       }
-
       compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred);
+
+      if (RDCOST(x->rdmult, x->rddiv, rate2, distortion2) <
+          best_rd) {
+        // Only look at UV while the 'Y' rd value of split is still below the
+        // best so far; otherwise don't bother.
+        vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
+                                        BLOCK_SIZE_SB8X8);
+        vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
+        super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
+                                  &uv_skippable, NULL,
+                                  BLOCK_SIZE_SB8X8, TX_4X4);
+        rate2 += rate_uv;
+        distortion2 += distortion_uv;
+        skippable = skippable && uv_skippable;
+
+        txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+        for (i = 0; i < NB_TXFM_MODES; ++i)
+          txfm_cache[i] = txfm_cache[ONLY_4X4];
+      }
     } else {
       compmode_cost = vp9_cost_bit(comp_mode_p,
                                    mbmi->ref_frame[1] > INTRA_FRAME);
@@ -3494,7 +3587,7 @@
       if (skippable && bsize >= BLOCK_SIZE_SB8X8) {
         // Back out the coefficient coding costs
         rate2 -= (rate_y + rate_uv);
-        // for best_yrd calculation
+        // for best yrd calculation
         rate_uv = 0;
 
         if (mb_skip_allowed) {
@@ -3592,6 +3685,8 @@
         *returnrate = rate2;
         *returndistortion = distortion2;
         best_rd = this_rd;
+        best_yrd = best_rd -
+                   RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
         best_partition = *x->partition_info;