Merge "Add vp10_inv_txfm2d" into nextgenv2
diff --git a/configure b/configure
index 315c427..27d618b 100755
--- a/configure
+++ b/configure
@@ -267,6 +267,7 @@
     fp_mb_stats
     emulate_hardware
     var_tx
+    ref_mv
     ext_tx
     misc_fixes
     ext_intra
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 5d59e83..d44a64a 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -191,9 +191,14 @@
 INTRA_PRED_TEST(SSE, TestIntraPred4, vpx_dc_predictor_4x4_sse,
                 vpx_dc_left_predictor_4x4_sse, vpx_dc_top_predictor_4x4_sse,
                 vpx_dc_128_predictor_4x4_sse, vpx_v_predictor_4x4_sse, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse)
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE && CONFIG_USE_X86INC
 
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE2, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse2)
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
                 vpx_h_predictor_4x4_ssse3, vpx_d45_predictor_4x4_ssse3, NULL,
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 3b94cc5..bb94397 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -274,6 +274,8 @@
 };
 
 #if CONFIG_EXT_TX
+#define ALLOW_INTRA_EXT_TX 1
+
 static const int num_ext_tx_set_inter[EXT_TX_SETS_INTER] = {
   1, 17, 10, 2
 };
@@ -368,7 +370,8 @@
       return DCT_DCT;
 
 #if CONFIG_EXT_TX
-    if (mbmi->sb_type >= BLOCK_8X8 && plane_type == PLANE_TYPE_Y)
+    if (mbmi->sb_type >= BLOCK_8X8 && plane_type == PLANE_TYPE_Y &&
+        ALLOW_INTRA_EXT_TX)
       return mbmi->tx_type;
 #endif  // CONFIG_EXT_TX
 
@@ -405,8 +408,10 @@
 #endif
     return DCT_DCT;
   if (mbmi->sb_type >= BLOCK_8X8) {
-    if (plane_type == PLANE_TYPE_Y)
+    if (plane_type == PLANE_TYPE_Y) {
+      if (is_inter_block(mbmi) || ALLOW_INTRA_EXT_TX)
       return mbmi->tx_type;
+    }
     if (is_inter_block(mbmi))
       // UV Inter only
       return (mbmi->tx_type == IDTX && tx_size == TX_32X32 ?
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index 34d2c73..d66ddd9 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -422,7 +422,8 @@
 #if CONFIG_EXT_TX
     if (get_ext_tx_types(mbmi->tx_size, mbmi->sb_type, 0) > 1 &&
         cm->base_qindex > 0 && !mbmi->skip &&
-        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
+        ALLOW_INTRA_EXT_TX) {
       FRAME_COUNTS *counts = xd->counts;
       int eset = get_ext_tx_set(mbmi->tx_size, mbmi->sb_type, 0);
       if (eset > 0) {
@@ -887,7 +888,7 @@
           if (counts)
             ++counts->inter_ext_tx[eset][mbmi->tx_size][mbmi->tx_type];
         }
-      } else {
+      } else if (ALLOW_INTRA_EXT_TX) {
         if (eset > 0) {
           mbmi->tx_type = vpx_read_tree(r, vp10_ext_tx_intra_tree[eset],
                                         cm->fc->intra_ext_tx_prob[eset]
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 59edec2..a80e5bd 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -693,7 +693,7 @@
         vp10_write_token(w, vp10_ext_tx_inter_tree[eset],
                          cm->fc->inter_ext_tx_prob[eset][mbmi->tx_size],
                          &ext_tx_inter_encodings[eset][mbmi->tx_type]);
-    } else {
+    } else if (ALLOW_INTRA_EXT_TX) {
       if (eset > 0)
         vp10_write_token(
             w, vp10_ext_tx_intra_tree[eset],
@@ -790,7 +790,8 @@
 #if CONFIG_EXT_TX
   if (get_ext_tx_types(mbmi->tx_size, bsize, 0) > 1 &&
       cm->base_qindex > 0 && !mbmi->skip &&
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
+      ALLOW_INTRA_EXT_TX) {
     int eset = get_ext_tx_set(mbmi->tx_size, bsize, 0);
     if (eset > 0)
       vp10_write_token(
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index e0912b5..7562b60 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -691,6 +691,10 @@
         if (!ext_tx_used_inter[ext_tx_set][tx_type])
           continue;
       } else {
+        if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+          if (tx_type != intra_mode_to_tx_type_lookup[mbmi->mode])
+            continue;
+        }
         if (!ext_tx_used_intra[ext_tx_set][tx_type])
           continue;
       }
@@ -719,7 +723,7 @@
             r += cpi->inter_tx_type_costs[ext_tx_set]
                                          [mbmi->tx_size][mbmi->tx_type];
         } else {
-          if (ext_tx_set > 0)
+          if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
             r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
                                          [mbmi->mode][mbmi->tx_type];
         }
@@ -754,12 +758,16 @@
   if (get_ext_tx_types(mbmi->tx_size, bs, is_inter) > 1 &&
       !xd->lossless[mbmi->segment_id] && *rate != INT_MAX) {
     int ext_tx_set = get_ext_tx_set(mbmi->tx_size, bs, is_inter);
-    if (is_inter)
-      *rate += cpi->inter_tx_type_costs[ext_tx_set][mbmi->tx_size]
-                                       [mbmi->tx_type];
-    else
-      *rate += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
-                                       [mbmi->mode][mbmi->tx_type];
+    if (is_inter) {
+      if (ext_tx_set > 0)
+        *rate += cpi->inter_tx_type_costs[ext_tx_set][mbmi->tx_size]
+                                                      [mbmi->tx_type];
+    } else {
+      if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+        *rate +=
+            cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
+                                                 [mbmi->mode][mbmi->tx_type];
+    }
   }
 #endif  // CONFIG_EXT_TX
 }
@@ -849,6 +857,10 @@
         if (!ext_tx_used_inter[ext_tx_set][tx_type])
           continue;
       } else {
+        if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+          if (tx_type != intra_mode_to_tx_type_lookup[mbmi->mode])
+            continue;
+        }
         if (!ext_tx_used_intra[ext_tx_set][tx_type])
           continue;
       }
@@ -874,7 +886,7 @@
             r += cpi->inter_tx_type_costs[ext_tx_set]
                                          [mbmi->tx_size][mbmi->tx_type];
         } else {
-          if (ext_tx_set > 0)
+          if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
             r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
                                          [mbmi->mode][mbmi->tx_type];
         }
@@ -2227,6 +2239,10 @@
       if (!ext_tx_used_inter[ext_tx_set][tx_type])
         continue;
     } else {
+      if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
+        if (tx_type != intra_mode_to_tx_type_lookup[mbmi->mode])
+          continue;
+      }
       if (!ext_tx_used_intra[ext_tx_set][tx_type])
         continue;
     }
@@ -2251,7 +2267,7 @@
           this_rate += cpi->inter_tx_type_costs[ext_tx_set]
                                        [max_tx_size][mbmi->tx_type];
       } else {
-        if (ext_tx_set > 0)
+        if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
           this_rate += cpi->intra_tx_type_costs[ext_tx_set][max_tx_size]
                                        [mbmi->mode][mbmi->tx_type];
       }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index f41ee09..890b638 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -314,9 +314,6 @@
 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
 specialize qw/vp9_diamond_search_sad avx/;
 
-add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_full_range_search/;
-
 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
 specialize qw/vp9_temporal_filter_apply sse2 msa/;
 
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index fc34786..ca5d746 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -71,6 +71,8 @@
   int rddiv;
   int rdmult;
   int mb_energy;
+  int * m_search_count_ptr;
+  int * ex_search_count_ptr;
 
   // These are set to their default values at the beginning, and then adjusted
   // further in the encoding process.
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 0475883..f9c28f6 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3839,6 +3839,10 @@
   TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
   int mi_row;
 
+  // Set up pointers to per thread motion search counters.
+  td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
+  td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
     if (cpi->sf.use_nonrd_pick_mode)
@@ -3895,6 +3899,9 @@
   vp9_zero(rdc->coef_counts);
   vp9_zero(rdc->comp_pred_diff);
   vp9_zero(rdc->filter_diff);
+  rdc->m_search_count = 0;   // Count of motion search hits.
+  rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
+
 
   xd->lossless = cm->base_qindex == 0 &&
                  cm->y_dc_delta_q == 0 &&
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index eebd7c5..a57cf87 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2995,7 +2995,7 @@
   recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
+    fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
        "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
        "%10"PRId64" %10"PRId64" %10d "
        "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
@@ -3004,6 +3004,8 @@
         "%10lf %8u %10"PRId64" %10d %10d %10d\n",
         cpi->common.current_video_frame,
         cm->width, cm->height,
+        cpi->td.rd_counts.m_search_count,
+        cpi->td.rd_counts.ex_search_count,
         cpi->rc.source_alt_ref_pending,
         cpi->rc.source_alt_ref_active,
         cpi->rc.this_frame_target,
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 975d9f4..f6d8931 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -260,6 +260,8 @@
   vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
   int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int m_search_count;
+  int ex_search_count;
 } RD_COUNTS;
 
 typedef struct ThreadData {
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index ad25712..1d1926c 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -30,6 +30,10 @@
             for (n = 0; n < ENTROPY_TOKENS; n++)
               td->rd_counts.coef_counts[i][j][k][l][m][n] +=
                   td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+
+  // Counts of all motion searches and exhaustive mesh searches.
+  td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
+  td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
 }
 
 static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index b9a104a..327ac19 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1517,69 +1517,83 @@
 
 #undef CHECK_BETTER
 
-int vp9_full_range_search_c(const MACROBLOCK *x,
-                            const search_site_config *cfg,
-                            MV *ref_mv, MV *best_mv,
-                            int search_param, int sad_per_bit, int *num00,
-                            const vp9_variance_fn_ptr_t *fn_ptr,
-                            const MV *center_mv) {
+// Exhaustive motion search around a given centre position with a given
+// step size.
+static int exhuastive_mesh_search(const MACROBLOCK *x,
+                                  MV *ref_mv, MV *best_mv,
+                                  int range, int step, int sad_per_bit,
+                                  const vp9_variance_fn_ptr_t *fn_ptr,
+                                  const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int range = 64;
-  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  MV fcenter_mv = {center_mv->row, center_mv->col};
   unsigned int best_sad = INT_MAX;
   int r, c, i;
   int start_col, end_col, start_row, end_row;
+  int col_step = (step > 1) ? step : 4;
 
-  // The cfg and search_param parameters are not used in this search variant
-  (void)cfg;
-  (void)search_param;
+  assert(step >= 1);
 
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  *best_mv = *ref_mv;
-  *num00 = 11;
+  clamp_mv(&fcenter_mv, x->mv_col_min, x->mv_col_max,
+           x->mv_row_min, x->mv_row_max);
+  *best_mv = fcenter_mv;
   best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, ref_mv), in_what->stride) +
-                 mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  start_row = VPXMAX(-range, x->mv_row_min - ref_mv->row);
-  start_col = VPXMAX(-range, x->mv_col_min - ref_mv->col);
-  end_row = VPXMIN(range, x->mv_row_max - ref_mv->row);
-  end_col = VPXMIN(range, x->mv_col_max - ref_mv->col);
+             get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+             mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+  start_row = VPXMAX(-range, x->mv_row_min - fcenter_mv.row);
+  start_col = VPXMAX(-range, x->mv_col_min - fcenter_mv.col);
+  end_row = VPXMIN(range, x->mv_row_max - fcenter_mv.row);
+  end_col = VPXMIN(range, x->mv_col_max - fcenter_mv.col);
 
-  for (r = start_row; r <= end_row; ++r) {
-    for (c = start_col; c <= end_col; c += 4) {
-      if (c + 3 <= end_col) {
-        unsigned int sads[4];
-        const uint8_t *addrs[4];
-        for (i = 0; i < 4; ++i) {
-          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-          addrs[i] = get_buf_from_mv(in_what, &mv);
-        }
-
-        fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
-
-        for (i = 0; i < 4; ++i) {
-          if (sads[i] < best_sad) {
-            const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-            const unsigned int sad = sads[i] +
-                mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += col_step) {
+      // Step > 1 means we are not checking every location in this pass.
+      if (step > 1) {
+        const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c};
+        unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                           get_buf_from_mv(in_what, &mv), in_what->stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            *best_mv = mv;
           }
         }
       } else {
-        for (i = 0; i < end_col - c; ++i) {
-          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-          unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-              get_buf_from_mv(in_what, &mv), in_what->stride);
-          if (sad < best_sad) {
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            addrs[i] = get_buf_from_mv(in_what, &mv);
+          }
+          fn_ptr->sdx4df(what->buf, what->stride, addrs,
+                         in_what->stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+              const unsigned int sad = sads[i] +
+                  mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
+            }
+          }
+        } else {
+          for (i = 0; i < end_col - c; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                get_buf_from_mv(in_what, &mv), in_what->stride);
             if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
+              sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
             }
           }
         }
@@ -2011,6 +2025,70 @@
   return bestsme;
 }
 
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+// Runs a limited range exhaustive mesh search using a pattern set
+// according to the encode speed profile.
+static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
+                                 MV *centre_mv_full, int sadpb,  int *cost_list,
+                                 const vp9_variance_fn_ptr_t *fn_ptr,
+                                 const MV *ref_mv, MV *dst_mv) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MV temp_mv = {centre_mv_full->row, centre_mv_full->col};
+  MV f_ref_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int bestsme;
+  int i;
+  int interval = sf->mesh_patterns[0].interval;
+  int range = sf->mesh_patterns[0].range;
+  int baseline_interval_divisor;
+
+  // Keep track of number of exhaustive calls (this frame in this thread).
+  ++(*x->ex_search_count_ptr);
+
+  // Trap illegal values for interval and range for this function.
+  if ((range < MIN_RANGE) || (range > MAX_RANGE) ||
+      (interval < MIN_INTERVAL) || (interval > range))
+    return INT_MAX;
+
+  baseline_interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+  range = VPXMIN(range, MAX_RANGE);
+  interval = VPXMAX(interval, range / baseline_interval_divisor);
+
+  // initial search
+  bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range,
+                                  interval, sadpb, fn_ptr, &temp_mv);
+
+  if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv,
+                                       sf->mesh_patterns[i].range,
+                                       sf->mesh_patterns[i].interval,
+                                       sadpb, fn_ptr, &temp_mv);
+
+      if (sf->mesh_patterns[i].interval == 1)
+        break;
+    }
+  }
+
+  if (bestsme < INT_MAX)
+    bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+  *dst_mv = temp_mv;
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
+  return bestsme;
+}
+
 int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
@@ -2324,6 +2402,18 @@
   return best_sad;
 }
 
+#define MIN_EX_SEARCH_LIMIT 128
+static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT,
+      (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+  return sf->allow_exhaustive_searches &&
+      (sf->exhaustive_searches_thresh < INT_MAX) &&
+      (*x->ex_search_count_ptr <= max_ex) &&
+      !cpi->rc.is_src_frame_alt_ref;
+}
+
 int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
@@ -2342,6 +2432,9 @@
     cost_list[4] = INT_MAX;
   }
 
+  // Keep track of number of searches (this frame in this thread).
+  ++(*x->m_search_count_ptr);
+
   switch (method) {
     case FAST_DIAMOND:
       var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
@@ -2367,6 +2460,27 @@
       var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
                                MAX_MVSEARCH_STEPS - 1 - step_param,
                                1, cost_list, fn_ptr, ref_mv, tmp_mv);
+
+      // Should we allow a follow on exhaustive search?
+      if (is_exhaustive_allowed(cpi, x)) {
+        int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
+        exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] +
+                                b_height_log2_lookup[bsize]);
+
+        // Threshold variance for an exhaustive full search.
+        if (var > exhuastive_thr) {
+            int var_ex;
+          MV tmp_mv_ex;
+          var_ex = full_pixel_exhaustive(cpi, x, tmp_mv,
+                                         error_per_bit, cost_list, fn_ptr,
+                                         ref_mv, &tmp_mv_ex);
+
+          if (var_ex < var) {
+            var = var_ex;
+            *tmp_mv = tmp_mv_ex;
+          }
+        }
+      }
       break;
     default:
       assert(0 && "Invalid search method.");
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index d78137a..e32130e 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1750,8 +1750,9 @@
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   ENTROPY_CONTEXT t_above[2], t_left[2];
   int subpelmv = 1, have_ref = 0;
+  SPEED_FEATURES *const sf = &cpi->sf;
   const int has_second_rf = has_second_ref(mbmi);
-  const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
+  const int inter_mode_mask = sf->inter_mode_mask[bsize];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
 
   vp9_zero(*bsi);
@@ -1820,7 +1821,7 @@
             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
           MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
           int step_param = 0;
-          int thissme, bestsme = INT_MAX;
+          int bestsme = INT_MAX;
           int sadpb = x->sadperbit4;
           MV mvp_full;
           int max_mv;
@@ -1845,7 +1846,7 @@
             max_mv =
                 VPXMAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
 
-          if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+          if (sf->mv.auto_mv_step_size && cm->show_frame) {
             // Take wtd average of the step_params based on the last frame's
             // max mv magnitude and the best ref mvs of the current block for
             // the given reference.
@@ -1858,7 +1859,7 @@
           mvp_full.row = bsi->mvp.as_mv.row >> 3;
           mvp_full.col = bsi->mvp.as_mv.col >> 3;
 
-          if (cpi->sf.adaptive_motion_search) {
+          if (sf->adaptive_motion_search) {
             mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
             mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
             step_param = VPXMAX(step_param, 8);
@@ -1871,31 +1872,10 @@
 
           bestsme = vp9_full_pixel_search(
               cpi, x, bsize, &mvp_full, step_param, sadpb,
-              cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
+              sf->mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
               &bsi->ref_mv[0]->as_mv, new_mv,
               INT_MAX, 1);
 
-          // Should we do a full search (best quality only)
-          if (cpi->oxcf.mode == BEST) {
-            int_mv *const best_mv = &mi->bmi[i].as_mv[0];
-            /* Check if mvp_full is within the range. */
-            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                     x->mv_row_min, x->mv_row_max);
-            thissme = cpi->full_search_sad(x, &mvp_full,
-                                           sadpb, 16, &cpi->fn_ptr[bsize],
-                                           &bsi->ref_mv[0]->as_mv,
-                                           &best_mv->as_mv);
-            cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
-            if (thissme < bestsme) {
-              bestsme = thissme;
-              *new_mv = best_mv->as_mv;
-            } else {
-              // The full search result is actually worse so re-instate the
-              // previous best vector
-              best_mv->as_mv = *new_mv;
-            }
-          }
-
           if (bestsme < INT_MAX) {
             int distortion;
             cpi->find_fractional_mv_step(
@@ -1904,8 +1884,8 @@
                 &bsi->ref_mv[0]->as_mv,
                 cm->allow_high_precision_mv,
                 x->errorperbit, &cpi->fn_ptr[bsize],
-                cpi->sf.mv.subpel_force_stop,
-                cpi->sf.mv.subpel_iters_per_step,
+                sf->mv.subpel_force_stop,
+                sf->mv.subpel_iters_per_step,
                 cond_cost_list(cpi, cost_list),
                 x->nmvjointcost, x->mvcost,
                 &distortion,
@@ -1916,7 +1896,7 @@
             seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
           }
 
-          if (cpi->sf.adaptive_motion_search)
+          if (sf->adaptive_motion_search)
             x->pred_mv[mbmi->ref_frame[0]] = *new_mv;
 
           // restore src pointers
@@ -1933,7 +1913,7 @@
             mbmi->interp_filter == EIGHTTAP) {
           // adjust src pointers
           mi_buf_shift(x, i);
-          if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+          if (sf->comp_inter_joint_search_thresh <= bsize) {
             int rate_mv;
             joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
                                 mi_row, mi_col, seg_mvs[i],
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index a539629..318d810 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -15,6 +15,22 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
+// Mesh search patterns for various speed settings
+static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] =
+    {{64, 4}, {28, 2}, {15, 1}, {7, 1}};
+
+#define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
+static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1]
+                                              [MAX_MESH_STEP] =
+    {{{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8},  {14, 2}, {7, 1},  {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+    };
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] =
+    {50, 25, 15, 5, 1, 1};
 
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
@@ -259,6 +275,8 @@
   sf->static_segmentation = 0;
   sf->adaptive_rd_thresh = 1;
   sf->use_fast_coef_costing = 1;
+  sf->allow_exhaustive_searches = 0;
+  sf->exhaustive_searches_thresh = INT_MAX;
 
   if (speed >= 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
@@ -460,7 +478,6 @@
   sf->mv.auto_mv_step_size = 0;
   sf->mv.fullpel_search_step_param = 6;
   sf->comp_inter_joint_search_thresh = BLOCK_4X4;
-  sf->adaptive_rd_thresh = 0;
   sf->tx_size_search_method = USE_FULL_RD;
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
@@ -516,19 +533,50 @@
   // Recode loop tolerance %.
   sf->recode_tolerance = 25;
   sf->default_interp_filter = SWITCHABLE;
-  sf->tx_size_search_breakout = 0;
-  sf->partition_search_breakout_dist_thr = 0;
-  sf->partition_search_breakout_rate_thr = 0;
   sf->simple_model_rd_from_var = 0;
 
+  // Enable some speed-ups even for best quality; quality impact is minimal.
+  sf->adaptive_rd_thresh = 1;
+  sf->tx_size_search_breakout = 1;
+  sf->partition_search_breakout_dist_thr = (1 << 19);
+  sf->partition_search_breakout_rate_thr = 80;
+
   if (oxcf->mode == REALTIME)
     set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
   else if (oxcf->mode == GOOD)
     set_good_speed_feature(cpi, cm, sf, oxcf->speed);
 
   cpi->full_search_sad = vp9_full_search_sad;
-  cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search
-                                               : vp9_diamond_search_sad;
+  cpi->diamond_search_sad = vp9_diamond_search_sad;
+
+  sf->allow_exhaustive_searches = 1;
+  if (oxcf->mode == BEST) {
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 20);
+    else
+      sf->exhaustive_searches_thresh = (1 << 21);
+    sf->max_exaustive_pct = 100;
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
+      sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
+    }
+  } else {
+    int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 22);
+    else
+      sf->exhaustive_searches_thresh = (1 << 23);
+    sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+    if (speed > 0)
+      sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range =
+          good_quality_mesh_patterns[speed][i].range;
+      sf->mesh_patterns[i].interval =
+          good_quality_mesh_patterns[speed][i].interval;
+    }
+  }
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 575e98c..ad7b64a 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -195,6 +195,13 @@
   int fullpel_search_step_param;
 } MV_SPEED_FEATURES;
 
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+  int range;
+  int interval;
+} MESH_PATTERN;
+
 typedef struct SPEED_FEATURES {
   MV_SPEED_FEATURES mv;
 
@@ -299,6 +306,18 @@
   // point for this motion search and limits the search range around it.
   int adaptive_motion_search;
 
+  // Flag for allowing some use of exhaustive searches.
+  int allow_exhaustive_searches;
+
+  // Threshold for allowing exhaustive motion search.
+  int exhaustive_searches_thresh;
+
+  // Maximum exhaustive searches for a frame, as a % of all motion searches.
+  int max_exaustive_pct;
+
+  // Pattern to be used for any exhaustive mesh searches.
+  MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
   int schedule_mode_search;
 
   // Allows sub 8x8 modes to use the prediction filter that was determined
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 1dfc45c..13da155 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -278,7 +278,8 @@
   cpi->alt_ref_source = lc->alt_ref_source;
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
-  if (cpi->svc.number_temporal_layers > 1) {
+  if (cpi->svc.number_temporal_layers > 1 ||
+      cpi->svc.number_spatial_layers > 1) {
     cpi->rc.frames_since_key = old_frame_since_key;
     cpi->rc.frames_to_key = old_frame_to_key;
   }
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index b369b05..03dcd4e 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -97,7 +97,7 @@
 specialize qw/vpx_ve_predictor_4x4/;
 
 add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
+specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index 22b5731..04b39a5 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -515,35 +515,34 @@
   jnz .loop
   REP_RET
 
-INIT_MMX sse
-cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
+INIT_XMM sse2
+cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
   pxor                  m1, m1
-  movd                  m2, [aboveq-1]
-  movd                  m0, [aboveq]
-  punpcklbw             m2, m1
+  movq                  m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
   punpcklbw             m0, m1
-  pshufw                m2, m2, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -2
-  add                leftq, 4
-  psubw                 m0, m2
-.loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m3, [leftq+lineq*2+1]
+  pshuflw               m2, m0, 0x0   ; [63:0] tl tl tl tl [word]
+  psrldq                m0, 2
+  psubw                 m0, m2        ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
+  movd                  m2, [leftq]
   punpcklbw             m2, m1
-  punpcklbw             m3, m1
-  pshufw                m2, m2, 0x0
-  pshufw                m3, m3, 0x0
-  paddw                 m2, m0
+  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
+  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
+  paddw                 m4, m0
   paddw                 m3, m0
-  packuswb              m2, m2
+  packuswb              m4, m4
   packuswb              m3, m3
-  movd      [dstq        ], m2
+  movd      [dstq        ], m4
   movd      [dstq+strideq], m3
   lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
+  pshuflw               m4, m2, 0xaa
+  pshuflw               m3, m2, 0xff
+  paddw                 m4, m0
+  paddw                 m3, m0
+  packuswb              m4, m4
+  packuswb              m3, m3
+  movd      [dstq        ], m4
+  movd      [dstq+strideq], m3
+  RET
 
 INIT_XMM sse2
 cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left