Merge "Removing experimental code from vp9_entropymv.c."
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 3320a16..3b72129 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -551,7 +551,7 @@
 const ConvolveFunctions convolve8_neon(
     vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
     vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
-    vp9_convolve8_c, vp9_convolve8_avg_c);
+    vp9_convolve8_neon, vp9_convolve8_avg_neon);
 
 INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_neon),
diff --git a/vp9/common/arm/neon/vp9_convolve_neon.c b/vp9/common/arm/neon/vp9_convolve_neon.c
new file mode 100644
index 0000000..6e37ff6
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+   * maximum buffer size to 64 columns by 64 + 7 rows (+ 1 to make the
+   * height divisible by 4), i.e. 64 * 72. */
+  uint8_t temp[64 * 72];
+
+  // Account for the vertical phase needing 3 lines prior and 4 lines post
+  int intermediate_height = h + 7;
+
+  if (x_step_q4 != 16 || y_step_q4 != 16)
+    return vp9_convolve8_c(src, src_stride,
+                           dst, dst_stride,
+                           filter_x, x_step_q4,
+                           filter_y, y_step_q4,
+                           w, h);
+
+  /* Filter starting 3 lines back. The neon implementation will ignore the
+   * given height and filter a multiple of 4 lines. Since this goes into the
+   * temp buffer, which has plenty of extra room and is subsequently
+   * discarded, this is safe if somewhat less than ideal.
+   */
+  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+                           temp, 64,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, intermediate_height);
+
+  /* Step into the temp buffer 3 lines to get the actual frame data */
+  vp9_convolve8_vert_neon(temp + 64 * 3, 64,
+                          dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4,
+                          w, h);
+}
+
+void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {
+  uint8_t temp[64 * 72];
+  int intermediate_height = h + 7;
+
+  if (x_step_q4 != 16 || y_step_q4 != 16)
+    return vp9_convolve8_avg_c(src, src_stride,
+                               dst, dst_stride,
+                               filter_x, x_step_q4,
+                               filter_y, y_step_q4,
+                               w, h);
+
+  /* This has the same height-rounding caveat as above. In addition, we only
+   * want to average the values after both passes, so only the vertical pass
+   * uses the averaging variant. */
+  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+                           temp, 64,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, intermediate_height);
+  vp9_convolve8_avg_vert_neon(temp + 64 * 3,
+                              64, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+}
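
For reference, a minimal standalone sketch of the two-pass pattern the wrapper
above uses. filter_row()/filter_col() are hypothetical placeholders for the
real NEON kernels; only the buffer sizing (64 x 72), the h + 7 intermediate
height and the 3-row look-back mirror the code above.

/* Standalone sketch of the two-pass 8-tap convolution wrapper above.
 * filter_row()/filter_col() stand in for the real horizontal/vertical
 * kernels; the buffer sizing and offsets are the point here. */
#include <stdint.h>
#include <string.h>

#define TEMP_STRIDE 64  /* w <= 64, so one temp row is 64 bytes wide */

static void filter_row(const uint8_t *src, int src_stride,
                       uint8_t *dst, int dst_stride, int w, int h) {
  int r;
  for (r = 0; r < h; ++r)          /* placeholder: copy instead of filtering */
    memcpy(dst + r * dst_stride, src + r * src_stride, w);
}

static void filter_col(const uint8_t *src, int src_stride,
                       uint8_t *dst, int dst_stride, int w, int h) {
  int r;
  for (r = 0; r < h; ++r)          /* placeholder: copy instead of filtering */
    memcpy(dst + r * dst_stride, src + r * src_stride, w);
}

static void convolve8_two_pass(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride, int w, int h) {
  /* 64 columns by (h + 7 <= 71, rounded up to 72) rows, as above. */
  uint8_t temp[64 * 72];
  const int intermediate_height = h + 7;  /* 3 rows before + 4 rows after */

  /* Horizontal pass starts 3 rows above the block so the vertical pass has
   * its full 8-tap context available in the temp buffer. */
  filter_row(src - 3 * src_stride, src_stride, temp, TEMP_STRIDE,
             w, intermediate_height);

  /* Vertical pass skips the 3 leading context rows to land on frame data. */
  filter_col(temp + 3 * TEMP_STRIDE, TEMP_STRIDE, dst, dst_stride, w, h);
}

int main(void) {
  uint8_t src[80 * 80] = { 0 }, dst[64 * 64];
  /* Point 3 rows into the source so the look-back stays in bounds. */
  convolve8_two_pass(src + 3 * 80, 80, dst, 64, 64, 64);
  return dst[0];
}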
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index f56586a..0795975 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -389,9 +389,7 @@
 
 
 static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
-  const TX_SIZE size = mbmi->txfm_size;
-  const TX_SIZE max_size = max_uv_txsize_lookup[mbmi->sb_type];
-  return (size > max_size ? max_size : size);
+  return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]);
 }
 
 struct plane_block_idx {
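
The get_uv_tx_size() change above swaps an explicit compare-and-select for
MIN(). A throwaway check, using a local MIN macro and small stand-in values
rather than the real vp9 TX_SIZE table, confirms the two forms agree:

/* Exhaustive check that MIN(a, b) matches the removed compare-and-select.
 * The value range is a stand-in; the real arguments are TX_SIZE enums. */
#include <assert.h>

#define MIN(x, y) (((x) < (y)) ? (x) : (y))

int main(void) {
  int size, max_size;
  for (size = 0; size <= 3; ++size)
    for (max_size = 0; max_size <= 3; ++max_size)
      assert(MIN(size, max_size) == (size > max_size ? max_size : size));
  return 0;
}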
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 812b015..c36efbd 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -271,7 +271,7 @@
 specialize vp9_convolve_avg sse2
 
 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 ssse3
+specialize vp9_convolve8 ssse3 neon
 
 prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_horiz ssse3 neon
@@ -280,7 +280,7 @@
 specialize vp9_convolve8_vert ssse3 neon
 
 prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg ssse3
+specialize vp9_convolve8_avg ssse3 neon
 
 prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg_horiz ssse3 neon
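
For readers unfamiliar with the rtcd scheme: adding "neon" to a specialize
line lets the generated dispatcher pick the NEON function, at run time when
built with runtime CPU detection (otherwise the binding is resolved at build
time). A rough sketch of the shape of that dispatch follows; it is simplified,
not the literal generated header, and the stubs, flag value and setup
function name are illustrative.

#include <stdint.h>
#include <stddef.h>

typedef void (*convolve_fn_t)(const uint8_t *, ptrdiff_t, uint8_t *,
                              ptrdiff_t, const int16_t *, int,
                              const int16_t *, int, int, int);

/* Empty stubs so the sketch compiles on its own; the real implementations
 * live elsewhere in vp9. */
static void vp9_convolve8_c(const uint8_t *s, ptrdiff_t ss, uint8_t *d,
                            ptrdiff_t ds, const int16_t *fx, int xs,
                            const int16_t *fy, int ys, int w, int h) {}
static void vp9_convolve8_neon(const uint8_t *s, ptrdiff_t ss, uint8_t *d,
                               ptrdiff_t ds, const int16_t *fx, int xs,
                               const int16_t *fy, int ys, int w, int h) {}

convolve_fn_t vp9_convolve8 = vp9_convolve8_c;

#define HAS_NEON_FLAG 0x1  /* illustrative bit, not the real vpx_ports value */

static void setup_rtcd(int cpu_flags) {
  vp9_convolve8 = vp9_convolve8_c;        /* portable fallback */
  if (cpu_flags & HAS_NEON_FLAG)
    vp9_convolve8 = vp9_convolve8_neon;   /* chosen when NEON is present */
}

int main(void) {
  setup_rtcd(HAS_NEON_FLAG);
  return 0;
}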
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 5fb572e..a193c9f 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -26,16 +26,6 @@
 #include "vp9/decoder/vp9_dsubexp.h"
 #include "vp9/decoder/vp9_treereader.h"
 
-// #define DEBUG_DEC_MV
-#ifdef DEBUG_DEC_MV
-int dec_mvcount = 0;
-#endif
-
-// #define DEC_DEBUG
-#ifdef DEC_DEBUG
-extern int dec_debug;
-#endif
-
 static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
   return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p);
 }
@@ -486,11 +476,6 @@
     ref0 = mbmi->ref_frame[0];
     ref1 = mbmi->ref_frame[1];
 
-#ifdef DEC_DEBUG
-    if (dec_debug)
-      printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row,
-             xd->mode_info_context->mbmi.mv[0].as_mv.col);
-#endif
     vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
                      ref0, mbmi->ref_mvs[ref0], cm->ref_frame_sign_bias);
 
@@ -510,13 +495,6 @@
       best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int;
     }
 
-#ifdef DEC_DEBUG
-    if (dec_debug)
-      printf("[D %d %d] %d %d %d %d\n", ref_frame,
-             mbmi->mb_mode_context[ref_frame],
-             mv_ref_p[0], mv_ref_p[1], mv_ref_p[2], mv_ref_p[3]);
-#endif
-
     mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
                               ? read_switchable_filter_type(pbi, r)
                               : cm->mcomp_filter_type;
@@ -645,6 +623,31 @@
   }
 }
 
+static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
+  int i;
+
+  cm->comp_pred_mode = cm->allow_comp_inter_inter ? read_comp_pred_mode(r)
+                                                  : SINGLE_PREDICTION_ONLY;
+
+  if (cm->comp_pred_mode == HYBRID_PREDICTION)
+    for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
+
+  if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+    for (i = 0; i < REF_CONTEXTS; i++) {
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
+    }
+
+  if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+    for (i = 0; i < REF_CONTEXTS; i++)
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
+}
+
 void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   int k;
@@ -669,31 +672,8 @@
       if (vp9_read(r, VP9_MODE_UPDATE_PROB))
         vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
 
-    if (cm->allow_comp_inter_inter) {
-      cm->comp_pred_mode = read_comp_pred_mode(r);
-      if (cm->comp_pred_mode == HYBRID_PREDICTION)
-        for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-            vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
-    } else {
-      cm->comp_pred_mode = SINGLE_PREDICTION_ONLY;
-    }
+    read_comp_pred(cm, r);
 
-    if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
-      for (i = 0; i < REF_CONTEXTS; i++) {
-        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-          vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
-
-        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-          vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
-      }
-
-    if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
-      for (i = 0; i < REF_CONTEXTS; i++)
-        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-          vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
-
-    // VP9_INTRA_MODES
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
       for (i = 0; i < VP9_INTRA_MODES - 1; ++i)
         if (vp9_read(r, VP9_MODE_UPDATE_PROB))
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index ae9f0aa..b4c06f5 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -143,6 +143,11 @@
   int rd_search;
   int skip_encode;
 
+  // Used to store the sub-partitions' decisions.
+  int fast_ms;
+  int_mv pred_mv;
+  int subblock_ref;
+
   // TODO(jingning): Need to refactor the structure arrays that buffers the
   // coding mode decisions of each partition type.
   PICK_MODE_CONTEXT ab4x4_context[4][4][4];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 48c1b33..3dd235a 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1466,6 +1466,138 @@
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
     }
   }
+
+  x->fast_ms = 0;
+  x->pred_mv.as_int = 0;
+  x->subblock_ref = 0;
+
+  // Use the 4 subblocks' motion estimation results to speed up the current
+  // partition's check.
+  if (cpi->sf.using_small_partition_info) {
+    // Only use the 8x8 result for non-HD videos.
+    // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0;
+    int use_8x8 = 1;
+
+    if (cm->frame_type && !cpi->is_src_frame_alt_ref &&
+        ((use_8x8 && bsize == BLOCK_SIZE_MB16X16) ||
+        bsize == BLOCK_SIZE_SB32X32 || bsize == BLOCK_SIZE_SB64X64)) {
+      int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0;
+
+      if (bsize == BLOCK_SIZE_MB16X16) {
+        ref0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.
+            ref_frame[0];
+        ref1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.
+            ref_frame[0];
+        ref2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.
+            ref_frame[0];
+        ref3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.
+            ref_frame[0];
+      } else if (bsize == BLOCK_SIZE_SB32X32) {
+        ref0 = x->mb_context[xd->sb_index][0].mic.mbmi.ref_frame[0];
+        ref1 = x->mb_context[xd->sb_index][1].mic.mbmi.ref_frame[0];
+        ref2 = x->mb_context[xd->sb_index][2].mic.mbmi.ref_frame[0];
+        ref3 = x->mb_context[xd->sb_index][3].mic.mbmi.ref_frame[0];
+      } else if (bsize == BLOCK_SIZE_SB64X64) {
+        ref0 = x->sb32_context[0].mic.mbmi.ref_frame[0];
+        ref1 = x->sb32_context[1].mic.mbmi.ref_frame[0];
+        ref2 = x->sb32_context[2].mic.mbmi.ref_frame[0];
+        ref3 = x->sb32_context[3].mic.mbmi.ref_frame[0];
+      }
+
+      // Currently, only consider 4 inter ref frames.
+      if (ref0 && ref1 && ref2 && ref3) {
+        int16_t mvr0 = 0, mvc0 = 0, mvr1 = 0, mvc1 = 0, mvr2 = 0, mvc2 = 0,
+            mvr3 = 0, mvc3 = 0;
+        int d01, d23, d02, d13;  // motion vector distance between 2 blocks
+
+        // Get each subblock's motion vectors.
+        if (bsize == BLOCK_SIZE_MB16X16) {
+          mvr0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.mv[0].
+              as_mv.row;
+          mvc0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.mv[0].
+              as_mv.col;
+          mvr1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.mv[0].
+              as_mv.row;
+          mvc1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.mv[0].
+              as_mv.col;
+          mvr2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.mv[0].
+              as_mv.row;
+          mvc2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.mv[0].
+              as_mv.col;
+          mvr3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.mv[0].
+              as_mv.row;
+          mvc3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.mv[0].
+              as_mv.col;
+        } else if (bsize == BLOCK_SIZE_SB32X32) {
+          mvr0 = x->mb_context[xd->sb_index][0].mic.mbmi.mv[0].as_mv.row;
+          mvc0 = x->mb_context[xd->sb_index][0].mic.mbmi.mv[0].as_mv.col;
+          mvr1 = x->mb_context[xd->sb_index][1].mic.mbmi.mv[0].as_mv.row;
+          mvc1 = x->mb_context[xd->sb_index][1].mic.mbmi.mv[0].as_mv.col;
+          mvr2 = x->mb_context[xd->sb_index][2].mic.mbmi.mv[0].as_mv.row;
+          mvc2 = x->mb_context[xd->sb_index][2].mic.mbmi.mv[0].as_mv.col;
+          mvr3 = x->mb_context[xd->sb_index][3].mic.mbmi.mv[0].as_mv.row;
+          mvc3 = x->mb_context[xd->sb_index][3].mic.mbmi.mv[0].as_mv.col;
+        } else if (bsize == BLOCK_SIZE_SB64X64) {
+          mvr0 = x->sb32_context[0].mic.mbmi.mv[0].as_mv.row;
+          mvc0 = x->sb32_context[0].mic.mbmi.mv[0].as_mv.col;
+          mvr1 = x->sb32_context[1].mic.mbmi.mv[0].as_mv.row;
+          mvc1 = x->sb32_context[1].mic.mbmi.mv[0].as_mv.col;
+          mvr2 = x->sb32_context[2].mic.mbmi.mv[0].as_mv.row;
+          mvc2 = x->sb32_context[2].mic.mbmi.mv[0].as_mv.col;
+          mvr3 = x->sb32_context[3].mic.mbmi.mv[0].as_mv.row;
+          mvc3 = x->sb32_context[3].mic.mbmi.mv[0].as_mv.col;
+        }
+
+        // Adjust sign if ref is alt_ref
+        if (cm->ref_frame_sign_bias[ref0]) {
+          mvr0 *= -1;
+          mvc0 *= -1;
+        }
+
+        if (cm->ref_frame_sign_bias[ref1]) {
+          mvr1 *= -1;
+          mvc1 *= -1;
+        }
+
+        if (cm->ref_frame_sign_bias[ref2]) {
+          mvr2 *= -1;
+          mvc2 *= -1;
+        }
+
+        if (cm->ref_frame_sign_bias[ref3]) {
+          mvr3 *= -1;
+          mvc3 *= -1;
+        }
+
+        // Calculate mv distances.
+        d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1));
+        d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3));
+        d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2));
+        d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3));
+
+        if (d01 < 24 && d23 < 24 && d02 < 24 && d13 < 24) {
+          // Set fast motion search level.
+          x->fast_ms = 1;
+
+          // Calculate prediction MV
+          x->pred_mv.as_mv.row = (mvr0 + mvr1 + mvr2 + mvr3) >> 2;
+          x->pred_mv.as_mv.col = (mvc0 + mvc1 + mvc2 + mvc3) >> 2;
+
+          if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 &&
+              d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) {
+            // Set fast motion search level.
+            x->fast_ms = 2;
+
+            if (!d01 && !d23 && !d02 && !d13) {
+              x->fast_ms = 3;
+              x->subblock_ref = ref0;
+            }
+          }
+        }
+      }
+    }
+  }
+
   if (!cpi->sf.use_partitions_less_than
       || (cpi->sf.use_partitions_less_than
           && bsize <= cpi->sf.less_than_block_size)) {
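
A self-contained worked example of the fast_ms decision added above, with
made-up subblock motion vectors. The thresholds (24 and 2) and the level
meanings follow the code; everything else is illustrative, and the example
assumes all four subblocks use the same reference frame.

/* Worked example of the fast motion search level decision. */
#include <stdio.h>
#include <stdlib.h>

#define MAX(x, y) (((x) > (y)) ? (x) : (y))

static int mv_dist(int r0, int c0, int r1, int c1) {
  return MAX(abs(r0 - r1), abs(c0 - c1));
}

int main(void) {
  /* Four subblock MVs (row, col) in 1/8-pel units, same reference frame. */
  const int mvr[4] = { 12, 13, 12, 14 }, mvc[4] = { -8, -8, -9, -8 };
  const int d01 = mv_dist(mvr[0], mvc[0], mvr[1], mvc[1]);
  const int d23 = mv_dist(mvr[2], mvc[2], mvr[3], mvc[3]);
  const int d02 = mv_dist(mvr[0], mvc[0], mvr[2], mvc[2]);
  const int d13 = mv_dist(mvr[1], mvc[1], mvr[3], mvc[3]);
  int fast_ms = 0;

  if (d01 < 24 && d23 < 24 && d02 < 24 && d13 < 24) {
    fast_ms = 1;                       /* subblock MVs roughly agree */
    if (d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) {
      fast_ms = 2;                     /* near-identical MVs, same ref */
      if (!d01 && !d23 && !d02 && !d13)
        fast_ms = 3;                   /* identical MVs: reuse subblock ref */
    }
  }

  /* The prediction MV is the average of the four subblock MVs. */
  printf("fast_ms=%d pred_mv=(%d,%d)\n", fast_ms,
         (mvr[0] + mvr[1] + mvr[2] + mvr[3]) >> 2,
         (mvc[0] + mvc[1] + mvc[2] + mvc[3]) >> 2);
  return 0;
}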
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index efee6e6..500bdfe 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -722,7 +722,8 @@
   sf->last_chroma_intra_mode = TM_PRED;
   sf->use_rd_breakout = 0;
   sf->skip_encode_sb = 0;
-
+  sf->use_uv_intra_rd_estimate = 0;
+  sf->using_small_partition_info = 0;
   // Skip any mode not chosen at size < X for all sizes > X
   // Hence BLOCK_SIZE_SB64X64 (skip is off)
   sf->unused_mode_skip_lvl = BLOCK_SIZE_SB64X64;
@@ -793,6 +794,10 @@
         sf->last_chroma_intra_mode = DC_PRED;
         sf->use_rd_breakout = 1;
         sf->skip_encode_sb = 1;
+        sf->use_uv_intra_rd_estimate = 1;
+        sf->using_small_partition_info = 1;
+        sf->disable_splitmv =
+            (MIN(cpi->common.width, cpi->common.height) >= 720) ? 1 : 0;
       }
       if (speed == 3) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -808,6 +813,7 @@
                                      FLAG_SKIP_COMP_REFMISMATCH;
         sf->use_rd_breakout = 1;
         sf->skip_encode_sb = 1;
+        sf->disable_splitmv = 1;
       }
       if (speed == 4) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -826,6 +832,8 @@
         sf->optimize_coefficients = 0;
         // sf->reduce_first_step_size = 1;
         // sf->reference_masking = 1;
+
+        sf->disable_splitmv = 1;
       }
       /*
       if (speed == 2) {
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 7bc757e..19b1e3a 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -268,6 +268,7 @@
   int adjust_partitioning_from_last_frame;
   int last_partitioning_redo_frequency;
   int disable_splitmv;
+  int using_small_partition_info;
 
   // Implements various heuristics to skip searching modes
   // The heuristics selected are based on  flags
@@ -275,6 +276,7 @@
   unsigned int mode_search_skip_flags;
   MB_PREDICTION_MODE last_chroma_intra_mode;
   int use_rd_breakout;
+  int use_uv_intra_rd_estimate;
 } SPEED_FEATURES;
 
 typedef struct VP9_COMP {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 58eeed2..6df1701 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1524,6 +1524,24 @@
   return best_rd;
 }
 
+static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
+                              int *rate, int *rate_tokenonly,
+                              int64_t *distortion, int *skippable,
+                              BLOCK_SIZE_TYPE bsize) {
+  int64_t this_rd;
+
+  x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+  super_block_uvrd(&cpi->common, x, rate_tokenonly,
+                   distortion, skippable, NULL, bsize);
+  *rate = *rate_tokenonly +
+          x->intra_uv_mode_cost[x->e_mbd.frame_type][DC_PRED];
+  this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+
+  x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+
+  return this_rd;
+}
+
 static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
                        int mode_context) {
   MACROBLOCK *const x = &cpi->mb;
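
rd_sbuv_dcpred() above estimates the UV intra cost from DC_PRED alone and
returns a rate-distortion value via RDCOST(). As a reminder of the trade-off
that value encodes, here is a simplified cost model (distortion plus lambda
times rate); it is not the literal RDCOST macro, which takes x->rdmult and
x->rddiv as its fixed-point scaling parameters.

#include <stdint.h>
#include <stdio.h>

/* cost = distortion + lambda * rate, with lambda in Q8 fixed point so the
 * arithmetic stays integral. */
static int64_t rd_cost(int64_t lambda_q8, int rate, int64_t distortion) {
  return distortion + ((lambda_q8 * (int64_t)rate) >> 8);
}

int main(void) {
  const int64_t lambda_q8 = 40 << 8;   /* hypothetical multiplier */
  /* DC_PRED-only estimate: cheaper rate, somewhat higher distortion. */
  printf("estimate: %lld\n", (long long)rd_cost(lambda_q8, 120, 5000));
  /* Hypothetical full UV mode search result, for comparison. */
  printf("full rd : %lld\n", (long long)rd_cost(lambda_q8, 150, 4300));
  return 0;
}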
@@ -1641,7 +1659,7 @@
                                        int64_t best_yrd,
                                        int i,
                                        int *labelyrate,
-                                       int64_t *distortion,
+                                       int64_t *distortion, int64_t *sse,
                                        ENTROPY_CONTEXT *ta,
                                        ENTROPY_CONTEXT *tl) {
   int k;
@@ -1666,7 +1684,7 @@
   uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
                                                  xd->plane[0].dst.buf,
                                                  xd->plane[0].dst.stride);
-  int64_t thisdistortion = 0;
+  int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0;
 
   vp9_build_inter_predictor(pre,
@@ -1710,6 +1728,7 @@
       thisdistortion += vp9_block_error(coeff,
                                         BLOCK_OFFSET(xd->plane[0].dqcoeff,
                                                      k, 16), 16, &ssz);
+      thissse += ssz;
       thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
                               ta + (k & 1),
                               tl + (k >> 1), TX_4X4, 16);
@@ -1717,6 +1736,7 @@
   }
   *distortion += thisdistortion;
   *labelyrate += thisrate;
+  *sse = thissse >> 2;
 
   *distortion >>= 2;
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
@@ -1729,6 +1749,7 @@
   int64_t segment_rd;
   int r;
   int64_t d;
+  int64_t sse;
   int segment_yrate;
   MB_PREDICTION_MODE modes[4];
   int_mv mvs[4], second_mvs[4];
@@ -1777,7 +1798,7 @@
                                     int_mv seg_mvs[4][MAX_REF_FRAMES],
                                     int mi_row, int mi_col) {
   int i, j, br = 0, rate = 0, sbr = 0, idx, idy;
-  int64_t bd = 0, sbd = 0;
+  int64_t bd = 0, sbd = 0, subblock_sse = 0, block_sse = 0;
   MB_PREDICTION_MODE this_mode;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   const int label_count = 4;
@@ -1830,7 +1851,7 @@
       // search for the best motion vector on this segment
       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
         int64_t this_rd;
-        int64_t distortion;
+        int64_t distortion, sse;
         int labelyrate;
         ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
         const struct buf_2d orig_src = x->plane[0].src;
@@ -1957,14 +1978,15 @@
 
         this_rd = encode_inter_mb_segment(cpi, x,
                                           bsi->segment_rd - this_segment_rd,
-                                          i, &labelyrate,
-                                          &distortion, t_above_s, t_left_s);
+                                          i, &labelyrate, &distortion, &sse,
+                                          t_above_s, t_left_s);
         this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
         rate += labelyrate;
 
         if (this_rd < best_label_rd) {
           sbr = rate;
           sbd = distortion;
+          subblock_sse = sse;
           bestlabelyrate = labelyrate;
           mode_selected = this_mode;
           best_label_rd = this_rd;
@@ -1984,6 +2006,7 @@
 
       br += sbr;
       bd += sbd;
+      block_sse += subblock_sse;
       segmentyrate += bestlabelyrate;
       this_segment_rd += best_label_rd;
 
@@ -2007,6 +2030,7 @@
   bsi->d = bd;
   bsi->segment_yrate = segmentyrate;
   bsi->segment_rd = this_segment_rd;
+  bsi->sse = block_sse;
 
   // store everything needed to come back to this!!
   for (i = 0; i < 4; i++) {
@@ -2025,7 +2049,8 @@
                                            int *returntotrate,
                                            int *returnyrate,
                                            int64_t *returndistortion,
-                                           int *skippable, int mvthresh,
+                                           int *skippable, int64_t *psse,
+                                           int mvthresh,
                                            int_mv seg_mvs[4][MAX_REF_FRAMES],
                                            int mi_row, int mi_col) {
   int i;
@@ -2074,6 +2099,7 @@
   *returndistortion = bsi.d;
   *returnyrate = bsi.segment_yrate;
   *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
+  *psse = bsi.sse;
   mbmi->mode = bsi.modes[3];
 
   return bsi.segment_rd;
@@ -2316,6 +2342,7 @@
                                  int mi_row, int mi_col,
                                  int_mv *tmp_mv, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
+  VP9_COMMON *cm = &cpi->common;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   int bestsme = INT_MAX;
@@ -2346,18 +2373,37 @@
 
   vp9_clamp_mv_min_max(x, &ref_mv);
 
-  // Work out the size of the first step in the mv step search.
-  // 0 here is maximum length first step. 1 is MAX >> 1 etc.
-  if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
-    step_param = vp9_init_search_range(cpi, cpi->max_mv_magnitude);
-  } else {
-    step_param = vp9_init_search_range(
-                   cpi, MIN(cpi->common.width, cpi->common.height));
-  }
+  // Adjust search parameters based on small partitions' result.
+  if (x->fast_ms) {
+    // && abs(mvp_full.as_mv.row - x->pred_mv.as_mv.row) < 24 &&
+    // abs(mvp_full.as_mv.col - x->pred_mv.as_mv.col) < 24) {
+    // adjust search range
+    step_param = 6;
+    if (x->fast_ms > 1)
+      step_param = 8;
 
-  // mvp_full.as_int = ref_mv[0].as_int;
-  mvp_full.as_int =
-      mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int;
+    // Get prediction MV.
+    mvp_full.as_int = x->pred_mv.as_int;
+
+    // Adjust MV sign if needed.
+    if (cm->ref_frame_sign_bias[ref]) {
+      mvp_full.as_mv.col *= -1;
+      mvp_full.as_mv.row *= -1;
+    }
+  } else {
+    // Work out the size of the first step in the mv step search.
+    // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+    if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
+      step_param = vp9_init_search_range(cpi, cpi->max_mv_magnitude);
+    } else {
+      step_param = vp9_init_search_range(
+                     cpi, MIN(cpi->common.width, cpi->common.height));
+    }
+
+    // mvp_full.as_int = ref_mv[0].as_int;
+    mvp_full.as_int =
+        mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int;
+  }
 
   mvp_full.as_mv.col >>= 3;
   mvp_full.as_mv.row >>= 3;
@@ -3093,20 +3139,41 @@
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
-  if (!cpi->sf.use_avoid_tested_higherror
+
+  // If intra is not masked off then get uv intra mode rd.
+  if (x->fast_ms < 2 && (!cpi->sf.use_avoid_tested_higherror
       || (cpi->sf.use_avoid_tested_higherror
-          && (ref_frame_mask & (1 << INTRA_FRAME)))) {
+          && (ref_frame_mask & (1 << INTRA_FRAME))))) {
+    // Note that the TXFM_MODE enumeration "matches" TX_SIZE: e.g.
+    // ONLY_4X4 == TX_4X4, ALLOW_8X8 == TX_8X8, etc., so the MIN
+    // operation below correctly constrains max_uvtxfm_size.
+    TX_SIZE max_uvtxfm_size =
+      MIN(max_uv_txsize_lookup[bsize], (TX_SIZE)cm->txfm_mode);
+    TX_SIZE min_uvtxfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL)
+                              ? max_uvtxfm_size : TX_4X4;
+
     mbmi->mode = DC_PRED;
     mbmi->ref_frame[0] = INTRA_FRAME;
-    for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 :
-                      (bsize < BLOCK_SIZE_SB32X32 ? TX_8X8 :
-                       (bsize < BLOCK_SIZE_SB64X64 ? TX_16X16 : TX_32X32)));
-         i++) {
+
+    // Test all possible UV transform sizes that may be used in the main loop
+    for (i = min_uvtxfm_size; i <= max_uvtxfm_size; ++i) {
       mbmi->txfm_size = i;
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i],
-                              &dist_uv[i], &skip_uv[i],
-                              (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
-                                                           bsize);
+
+      // Use an estimated rd for uv_intra based on DC_PRED if the
+      // appropriate speed flag is set.
+      if (cpi->sf.use_uv_intra_rd_estimate) {
+        rd_sbuv_dcpred(cpi, x, &rate_uv_intra[i],
+                       &rate_uv_tokenonly[i], &dist_uv[i], &skip_uv[i],
+                       (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+                                                    bsize);
+      // Else do a proper rd search for each possible transform size that may
+      // be considered in the main rd loop.
+      } else {
+        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i],
+                                &rate_uv_tokenonly[i], &dist_uv[i], &skip_uv[i],
+                                (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8
+                                                           : bsize);
+      }
       mode_uv[i] = mbmi->uv_mode;
     }
   }
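
The comment above leans on the TXFM_MODE and TX_SIZE enumerations lining up.
Below is a standalone compile-time check of that property; the enum values
are restated locally as assumptions of the sketch (they are not quoted from
this diff), so a mismatch would fail to compile rather than misbehave.

/* Local restatement of the enum alignment the loop bound relies on. */
typedef enum { TX_4X4_ = 0, TX_8X8_ = 1,
               TX_16X16_ = 2, TX_32X32_ = 3 } tx_size_;
typedef enum { ONLY_4X4_ = 0, ALLOW_8X8_ = 1, ALLOW_16X16_ = 2,
               ALLOW_32X32_ = 3, TX_MODE_SELECT_ = 4 } txfm_mode_;

/* C89-friendly compile-time assertion: array size goes negative on failure. */
#define STATIC_CHECK(name, cond) typedef char name[(cond) ? 1 : -1]

STATIC_CHECK(only_4x4_matches,    ONLY_4X4_    == (int)TX_4X4_);
STATIC_CHECK(allow_8x8_matches,   ALLOW_8X8_   == (int)TX_8X8_);
STATIC_CHECK(allow_16x16_matches, ALLOW_16X16_ == (int)TX_16X16_);
STATIC_CHECK(allow_32x32_matches, ALLOW_32X32_ == (int)TX_32X32_);

int main(void) { return 0; }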
@@ -3155,6 +3222,12 @@
 
     x->skip = 0;
 
+    // Skip some ref frame checks based on the small partitions' results.
+    if (x->fast_ms > 1 && !ref_frame)
+      continue;
+    if (x->fast_ms > 2 && ref_frame != x->subblock_ref)
+      continue;
+
     if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) {
       if (!(ref_frame_mask & (1 << ref_frame))) {
         continue;
@@ -3330,13 +3403,7 @@
       if (rate_y == INT_MAX)
         continue;
 
-      uv_tx = mbmi->txfm_size;
-      if (bsize < BLOCK_SIZE_MB16X16 && uv_tx == TX_8X8)
-        uv_tx = TX_4X4;
-      if (bsize < BLOCK_SIZE_SB32X32 && uv_tx == TX_16X16)
-        uv_tx = TX_8X8;
-      else if (bsize < BLOCK_SIZE_SB64X64 && uv_tx == TX_32X32)
-        uv_tx = TX_16X16;
+      uv_tx = MIN(mbmi->txfm_size, max_uv_txsize_lookup[bsize]);
 
       rate_uv = rate_uv_intra[uv_tx];
       distortion_uv = dist_uv[uv_tx];
@@ -3354,7 +3421,7 @@
       int64_t this_rd_thresh;
       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
-      int64_t tmp_best_distortion = INT_MAX;
+      int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
       int tmp_best_skippable = 0;
       int switchable_filter_index;
       int_mv *second_ref = is_comp_pred ?
@@ -3397,7 +3464,7 @@
                      second_ref,
                      best_yrd,
                      &rate, &rate_y, &distortion,
-                     &skippable,
+                     &skippable, &total_sse,
                      (int)this_rd_thresh, seg_mvs,
                      mi_row, mi_col);
         if (tmp_rd == INT64_MAX) {
@@ -3423,6 +3490,7 @@
           tmp_best_rate = rate;
           tmp_best_ratey = rate_y;
           tmp_best_distortion = distortion;
+          tmp_best_sse = total_sse;
           tmp_best_skippable = skippable;
           tmp_best_mbmode = *mbmi;
           tmp_best_partition = *x->partition_info;
@@ -3456,7 +3524,7 @@
                      second_ref,
                      best_yrd,
                      &rate, &rate_y, &distortion,
-                     &skippable,
+                     &skippable, &total_sse,
                      (int)this_rd_thresh, seg_mvs,
                      mi_row, mi_col);
         if (tmp_rd == INT64_MAX)
@@ -3467,6 +3535,7 @@
           tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
         }
         tmp_rd = tmp_best_rdu;
+        total_sse = tmp_best_sse;
         rate = tmp_best_rate;
         rate_y = tmp_best_ratey;
         distortion = tmp_best_distortion;
@@ -3499,11 +3568,12 @@
                                         BLOCK_SIZE_SB8X8);
         vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
         super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                                  &uv_skippable, NULL,
+                                  &uv_skippable, &uv_sse,
                                   BLOCK_SIZE_SB8X8, TX_4X4);
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
+        total_sse += uv_sse;
 
         txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
         for (i = 0; i < NB_TXFM_MODES; ++i)
@@ -3565,7 +3635,7 @@
           }
         }
       } else if (mb_skip_allowed && ref_frame != INTRA_FRAME &&
-                 this_mode != SPLITMV && !xd->lossless) {
+                 !xd->lossless) {
         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
           // Add in the cost of the no skip flag.
@@ -3764,6 +3834,20 @@
       break;
   }
 
+  // If we used an estimate for the uv intra rd in the loop above...
+  if (cpi->sf.use_uv_intra_rd_estimate) {
+    // Do Intra UV best rd mode selection if best mode choice above was intra.
+    if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
+      TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+                              &rate_uv_tokenonly[uv_tx_size],
+                              &dist_uv[uv_tx_size],
+                              &skip_uv[uv_tx_size],
+                              (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8
+                                                         : bsize);
+    }
+  }
+
   // If indicated then mark the index of the chosen mode to be inspected at
   // other block sizes.
   if (bsize <= cpi->sf.unused_mode_skip_lvl) {
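
Taken together, the hunks above implement an estimate-then-refine pattern: a
cheap DC_PRED-only UV intra estimate is used inside the mode loop, and the
full UV mode search is rerun afterwards only when an intra mode wins. A
schematic sketch of that control flow (function names are placeholders; only
the branching mirrors the real code):

typedef struct { int use_uv_intra_rd_estimate; } speed_features;

static void uv_intra_estimate(void)    { /* DC_PRED-only rd (cheap) */ }
static void uv_intra_full_search(void) { /* all UV modes (expensive) */ }
static int  best_mode_is_intra(void)   { return 1; }  /* stand-in result */

static void pick_mode(const speed_features *sf) {
  /* Before the main loop: cheap estimate, or the full search when the
   * speed feature is disabled. */
  if (sf->use_uv_intra_rd_estimate)
    uv_intra_estimate();
  else
    uv_intra_full_search();

  /* ... per-mode rd loop would run here ... */

  /* After the loop: the estimate is only good enough for inter winners, so
   * an intra winner triggers the proper UV mode search once. */
  if (sf->use_uv_intra_rd_estimate && best_mode_is_intra())
    uv_intra_full_search();
}

int main(void) {
  const speed_features sf = { 1 };
  pick_mode(&sf);
  return 0;
}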
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 02eb7f6..196846e 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -85,6 +85,7 @@
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index 60df0af..cc8da2a 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -103,9 +103,9 @@
   const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y;
   const int c_et = ext_size >> subsampling_y;
   const int c_el = ext_size >> subsampling_x;
-  const int c_eb = (ybf->border + ybf->y_height - ybf->y_crop_height +
+  const int c_eb = (ext_size + ybf->y_height - ybf->y_crop_height +
                     subsampling_y) >> subsampling_y;
-  const int c_er = (ybf->border + ybf->y_width - ybf->y_crop_width +
+  const int c_er = (ext_size + ybf->y_width - ybf->y_crop_width +
                     subsampling_x) >> subsampling_x;
 
   assert(ybf->y_height - ybf->y_crop_height < 16);
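
The yv12extend fix above bases the bottom/right chroma extension on ext_size
(the extension actually being applied) rather than ybf->border (the allocated
border), which matters whenever the two differ. A worked numeric example of
the formula, with hypothetical 1080p 4:2:0 geometry:

#include <stdio.h>

int main(void) {
  /* Hypothetical 1080p 4:2:0 buffer; the numbers are illustrative only. */
  const int y_crop_width = 1920, y_crop_height = 1080;
  const int y_width = 1920, y_height = 1088;   /* 8 rows of alignment padding */
  const int subsampling_x = 1, subsampling_y = 1;
  const int ext_size = 32;                     /* extension being applied */

  /* Same formula as the fixed code: extension plus alignment padding,
   * rounded up when converting luma rows/cols to chroma. */
  const int c_eb = (ext_size + y_height - y_crop_height + subsampling_y)
                   >> subsampling_y;
  const int c_er = (ext_size + y_width - y_crop_width + subsampling_x)
                   >> subsampling_x;

  printf("chroma bottom ext = %d rows, right ext = %d cols\n", c_eb, c_er);
  return 0;
}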