Merge "Add a best_yrd shortcut in splitmv mode search."
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 3320a16..3b72129 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -551,7 +551,7 @@
 const ConvolveFunctions convolve8_neon(
     vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
     vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
-    vp9_convolve8_c, vp9_convolve8_avg_c);
+    vp9_convolve8_neon, vp9_convolve8_avg_neon);
 
 INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_neon),
diff --git a/vp9/common/arm/neon/vp9_convolve_neon.c b/vp9/common/arm/neon/vp9_convolve_neon.c
new file mode 100644
index 0000000..6e37ff6
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps == 8, the intermediate
+   * buffer needs at most 64 + 7 lines (+ 1 line so the filtered height is
+   * divisible by 4) at a stride of 64, i.e. 64 * 72 bytes. */
+  uint8_t temp[64 * 72];
+
+  // Account for the vertical phase needing 3 lines prior and 4 lines post
+  int intermediate_height = h + 7;
+
+  if (x_step_q4 != 16 || y_step_q4 != 16)
+    return vp9_convolve8_c(src, src_stride,
+                           dst, dst_stride,
+                           filter_x, x_step_q4,
+                           filter_y, y_step_q4,
+                           w, h);
+
+  /* Filter starting 3 lines back. The neon implementation will ignore the
+   * given height and filter a multiple of 4 lines. Since this goes into
+   * the temp buffer which has lots of extra room and is subsequently discarded
+   * this is safe if somewhat less than ideal.
+   */
+  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+                           temp, 64,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, intermediate_height);
+
+  /* Step into the temp buffer 3 lines to get the actual frame data */
+  vp9_convolve8_vert_neon(temp + 64 * 3, 64,
+                          dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4,
+                          w, h);
+}
+
+void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {
+  uint8_t temp[64 * 72];
+  int intermediate_height = h + 7;
+
+  if (x_step_q4 != 16 || y_step_q4 != 16)
+    return vp9_convolve8_avg_c(src, src_stride,
+                               dst, dst_stride,
+                               filter_x, x_step_q4,
+                               filter_y, y_step_q4,
+                               w, h);
+
+  /* This implementation has the same issues as above. In addition, we only want
+   * to average the values after both passes.
+   */
+  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+                           temp, 64,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, intermediate_height);
+  vp9_convolve8_avg_vert_neon(temp + 64 * 3,
+                              64, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+}
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index ed6af66..c4bdb6b 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -12,17 +12,12 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_entropymv.h"
 
-//#define MV_COUNT_TESTING
-
 #define MV_COUNT_SAT 20
 #define MV_MAX_UPDATE_FACTOR 128
 
 /* Integer pel reference mv threshold for use of high-precision 1/8 mv */
 #define COMPANDED_MVREF_THRESH    8
 
-/* Smooth or bias the mv-counts before prob computation */
-/* #define SMOOTH_MV_COUNTS */
-
 const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
   -MV_JOINT_ZERO, 2,
   -MV_JOINT_HNZVZ, 4,
@@ -160,24 +155,6 @@
   }
 }
 
-#ifdef SMOOTH_MV_COUNTS
-static void smooth_counts(nmv_component_counts *mvcomp) {
-  static const int flen = 3;  // (filter_length + 1) / 2
-  static const int fval[] = {8, 3, 1};
-  static const int fvalbits = 4;
-  int i;
-  unsigned int smvcount[MV_VALS];
-  vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount));
-  smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1;
-  for (i = flen - 1; i <= MV_VALS - flen; ++i) {
-    int j, s = smvcount[i] * fval[0];
-    for (j = 1; j < flen; ++j)
-      s += (smvcount[i - j] + smvcount[i + j]) * fval[j];
-    mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits;
-  }
-}
-#endif
-
 static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
   int v;
   vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
@@ -187,11 +164,9 @@
   }
 }
 
-void vp9_inc_mv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
-                       int usehp) {
+void vp9_inc_mv(const MV *mv,  nmv_context_counts *mvctx) {
   const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
   mvctx->joints[j]++;
-  usehp = usehp && vp9_use_mv_hp(ref);
   if (mv_joint_vertical(j))
     inc_mv_component_count(mv->row, &mvctx->comps[0], 1);
 
@@ -246,108 +221,41 @@
 
 void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) {
   int i, j;
-#ifdef MV_COUNT_TESTING
-  printf("joints count: ");
-  for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]);
-  printf("\n"); fflush(stdout);
-  printf("signs count:\n");
-  for (i = 0; i < 2; ++i)
-    printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]);
-  printf("\n"); fflush(stdout);
-  printf("classes count:\n");
+
+  nmv_context *ctx = &cm->fc.nmvc;
+  nmv_context *pre_ctx = &cm->fc.pre_nmvc;
+  nmv_context_counts *cts = &cm->fc.NMVcount;
+
+  vp9_counts_process(cts, usehp);
+
+  adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
+
   for (i = 0; i < 2; ++i) {
-    for (j = 0; j < MV_CLASSES; ++j)
-      printf("%d ", cm->fc.NMVcount.comps[i].classes[j]);
-    printf("\n"); fflush(stdout);
+    adapt_prob(&ctx->comps[i].sign, pre_ctx->comps[i].sign, cts->comps[i].sign);
+    adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes,
+                pre_ctx->comps[i].classes, cts->comps[i].classes);
+    adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0,
+                pre_ctx->comps[i].class0, cts->comps[i].class0);
+
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      adapt_prob(&ctx->comps[i].bits[j], pre_ctx->comps[i].bits[j],
+                 cts->comps[i].bits[j]);
   }
-  printf("class0 count:\n");
+
   for (i = 0; i < 2; ++i) {
     for (j = 0; j < CLASS0_SIZE; ++j)
-      printf("%d ", cm->fc.NMVcount.comps[i].class0[j]);
-    printf("\n"); fflush(stdout);
-  }
-  printf("bits count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < MV_OFFSET_BITS; ++j)
-      printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0],
-                       cm->fc.NMVcount.comps[i].bits[j][1]);
-    printf("\n"); fflush(stdout);
-  }
-  printf("class0_fp count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      printf("{");
-      for (k = 0; k < 4; ++k)
-        printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]);
-      printf("}, ");
-    }
-    printf("\n"); fflush(stdout);
-  }
-  printf("fp count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < 4; ++j)
-      printf("%d ", cm->fc.NMVcount.comps[i].fp[j]);
-    printf("\n"); fflush(stdout);
-  }
-  if (usehp) {
-    printf("class0_hp count:\n");
-    for (i = 0; i < 2; ++i)
-      printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0],
-                       cm->fc.NMVcount.comps[i].class0_hp[1]);
-    printf("\n"); fflush(stdout);
-    printf("hp count:\n");
-    for (i = 0; i < 2; ++i)
-      printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0],
-                       cm->fc.NMVcount.comps[i].hp[1]);
-    printf("\n"); fflush(stdout);
-  }
-#endif
-#ifdef SMOOTH_MV_COUNTS
-  smooth_counts(&cm->fc.NMVcount.comps[0]);
-  smooth_counts(&cm->fc.NMVcount.comps[1]);
-#endif
-  vp9_counts_process(&cm->fc.NMVcount, usehp);
+      adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j],
+                  pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]);
 
-  adapt_probs(0, vp9_mv_joint_tree,
-              cm->fc.nmvc.joints, cm->fc.pre_nmvc.joints,
-              cm->fc.NMVcount.joints);
+    adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp,
+                cts->comps[i].fp);
+  }
 
-  for (i = 0; i < 2; ++i) {
-    adapt_prob(&cm->fc.nmvc.comps[i].sign,
-               cm->fc.pre_nmvc.comps[i].sign,
-               cm->fc.NMVcount.comps[i].sign);
-    adapt_probs(0, vp9_mv_class_tree,
-                cm->fc.nmvc.comps[i].classes, cm->fc.pre_nmvc.comps[i].classes,
-                cm->fc.NMVcount.comps[i].classes);
-    adapt_probs(0, vp9_mv_class0_tree,
-                cm->fc.nmvc.comps[i].class0, cm->fc.pre_nmvc.comps[i].class0,
-                cm->fc.NMVcount.comps[i].class0);
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      adapt_prob(&cm->fc.nmvc.comps[i].bits[j],
-                 cm->fc.pre_nmvc.comps[i].bits[j],
-                 cm->fc.NMVcount.comps[i].bits[j]);
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      adapt_probs(0, vp9_mv_fp_tree,
-                  cm->fc.nmvc.comps[i].class0_fp[j],
-                  cm->fc.pre_nmvc.comps[i].class0_fp[j],
-                  cm->fc.NMVcount.comps[i].class0_fp[j]);
-    }
-    adapt_probs(0, vp9_mv_fp_tree,
-                cm->fc.nmvc.comps[i].fp,
-                cm->fc.pre_nmvc.comps[i].fp,
-                cm->fc.NMVcount.comps[i].fp);
-  }
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      adapt_prob(&cm->fc.nmvc.comps[i].class0_hp,
-                 cm->fc.pre_nmvc.comps[i].class0_hp,
-                 cm->fc.NMVcount.comps[i].class0_hp);
-      adapt_prob(&cm->fc.nmvc.comps[i].hp,
-                 cm->fc.pre_nmvc.comps[i].hp,
-                 cm->fc.NMVcount.comps[i].hp);
+      adapt_prob(&ctx->comps[i].class0_hp, pre_ctx->comps[i].class0_hp,
+                 cts->comps[i].class0_hp);
+      adapt_prob(&ctx->comps[i].hp, pre_ctx->comps[i].hp, cts->comps[i].hp);
     }
   }
 }
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 895df30..d7d6576 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -125,8 +125,7 @@
   nmv_component_counts comps[2];
 } nmv_context_counts;
 
-void vp9_inc_mv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
-                int usehp);
+void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);
 
 void vp9_counts_process(nmv_context_counts *NMVcount, int usehp);
 
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index abde63d..3103be2 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -124,9 +124,7 @@
 
   // Rows
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = input[j];
-    vp9_idct4_1d(temp_in, outptr);
+    vp9_idct4_1d(input, outptr);
     input += 4;
     outptr += 4;
   }
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 5c8e346..3d33dbd 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -367,10 +367,9 @@
   return pred_context;
 }
 // Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context_tx_size(const VP9_COMMON *cm,
-                                           const MACROBLOCKD *xd) {
+unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
   const MODE_INFO *const mi = xd->mode_info_context;
-  const MODE_INFO *const above_mi = mi - cm->mode_info_stride;
+  const MODE_INFO *const above_mi = mi - xd->mode_info_stride;
   const MODE_INFO *const left_mi = mi - 1;
   const int left_in_image = xd->left_available && left_mi->mbmi.mb_in_image;
   const int above_in_image = xd->up_available && above_mi->mbmi.mb_in_image;
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 7fc9a1c..cb4c1d3 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -110,19 +110,18 @@
   return cm->fc.single_ref_prob[pred_context][1];
 }
 
-unsigned char vp9_get_pred_context_tx_size(const VP9_COMMON *cm,
-                                           const MACROBLOCKD *xd);
+unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
 
-static INLINE const vp9_prob *vp9_get_pred_probs_tx_size(const VP9_COMMON *cm,
-                                                         const MACROBLOCKD * xd) {
+static const vp9_prob *vp9_get_pred_probs_tx_size(const MACROBLOCKD *xd,
+                           const struct tx_probs *tx_probs) {
   const MODE_INFO *const mi = xd->mode_info_context;
-  const int pred_context = vp9_get_pred_context_tx_size(cm, xd);
+  const int pred_context = vp9_get_pred_context_tx_size(xd);
   if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
-    return cm->fc.tx_probs.p8x8[pred_context];
+    return tx_probs->p8x8[pred_context];
   else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32)
-    return cm->fc.tx_probs.p16x16[pred_context];
+    return tx_probs->p16x16[pred_context];
   else
-    return cm->fc.tx_probs.p32x32[pred_context];
+    return tx_probs->p32x32[pred_context];
 }
 
 #endif  // VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 812b015..c36efbd 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -271,7 +271,7 @@
 specialize vp9_convolve_avg sse2
 
 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 ssse3
+specialize vp9_convolve8 ssse3 neon
 
 prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_horiz ssse3 neon
@@ -280,7 +280,7 @@
 specialize vp9_convolve8_vert ssse3 neon
 
 prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg ssse3
+specialize vp9_convolve8_avg ssse3 neon
 
 prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg_horiz ssse3 neon
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 5fb572e..88ede1a 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -26,16 +26,6 @@
 #include "vp9/decoder/vp9_dsubexp.h"
 #include "vp9/decoder/vp9_treereader.h"
 
-// #define DEBUG_DEC_MV
-#ifdef DEBUG_DEC_MV
-int dec_mvcount = 0;
-#endif
-
-// #define DEC_DEBUG
-#ifdef DEC_DEBUG
-extern int dec_debug;
-#endif
-
 static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
   return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p);
 }
@@ -50,8 +40,8 @@
 
 static TX_SIZE read_selected_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd,
                                        BLOCK_SIZE_TYPE bsize, vp9_reader *r) {
-  const int context = vp9_get_pred_context_tx_size(cm, xd);
-  const vp9_prob *tx_probs = vp9_get_pred_probs_tx_size(cm, xd);
+  const int context = vp9_get_pred_context_tx_size(xd);
+  const vp9_prob *tx_probs = vp9_get_pred_probs_tx_size(xd, &cm->fc.tx_probs);
   TX_SIZE txfm_size = vp9_read(r, tx_probs[0]);
   if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) {
     txfm_size += vp9_read(r, tx_probs[1]);
@@ -255,7 +245,7 @@
   if (mv_joint_horizontal(j))
     diff.col = read_mv_component(r, &ctx->comps[1], usehp);
 
-  vp9_inc_mv(&diff, ref, counts, usehp);
+  vp9_inc_mv(&diff, counts);
 
   mv->row = ref->row + diff.row;
   mv->col = ref->col + diff.col;
@@ -486,11 +476,6 @@
     ref0 = mbmi->ref_frame[0];
     ref1 = mbmi->ref_frame[1];
 
-#ifdef DEC_DEBUG
-    if (dec_debug)
-      printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row,
-             xd->mode_info_context->mbmi.mv[0].as_mv.col);
-#endif
     vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
                      ref0, mbmi->ref_mvs[ref0], cm->ref_frame_sign_bias);
 
@@ -510,13 +495,6 @@
       best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int;
     }
 
-#ifdef DEC_DEBUG
-    if (dec_debug)
-      printf("[D %d %d] %d %d %d %d\n", ref_frame,
-             mbmi->mb_mode_context[ref_frame],
-             mv_ref_p[0], mv_ref_p[1], mv_ref_p[2], mv_ref_p[3]);
-#endif
-
     mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
                               ? read_switchable_filter_type(pbi, r)
                               : cm->mcomp_filter_type;
@@ -645,6 +623,31 @@
   }
 }
 
+static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
+  int i;
+
+  cm->comp_pred_mode = cm->allow_comp_inter_inter ? read_comp_pred_mode(r)
+                                                  : SINGLE_PREDICTION_ONLY;
+
+  if (cm->comp_pred_mode == HYBRID_PREDICTION)
+    for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
+
+  if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+    for (i = 0; i < REF_CONTEXTS; i++) {
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
+    }
+
+  if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+    for (i = 0; i < REF_CONTEXTS; i++)
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
+}
+
 void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   int k;
@@ -669,31 +672,8 @@
       if (vp9_read(r, VP9_MODE_UPDATE_PROB))
         vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
 
-    if (cm->allow_comp_inter_inter) {
-      cm->comp_pred_mode = read_comp_pred_mode(r);
-      if (cm->comp_pred_mode == HYBRID_PREDICTION)
-        for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-            vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
-    } else {
-      cm->comp_pred_mode = SINGLE_PREDICTION_ONLY;
-    }
+    read_comp_pred(cm, r);
 
-    if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
-      for (i = 0; i < REF_CONTEXTS; i++) {
-        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-          vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
-
-        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-          vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
-      }
-
-    if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
-      for (i = 0; i < REF_CONTEXTS; i++)
-        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
-          vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
-
-    // VP9_INTRA_MODES
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
       for (i = 0; i < VP9_INTRA_MODES - 1; ++i)
         if (vp9_read(r, VP9_MODE_UPDATE_PROB))
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 928fb70..bd2928e 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -202,9 +202,9 @@
 
 static void write_selected_txfm_size(const VP9_COMP *cpi, TX_SIZE tx_size,
                                      BLOCK_SIZE_TYPE bsize, vp9_writer *w) {
-  const VP9_COMMON *const c = &cpi->common;
+  const VP9_COMMON *const cm = &cpi->common;
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const vp9_prob *tx_probs = vp9_get_pred_probs_tx_size(c, xd);
+  const vp9_prob *tx_probs = vp9_get_pred_probs_tx_size(xd, &cm->fc.tx_probs);
   vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
   if (bsize >= BLOCK_SIZE_MB16X16 && tx_size != TX_4X4) {
     vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 56390ab..e773c38 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2589,7 +2589,7 @@
         !(mbmi->ref_frame[0] != INTRA_FRAME &&
             (mbmi->mb_skip_coeff ||
              vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)))) {
-      const int context = vp9_get_pred_context_tx_size(cm, xd);
+      const int context = vp9_get_pred_context_tx_size(xd);
       if (bsize >= BLOCK_SIZE_SB32X32) {
         cm->fc.tx_counts.p32x32[context][mbmi->txfm_size]++;
       } else if (bsize >= BLOCK_SIZE_MB16X16) {
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 8adad9d..f309c1c 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -510,44 +510,41 @@
 
 void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
                          int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  MV mv;
-  int bwl = b_width_log2(mbmi->sb_type), bw = 1 << bwl;
-  int bhl = b_height_log2(mbmi->sb_type), bh = 1 << bhl;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  MV diff;
+  const int bw = 1 << b_width_log2(mbmi->sb_type);
+  const int bh = 1 << b_height_log2(mbmi->sb_type);
   int idx, idy;
 
   if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
-    int i;
     PARTITION_INFO *pi = x->partition_info;
     for (idy = 0; idy < 2; idy += bh) {
       for (idx = 0; idx < 2; idx += bw) {
-        i = idy * 2 + idx;
+        const int i = idy * 2 + idx;
         if (pi->bmi[i].mode == NEWMV) {
-          mv.row = (pi->bmi[i].mv.as_mv.row - best_ref_mv->as_mv.row);
-          mv.col = (pi->bmi[i].mv.as_mv.col - best_ref_mv->as_mv.col);
-          vp9_inc_mv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
-                     x->e_mbd.allow_high_precision_mv);
+          diff.row = pi->bmi[i].mv.as_mv.row - best_ref_mv->as_mv.row;
+          diff.col = pi->bmi[i].mv.as_mv.col - best_ref_mv->as_mv.col;
+          vp9_inc_mv(&diff, &cpi->NMVcount);
+
           if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) {
-            mv.row = pi->bmi[i].second_mv.as_mv.row -
+            diff.row = pi->bmi[i].second_mv.as_mv.row -
                          second_best_ref_mv->as_mv.row;
-            mv.col = pi->bmi[i].second_mv.as_mv.col -
+            diff.col = pi->bmi[i].second_mv.as_mv.col -
                          second_best_ref_mv->as_mv.col;
-            vp9_inc_mv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
-                       x->e_mbd.allow_high_precision_mv);
+            vp9_inc_mv(&diff, &cpi->NMVcount);
           }
         }
       }
     }
   } else if (mbmi->mode == NEWMV) {
-    mv.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row;
-    mv.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col;
-    vp9_inc_mv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
-                      x->e_mbd.allow_high_precision_mv);
+    diff.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row;
+    diff.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col;
+    vp9_inc_mv(&diff, &cpi->NMVcount);
+
     if (mbmi->ref_frame[1] > INTRA_FRAME) {
-      mv.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row;
-      mv.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col;
-      vp9_inc_mv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
-                 x->e_mbd.allow_high_precision_mv);
+      diff.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row;
+      diff.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col;
+      vp9_inc_mv(&diff, &cpi->NMVcount);
     }
   }
 }
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 9042111..2afcd27 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -796,6 +796,8 @@
         sf->skip_encode_sb = 1;
         sf->use_uv_intra_rd_estimate = 1;
         sf->using_small_partition_info = 1;
+        sf->disable_splitmv =
+            (MIN(cpi->common.width, cpi->common.height) >= 720) ? 1 : 0;
       }
       if (speed == 3) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -811,6 +813,7 @@
                                      FLAG_SKIP_COMP_REFMISMATCH;
         sf->use_rd_breakout = 1;
         sf->skip_encode_sb = 1;
+        sf->disable_splitmv = 1;
       }
       if (speed == 4) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -829,6 +832,8 @@
         sf->optimize_coefficients = 0;
         // sf->reduce_first_step_size = 1;
         // sf->reference_masking = 1;
+
+        sf->disable_splitmv = 1;
       }
       /*
       if (speed == 2) {
@@ -872,8 +877,6 @@
 
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
 
-  vp9_init_quantizer(cpi);
-
   if (cpi->sf.iterative_sub_pixel == 1) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively;
   } else if (cpi->sf.quarter_pixel_search) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 7f5f0de..cb7a586 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -53,12 +53,12 @@
 #define SPLITMV 0x10000
 
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  {ZEROMV,    LAST_FRAME,   NONE},
-  {DC_PRED,   INTRA_FRAME,  NONE},
-
   {NEARESTMV, LAST_FRAME,   NONE},
   {NEARMV,    LAST_FRAME,   NONE},
 
+  {ZEROMV,    LAST_FRAME,   NONE},
+  {DC_PRED,   INTRA_FRAME,  NONE},
+
   {ZEROMV,    GOLDEN_FRAME, NONE},
   {NEARESTMV, GOLDEN_FRAME, NONE},
 
@@ -869,7 +869,7 @@
   int n, m;
   int s0, s1;
 
-  const vp9_prob *tx_probs = vp9_get_pred_probs_tx_size(cm, xd);
+  const vp9_prob *tx_probs = vp9_get_pred_probs_tx_size(xd, &cm->fc.tx_probs);
 
   for (n = TX_4X4; n <= max_txfm_size; n++) {
     r[n][1] = r[n][0];
@@ -976,7 +976,7 @@
   double scale_rd[TX_SIZE_MAX_SB] = {1.73, 1.44, 1.20, 1.00};
   // double scale_r[TX_SIZE_MAX_SB] = {2.82, 2.00, 1.41, 1.00};
 
-  const vp9_prob *tx_probs = vp9_get_pred_probs_tx_size(cm, xd);
+  const vp9_prob *tx_probs = vp9_get_pred_probs_tx_size(xd, &cm->fc.tx_probs);
 
   // for (n = TX_4X4; n <= max_txfm_size; n++)
   //   r[n][0] = (r[n][0] * scale_r[n]);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 02eb7f6..196846e 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -85,6 +85,7 @@
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)