Merge "A few more optimizations, about 1% overall speedup." into experimental
diff --git a/vp8/common/entropymv.c b/vp8/common/entropymv.c
index e04922f..1664b28 100644
--- a/vp8/common/entropymv.c
+++ b/vp8/common/entropymv.c
@@ -117,6 +117,8 @@
   if (tot) {
     const vp8_prob x = ((ct[0] * 255) / tot) & -(1 << (8 - pbits));
     *p = x ? x : 1;
+  } else {
+    *p = 128;
   }
 }
 
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index 0d82db7..d73d8fd 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -619,7 +619,8 @@
 /*encoder only*/
 void vp8_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
                                              unsigned char *dst_y,
-                                             int dst_ystride) {
+                                             int dst_ystride,
+                                             int clamp_mvs) {
   unsigned char *ptr_base = xd->pre.y_buffer;
   unsigned char *ptr;
   int pre_stride = xd->block[0].pre_stride;
@@ -627,7 +628,7 @@
 
   ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
 
-  if (xd->mode_info_context->mbmi.need_to_clamp_mvs)
+  if (clamp_mvs)
     clamp_mv_to_umv_border(&ymv.as_mv, xd);
 
   ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
@@ -755,7 +756,8 @@
                                             unsigned char *dst_u,
                                             unsigned char *dst_v,
                                             int dst_ystride, int dst_uvstride) {
-  vp8_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
+  vp8_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
+      xd->mode_info_context->mbmi.need_to_clamp_mvs);
   vp8_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
 }
 
diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h
index 48d95ff..7ad0adb 100644
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -18,7 +18,8 @@
 
 extern void vp8_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
                                                     unsigned char *dst_y,
-                                                    int dst_ystride);
+                                                    int dst_ystride,
+                                                    int clamp_mvs);
 extern void vp8_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
                                                      unsigned char *dst_u,
                                                      unsigned char *dst_v,
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 6ff9148..0ac2365 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -211,10 +211,14 @@
   int tx_type;
 
 #if CONFIG_HYBRIDTRANSFORM
-  int QIndex = xd->q_index;
-  int active_ht = (QIndex < ACTIVE_HT);
+  int QIndex;
+  int active_ht;
 #endif
 
+  // re-initialize macroblock dequantizer before detokenization
+  if (xd->segmentation_enabled)
+    mb_init_dequantizer(pbi, xd);
+
   if (pbi->common.frame_type == KEY_FRAME) {
 #if CONFIG_TX16X16
     if (xd->mode_info_context->mbmi.mode <= TM_PRED ||
@@ -326,11 +330,14 @@
   }
 #endif
 
-  if (xd->segmentation_enabled)
-    mb_init_dequantizer(pbi, xd);
+  // Dequantizer re-init moved to before detokenization; old call disabled:
+//  if (xd->segmentation_enabled)
+//    mb_init_dequantizer(pbi, xd);
 
 #if CONFIG_HYBRIDTRANSFORM
   // parse transform types for intra 4x4 mode
+  QIndex = xd->q_index;
+  active_ht = (QIndex < ACTIVE_HT);
   if (mode == B_PRED) {
     for (i = 0; i < 16; i++) {
       BLOCKD *b = &xd->block[i];
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index e03b47e..4542444 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -1194,7 +1194,8 @@
   x->e_mbd.mode_info_context->mbmi.pred_filter_enabled = 0;
 #endif
 
-  vp8_build_1st_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);
+  vp8_build_1st_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor,
+                                          16, 0);
 
   ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
 
diff --git a/vp8/encoder/mbgraph.c b/vp8/encoder/mbgraph.c
index f3f7f84..dde000a 100644
--- a/vp8/encoder/mbgraph.c
+++ b/vp8/encoder/mbgraph.c
@@ -88,7 +88,7 @@
 #endif
 
   vp8_set_mbmode_and_mvs(x, NEWMV, dst_mv);
-  vp8_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16);
+  vp8_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
   // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16)
   best_err = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16)
              (xd->dst.y_buffer, xd->dst.y_stride,
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 720736f..a2c1a23 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -3213,7 +3213,7 @@
       rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts);
 
       vp8_build_1st_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor,
-                                              16);
+                                              16, 0);
       if (is_comp_pred)
         vp8_build_2nd_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor,
                                                 16);
diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c
index 83b27a6..e9d02cd 100644
--- a/vp8/encoder/segmentation.c
+++ b/vp8/encoder/segmentation.c
@@ -111,7 +111,8 @@
   int i;
 
   // Blank the strtucture to start with
-  vpx_memset(segment_tree_probs, 0, sizeof(segment_tree_probs));
+  vpx_memset(segment_tree_probs, 0,
+             MB_FEATURE_TREE_PROBS * sizeof(*segment_tree_probs));
 
   // Total count for all segments
   count1 = segcounts[0] + segcounts[1];