Merge "Compressed/uncompressed frame header changes." into experimental
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 1b7da6c..1ae3586 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -468,7 +468,7 @@
     } else {
       ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]];
     }
-    return vp9_pt_energy_class[ctx];
+    return ctx;
   }
 };
 
@@ -642,6 +642,17 @@
 #define COEF_COUNT_SAT_AFTER_KEY 24
 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
 
+void vp9_full_to_model_count(unsigned int *model_count,  // out: model bins
+                             unsigned int *full_count) {  // in: full counts
+  int n;
+  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];  // copied unchanged
+  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];    // copied unchanged
+  model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
+  for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)  // [THREE, EOB) added to TWO
+    model_count[TWO_TOKEN] += full_count[n];
+  model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];  // EOB bin
+}
+
 void vp9_full_to_model_counts(
     vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
   int i, j, k, l;
@@ -649,19 +660,10 @@
     for (j = 0; j < REF_TYPES; ++j)
       for (k = 0; k < COEF_BANDS; ++k)
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          int n;
           if (l >= 3 && k == 0)
             continue;
-          model_count[i][j][k][l][ZERO_TOKEN] =
-              full_count[i][j][k][l][ZERO_TOKEN];
-          model_count[i][j][k][l][ONE_TOKEN] =
-              full_count[i][j][k][l][ONE_TOKEN];
-          model_count[i][j][k][l][TWO_TOKEN] =
-              full_count[i][j][k][l][TWO_TOKEN];
-          for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
-            model_count[i][j][k][l][TWO_TOKEN] += full_count[i][j][k][l][n];
-          model_count[i][j][k][l][DCT_EOB_MODEL_TOKEN] =
-              full_count[i][j][k][l][DCT_EOB_TOKEN];
+          vp9_full_to_model_count(model_count[i][j][k][l],
+                                  full_count[i][j][k][l]);
         }
 }
 
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 5d57f14..e76211a 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -174,6 +174,8 @@
 typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
                                           [PREV_COEF_CONTEXTS]
                                           [UNCONSTRAINED_NODES][2];
+extern void vp9_full_to_model_count(unsigned int *model_count,
+                                    unsigned int *full_count);
 extern void vp9_full_to_model_counts(
     vp9_coeff_count_model *model_count, vp9_coeff_count *full_count);
 
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index dd60a76..b1f327b 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -402,7 +402,7 @@
       (block_idx >> bwl) || xd->up_available;
   const int have_left =
       (block_idx & wmask) || xd->left_available;
-  const int have_right = ((block_idx & wmask) != wmask);
+  int have_right = ((block_idx & wmask) != wmask);
   const int txfm_block_size = 4 << tx_size;
 
   assert(bwl >= 0);
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 02900c0..890d5d0 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -58,13 +58,15 @@
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
 };
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 #define INCREMENT_COUNT(token)               \
   do {                                       \
     coef_counts[type][ref][band][pt]         \
                [token >= TWO_TOKEN ?     \
                 (token == DCT_EOB_TOKEN ? DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \
                 token]++;     \
-    token_cache[scan[c]] = token; \
+    token_cache[scan[c]] = vp9_pt_energy_class[token]; \
   } while (0)
 
 #define WRITE_COEF_CONTINUE(val, token)                  \
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index b19bf3a..13bfe21 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1047,7 +1047,7 @@
   fclose(f);
 }
 
-static void build_tree_distribution(vp9_coeff_probs *coef_probs,
+static void build_tree_distribution(vp9_coeff_probs_model *coef_probs,
                                     vp9_coeff_count *coef_counts,
                                     unsigned int (*eob_branch_ct)[REF_TYPES]
                                                                  [COEF_BANDS]
@@ -1056,12 +1056,13 @@
                                     VP9_COMP *cpi,
                                     vp9_coeff_accum *context_counters,
 #endif
-                                    vp9_coeff_stats *coef_branch_ct,
+                                    vp9_coeff_stats_model *coef_branch_ct,
                                     int block_types) {
   int i, j, k, l;
 #ifdef ENTROPY_STATS
   int t = 0;
 #endif
+  unsigned int model_counts[UNCONSTRAINED_NODES + 1];
 
   for (i = 0; i < block_types; ++i) {
     for (j = 0; j < REF_TYPES; ++j) {
@@ -1069,10 +1070,11 @@
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
           if (l >= 3 && k == 0)
             continue;
-          vp9_tree_probs_from_distribution(vp9_coef_tree,
+          vp9_full_to_model_count(model_counts, coef_counts[i][j][k][l]);
+          vp9_tree_probs_from_distribution(vp9_coefmodel_tree,
                                            coef_probs[i][j][k][l],
                                            coef_branch_ct[i][j][k][l],
-                                           coef_counts[i][j][k][l], 0);
+                                           model_counts, 0);
           coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
                                              coef_branch_ct[i][j][k][l][0][0];
           coef_probs[i][j][k][l][0] =
@@ -1129,9 +1131,9 @@
 #ifdef ENTROPY_STATS
     vp9_coeff_stats *tree_update_hist,
 #endif
-    vp9_coeff_probs *new_frame_coef_probs,
+    vp9_coeff_probs_model *new_frame_coef_probs,
     vp9_coeff_probs_model *old_frame_coef_probs,
-    vp9_coeff_stats *frame_branch_ct,
+    vp9_coeff_stats_model *frame_branch_ct,
     TX_SIZE tx_size) {
   int i, j, k, l, t;
   int update[2] = {0, 0};
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 211eca4..0e9b680 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -117,7 +117,6 @@
   int mbmode_cost[2][MB_MODE_COUNT];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
   int bmode_costs[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
-  int inter_bmode_costs[INTRA_MODE_COUNT];
   int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
                              [VP9_SWITCHABLE_FILTERS];
 
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index b7f60b1..755ff21 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -20,6 +20,9 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9_rtcd.h"
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
 void vp9_subtract_block(int rows, int cols,
                         int16_t *diff_ptr, int diff_stride,
                         const uint8_t *src_ptr, int src_stride,
@@ -105,7 +108,7 @@
                                      uint8_t *token_cache,
                                      int pad, int l) {
   int bak = token_cache[scan[idx]], pt;
-  token_cache[scan[idx]] = token;
+  token_cache[scan[idx]] = vp9_pt_energy_class[token];
   pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
   token_cache[scan[idx]] = bak;
   return pt;
@@ -189,7 +192,8 @@
   *(tokens[eob] + 1) = *(tokens[eob] + 0);
   next = eob;
   for (i = 0; i < eob; i++)
-    token_cache[scan[i]] = vp9_dct_value_tokens_ptr[qcoeff_ptr[scan[i]]].token;
+    token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
+        qcoeff_ptr[scan[i]]].token];
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
 
   for (i = eob; i-- > i0;) {
@@ -610,6 +614,7 @@
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
   MACROBLOCKD* const xd = &x->e_mbd;
+  MB_MODE_INFO* const mbmi = &xd->mode_info_context->mbmi;
   const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
   const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
@@ -634,9 +639,9 @@
   TX_TYPE tx_type;
   int mode, b_mode;
 
-  mode = plane == 0? xd->mode_info_context->mbmi.mode:
-                     xd->mode_info_context->mbmi.uv_mode;
-  if (bsize <= BLOCK_SIZE_SB8X8 && mode == I4X4_PRED && plane == 0)
+  mode = plane == 0? mbmi->mode: mbmi->uv_mode;
+  if (mbmi->sb_type < BLOCK_SIZE_SB8X8 && plane == 0 &&
+      mbmi->ref_frame == INTRA_FRAME)
     b_mode = xd->mode_info_context->bmi[ib].as_mode.first;
   else
     b_mode = mode;
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index 171b44b..67658f5 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -17,7 +17,6 @@
 
 void vp9_init_mode_costs(VP9_COMP *c) {
   VP9_COMMON *x = &c->common;
-  const vp9_tree_p T = vp9_bmode_tree;
   const vp9_tree_p KT = vp9_bmode_tree;
   int i, j;
 
@@ -28,8 +27,6 @@
     }
   }
 
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
-
   // TODO(rbultje) separate tables for superblock costing?
   vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.sb_ymode_prob,
                   vp9_sb_ymode_tree);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 15f9571..24a2acb 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -422,20 +422,20 @@
   nmv_context_counts NMVcount;
 
   vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_4x4[BLOCK_TYPES];
+  vp9_coeff_stats_model frame_branch_ct_4x4[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_8x8[BLOCK_TYPES];
+  vp9_coeff_stats_model frame_branch_ct_8x8[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_16x16[BLOCK_TYPES];
+  vp9_coeff_stats_model frame_branch_ct_16x16[BLOCK_TYPES];
 
   vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs_32x32[BLOCK_TYPES];
+  vp9_coeff_stats_model frame_branch_ct_32x32[BLOCK_TYPES];
 
   int gfu_boost;
   int last_boost;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 2eb3f9b..e649192 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -46,6 +46,9 @@
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {ZEROMV,    LAST_FRAME,   NONE},
   {DC_PRED,   INTRA_FRAME,  NONE},
@@ -366,7 +369,7 @@
 
       if (!c || token_cache[scan[c - 1]])
         cost += vp9_cost_bit(coef_probs[band][pt][0], 1);
-      token_cache[scan[c]] = t;
+      token_cache[scan[c]] = vp9_pt_energy_class[t];
     }
     if (c < seg_eob) {
       if (c)
@@ -611,7 +614,6 @@
     int64_t this_rd;
     int ratey = 0;
 
-    xd->mode_info_context->bmi[ib].as_mode.first = mode;
     if (cm->frame_type == KEY_FRAME)
       rate = bmode_costs[mode];
     else
@@ -653,9 +655,6 @@
         distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
                                                          block, 16), 16) >> 2;
 
-        vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode,
-                             dst, xd->plane[0].dst.stride);
-
         if (best_tx_type != DCT_DCT)
           vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
                                dst, xd->plane[0].dst.stride, best_tx_type);
@@ -731,7 +730,7 @@
   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 
   xd->mode_info_context->mbmi.mode = I4X4_PRED;
-  bmode_costs = mb->inter_bmode_costs;
+  bmode_costs = mb->mbmode_cost[cpi->common.frame_type];
 
   for (idy = 0; idy < 2; idy += bh) {
     for (idx = 0; idx < 2; idx += bw) {
@@ -939,7 +938,7 @@
 }
 
 static int labels2mode(MACROBLOCK *x,
-                       int const *labelings, int which_label,
+                       int const *labelings, int i,
                        MB_PREDICTION_MODE this_mode,
                        int_mv *this_mv, int_mv *this_second_mv,
                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
@@ -950,7 +949,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mode_info_context;
   MB_MODE_INFO * mbmi = &mic->mbmi;
-  int i, cost = 0, thismvcost = 0;
+  int cost = 0, thismvcost = 0;
   int idx, idy;
   int bw = 1 << b_width_log2(mbmi->sb_type);
   int bh = 1 << b_height_log2(mbmi->sb_type);
@@ -958,72 +957,65 @@
   /* We have to be careful retrieving previously-encoded motion vectors.
    Ones from this macroblock have to be pulled from the BLOCKD array
    as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 4; ++i) {
-    MB_PREDICTION_MODE m;
+  MB_PREDICTION_MODE m;
 
-    if (labelings[i] != which_label)
-      continue;
-
-    {
-      // the only time we should do costing for new motion vector or mode
-      // is when we are on a new label  (jbb May 08, 2007)
-      switch (m = this_mode) {
-        case NEWMV:
-          if (mbmi->second_ref_frame > 0) {
-            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
-            this_second_mv->as_int =
-            seg_mvs[mbmi->second_ref_frame - 1].as_int;
-          }
-
-          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
-                                        102, xd->allow_high_precision_mv);
-          if (mbmi->second_ref_frame > 0) {
-            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
-                                          mvjcost, mvcost, 102,
-                                          xd->allow_high_precision_mv);
-          }
-          break;
-        case NEARESTMV:
-          this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame].as_int;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int =
-                frame_mv[NEARESTMV][mbmi->second_ref_frame].as_int;
-          break;
-        case NEARMV:
-          this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame].as_int;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int =
-                frame_mv[NEARMV][mbmi->second_ref_frame].as_int;
-          break;
-        case ZEROMV:
-          this_mv->as_int = 0;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = 0;
-          break;
-        default:
-          break;
+  // the only time we should do costing for new motion vector or mode
+  // is when we are on a new label  (jbb May 08, 2007)
+  switch (m = this_mode) {
+    case NEWMV:
+      if (mbmi->second_ref_frame > 0) {
+        this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
+        this_second_mv->as_int =
+        seg_mvs[mbmi->second_ref_frame - 1].as_int;
       }
 
-      cost = vp9_cost_mv_ref(cpi, this_mode,
-                             mbmi->mb_mode_context[mbmi->ref_frame]);
-    }
-
-    mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
-
-    x->partition_info->bmi[i].mode = m;
-    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
-    for (idy = 0; idy < bh; ++idy) {
-      for (idx = 0; idx < bw; ++idx) {
-        vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
-                   &mic->bmi[i], sizeof(mic->bmi[i]));
-        vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
-                   &x->partition_info->bmi[i],
-                   sizeof(x->partition_info->bmi[i]));
+      thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
+                                    102, xd->allow_high_precision_mv);
+      if (mbmi->second_ref_frame > 0) {
+        thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
+                                      mvjcost, mvcost, 102,
+                                      xd->allow_high_precision_mv);
       }
+      break;
+    case NEARESTMV:
+      this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame].as_int;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARESTMV][mbmi->second_ref_frame].as_int;
+      break;
+    case NEARMV:
+      this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame].as_int;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARMV][mbmi->second_ref_frame].as_int;
+      break;
+    case ZEROMV:
+      this_mv->as_int = 0;
+      if (mbmi->second_ref_frame > 0)
+        this_second_mv->as_int = 0;
+      break;
+    default:
+      break;
+  }
+
+  cost = vp9_cost_mv_ref(cpi, this_mode,
+                         mbmi->mb_mode_context[mbmi->ref_frame]);
+
+  mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
+  if (mbmi->second_ref_frame > 0)
+    mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
+
+  x->partition_info->bmi[i].mode = m;
+  x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+  if (mbmi->second_ref_frame > 0)
+    x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
+                 &mic->bmi[i], sizeof(mic->bmi[i]));
+      vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
+                 &x->partition_info->bmi[i],
+                 sizeof(x->partition_info->bmi[i]));
     }
   }
 
@@ -1882,6 +1874,160 @@
   return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
 }
 
+static void iterative_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE_TYPE bsize,
+                                    int_mv *frame_mv,  // updated in place
+                                    YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                    int mi_row, int mi_col,
+                                    int_mv single_newmv[MAX_REF_FRAMES]) {
+  int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  int refs[2] = { mbmi->ref_frame,  // refs[1] is 0 when second ref < 0
+                  (mbmi->second_ref_frame < 0 ? 0 : mbmi->second_ref_frame) };
+  int_mv ref_mv[2];
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  int ite;
+  // Prediction buffer from second frame. NOTE(review): allocation unchecked.
+  uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+
+  // Do joint motion search in compound mode to get more accurate mv.
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+  struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+  struct buf_2d scaled_first_yv12;
+  int last_besterr[2] = {INT_MAX, INT_MAX};  // best error so far, per ref
+
+  ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
+  ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
+
+  if (scaled_ref_frame[0]) {
+    int i;
+
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+
+    setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_second_yv12[i] = xd->plane[i].pre[1];
+
+    setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+  xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+                                          mi_row, mi_col);
+  xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+                                          mi_row, mi_col);
+
+  scaled_first_yv12 = xd->plane[0].pre[0];  // restored after each odd pass
+
+  // Initialize mv using single prediction mode result.
+  frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+  frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+  // Allow joint search multiple times iteratively for each ref frame
+  // and break out the search loop if it couldn't find better mv.
+  for (ite = 0; ite < 4; ite++) {  // up to 4 passes, alternating refs
+    struct buf_2d ref_yv12[2];
+    int bestsme = INT_MAX;
+    int sadpb = x->sadperbit16;
+    int_mv tmp_mv;
+    int search_range = 3;
+
+    int tmp_col_min = x->mv_col_min;  // limits saved; restored after search
+    int tmp_col_max = x->mv_col_max;
+    int tmp_row_min = x->mv_row_min;
+    int tmp_row_max = x->mv_row_max;
+    int id = ite % 2;  // index of the ref whose mv is refined this pass
+
+    // Initialized here because of compiler problem in Visual Studio.
+    ref_yv12[0] = xd->plane[0].pre[0];
+    ref_yv12[1] = xd->plane[0].pre[1];
+
+    // Get pred block from the other reference (index !id).
+    vp9_build_inter_predictor(ref_yv12[!id].buf,
+                              ref_yv12[!id].stride,
+                              second_pred, pw,
+                              &frame_mv[refs[!id]],
+                              &xd->scale_factor[!id],
+                              pw, ph, 0,
+                              &xd->subpix);
+
+    // Search on ref `id`; when id == 1 its buffer is swapped into pre[0].
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[id];
+    vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+    // Use mv result from single mode as mvp.
+    tmp_mv.as_int = frame_mv[refs[id]].as_int;
+
+    tmp_mv.as_mv.col >>= 3;  // shift out 3 low bits before full-pixel search
+    tmp_mv.as_mv.row >>= 3;
+
+    // Small-range full-pixel motion search
+    bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                       search_range,
+                                       &cpi->fn_ptr[block_size],
+                                       x->nmvjointcost, x->mvcost,
+                                       &ref_mv[id], second_pred,
+                                       pw, ph);
+
+    x->mv_col_min = tmp_col_min;
+    x->mv_col_max = tmp_col_max;
+    x->mv_row_min = tmp_row_min;
+    x->mv_row_max = tmp_row_max;
+
+    if (bestsme < INT_MAX) {  // follow up with the sub-pixel search
+      int dis; /* TODO: use dis in distortion calculation later. */
+      unsigned int sse;
+
+      bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+                                             &ref_mv[id],
+                                             x->errorperbit,
+                                             &cpi->fn_ptr[block_size],
+                                             x->nmvjointcost, x->mvcost,
+                                             &dis, &sse, second_pred,
+                                             pw, ph);
+    }
+
+    if (id)
+      xd->plane[0].pre[0] = scaled_first_yv12;
+
+    if (bestsme < last_besterr[id]) {  // improved: keep mv, else stop early
+      frame_mv[refs[id]].as_int =
+          xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
+      last_besterr[id] = bestsme;  // NOTE(review): as_mv[1] set for any id
+    } else {
+      break;
+    }
+  }
+
+  // Restore the predictor buffers that were swapped for scaled references.
+  if (scaled_ref_frame[0]) {
+    int i;
+
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[1] = backup_second_yv12[i];
+  }
+
+  vpx_free(second_pred);
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE_TYPE bsize,
                                  int64_t txfm_cache[],
@@ -1924,145 +2070,9 @@
       ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
 
       if (is_comp_pred) {
-        if (cpi->sf.comp_inter_joint_serach) {
-          int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
-          int ite;
-          // Prediction buffer from second frame.
-          uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
-
-          // Do joint motion search in compound mode to get more accurate mv.
-          struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
-          struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
-          struct buf_2d scaled_first_yv12;
-          int last_besterr[2] = {INT_MAX, INT_MAX};
-
-          if (scaled_ref_frame[0]) {
-            int i;
-
-            // Swap out the reference frame for a version that's been scaled to
-            // match the resolution of the current frame, allowing the existing
-            // motion search code to be used without additional modifications.
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              backup_yv12[i] = xd->plane[i].pre[0];
-
-            setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
-                             NULL, NULL);
-          }
-
-          if (scaled_ref_frame[1]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              backup_second_yv12[i] = xd->plane[i].pre[1];
-
-            setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
-                             NULL, NULL);
-          }
-          xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
-                                                  mi_row, mi_col);
-          xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
-                                                  mi_row, mi_col);
-
-          scaled_first_yv12 = xd->plane[0].pre[0];
-
-          // Initialize mv using single prediction mode result.
-          frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
-          frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
-
-          // Allow joint search multiple times iteratively for each ref frame
-          // and break out the search loop if it couldn't find better mv.
-          for (ite = 0; ite < 4; ite++) {
-            struct buf_2d ref_yv12[2];
-            int bestsme = INT_MAX;
-            int sadpb = x->sadperbit16;
-            int_mv tmp_mv;
-            int search_range = 3;
-
-            int tmp_col_min = x->mv_col_min;
-            int tmp_col_max = x->mv_col_max;
-            int tmp_row_min = x->mv_row_min;
-            int tmp_row_max = x->mv_row_max;
-            int id = ite % 2;
-
-            // Initialized here because of compiler problem in Visual Studio.
-            ref_yv12[0] = xd->plane[0].pre[0];
-            ref_yv12[1] = xd->plane[0].pre[1];
-
-            // Get pred block from second frame.
-            vp9_build_inter_predictor(ref_yv12[!id].buf,
-                                      ref_yv12[!id].stride,
-                                      second_pred, pw,
-                                      &frame_mv[refs[!id]],
-                                      &xd->scale_factor[!id],
-                                      pw, ph, 0,
-                                      &xd->subpix);
-
-            // Compound motion search on first ref frame.
-            if (id)
-              xd->plane[0].pre[0] = ref_yv12[id];
-            vp9_clamp_mv_min_max(x, &ref_mv[id]);
-
-            // Use mv result from single mode as mvp.
-            tmp_mv.as_int = frame_mv[refs[id]].as_int;
-
-            tmp_mv.as_mv.col >>= 3;
-            tmp_mv.as_mv.row >>= 3;
-
-            // Small-range full-pixel motion search
-            bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
-                                               search_range,
-                                               &cpi->fn_ptr[block_size],
-                                               x->nmvjointcost, x->mvcost,
-                                               &ref_mv[id], second_pred,
-                                               pw, ph);
-
-            x->mv_col_min = tmp_col_min;
-            x->mv_col_max = tmp_col_max;
-            x->mv_row_min = tmp_row_min;
-            x->mv_row_max = tmp_row_max;
-
-            if (bestsme < INT_MAX) {
-              int dis; /* TODO: use dis in distortion calculation later. */
-              unsigned int sse;
-
-              bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
-                                                     &ref_mv[id],
-                                                     x->errorperbit,
-                                                     &cpi->fn_ptr[block_size],
-                                                     x->nmvjointcost, x->mvcost,
-                                                     &dis, &sse, second_pred,
-                                                     pw, ph);
-            }
-
-            if (id)
-              xd->plane[0].pre[0] = scaled_first_yv12;
-
-            if (bestsme < last_besterr[id]) {
-              frame_mv[refs[id]].as_int =
-                  xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
-              last_besterr[id] = bestsme;
-            } else {
-              break;
-            }
-          }
-
-          // restore the predictor
-          if (scaled_ref_frame[0]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              xd->plane[i].pre[0] = backup_yv12[i];
-          }
-
-          if (scaled_ref_frame[1]) {
-            int i;
-
-            for (i = 0; i < MAX_MB_PLANE; i++)
-              xd->plane[i].pre[1] = backup_second_yv12[i];
-          }
-
-          vpx_free(second_pred);
-        }
+        if (cpi->sf.comp_inter_joint_serach)
+          iterative_motion_search(cpi, x, bsize, frame_mv, scaled_ref_frame,
+                                  mi_row, mi_col, single_newmv);
 
         if (frame_mv[refs[0]].as_int == INVALID_MV ||
             frame_mv[refs[1]].as_int == INVALID_MV)
@@ -2684,8 +2694,6 @@
     if (this_mode == I4X4_PRED) {
       int rate;
 
-      // Note the rate value returned here includes the cost of coding
-      // the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
       mbmi->txfm_size = TX_4X4;
       rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
                                 &distortion_y, INT64_MAX);
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 08efc84..eb79de1 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -36,6 +36,9 @@
 extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
 #endif  /* ENTROPY_STATS */
 
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
 static int dct_value_cost[DCT_MAX_VALUE * 2];
@@ -228,7 +231,7 @@
       if (!t->skip_eob_node)
         ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt];
     }
-    token_cache[scan[c]] = token;
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++t;
   } while (c < eob && ++c < seg_eob);