Merge "Unified approach for backward probability update."
diff --git a/configure b/configure
index 621161c..729c986 100755
--- a/configure
+++ b/configure
@@ -249,7 +249,6 @@
     unistd_h
 "
 EXPERIMENT_LIST="
-    oneshotq
     multiple_arf
     non420
     alpha
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 63b889d..1fd9e97 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -41,6 +41,7 @@
   VP9_COMMON *cm;
   vp9_reader bit_reader;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
 } TileWorkerData;
 
 static int read_be32(const uint8_t *p) {
@@ -297,6 +298,7 @@
   VP9_COMMON *cm;
   MACROBLOCKD *xd;
   vp9_reader *r;
+  unsigned char* token_cache;
 };
 
 static void predict_and_reconstruct_intra_block(int plane, int block,
@@ -326,7 +328,7 @@
 
   if (!mi->mbmi.skip_coeff) {
     vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
-                            args->r);
+                            args->r, args->token_cache);
     inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
   }
 }
@@ -336,6 +338,7 @@
   MACROBLOCKD *xd;
   vp9_reader *r;
   int *eobtotal;
+  unsigned char* token_cache;
 };
 
 static void reconstruct_inter_block(int plane, int block,
@@ -346,7 +349,8 @@
   MACROBLOCKD *const xd = args->xd;
 
   *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
-                                             plane_bsize, tx_size, args->r);
+                                             plane_bsize, tx_size,
+                                             args->r, args->token_cache);
   inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
 }
 
@@ -398,7 +402,8 @@
 static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                            const TileInfo *const tile,
                            int mi_row, int mi_col,
-                           vp9_reader *r, BLOCK_SIZE bsize) {
+                           vp9_reader *r, BLOCK_SIZE bsize,
+                           unsigned char *token_cache) {
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi;
 
@@ -420,7 +425,7 @@
   }
 
   if (!is_inter_block(mbmi)) {
-    struct intra_args arg = { cm, xd, r };
+    struct intra_args arg = { cm, xd, r, token_cache };
     foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
                               &arg);
   } else {
@@ -438,7 +443,7 @@
     // Reconstruction
     if (!mbmi->skip_coeff) {
       int eobtotal = 0;
-      struct inter_args arg = { cm, xd, r, &eobtotal };
+      struct inter_args arg = { cm, xd, r, &eobtotal, token_cache };
       foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
       if (!less8x8 && eobtotal == 0)
         mbmi->skip_coeff = 1;  // skip loopfilter
@@ -477,7 +482,8 @@
 static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                             const TileInfo *const tile,
                             int mi_row, int mi_col,
-                            vp9_reader* r, BLOCK_SIZE bsize) {
+                            vp9_reader* r, BLOCK_SIZE bsize,
+                            unsigned char *token_cache) {
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
@@ -488,27 +494,33 @@
   partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
   subsize = get_subsize(bsize, partition);
   if (subsize < BLOCK_8X8) {
-    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
         break;
       case PARTITION_HORZ:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
         if (mi_row + hbs < cm->mi_rows)
-          decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+          decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
+                         token_cache);
         break;
       case PARTITION_VERT:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
         if (mi_col + hbs < cm->mi_cols)
-          decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+          decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
+                         token_cache);
         break;
       case PARTITION_SPLIT:
-        decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize);
-        decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
-        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
-        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
+                        token_cache);
+        decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
+                        token_cache);
+        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
+                        token_cache);
+        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
+                        token_cache);
         break;
       default:
         assert(!"Invalid partition type");
@@ -791,7 +803,8 @@
     vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
-      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64);
+      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
+                      pbi->token_cache);
 
     if (pbi->do_loopfilter_inline) {
       const int lf_start = mi_row - MI_BLOCK_SIZE;
@@ -935,7 +948,7 @@
 }
 
 static int tile_worker_hook(void *arg1, void *arg2) {
-  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
+  TileWorkerData *tile_data = (TileWorkerData*)arg1;
   const TileInfo *const tile = (TileInfo*)arg2;
   int mi_row, mi_col;
 
@@ -944,9 +957,11 @@
     vp9_zero(tile_data->xd.left_context);
     vp9_zero(tile_data->xd.left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += MI_BLOCK_SIZE)
+         mi_col += MI_BLOCK_SIZE) {
       decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
-                      mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
+                      mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
+                      tile_data->token_cache);
+    }
   }
   return !tile_data->xd.corrupted;
 }
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 6ecce28..70d0d74 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -70,7 +70,6 @@
                      DCT_EOB_MODEL_TOKEN : TWO_TOKEN) :  \
                     token];                              \
     }                                                    \
-    token_cache[scan[c]] = vp9_pt_energy_class[token];   \
   } while (0)
 
 #define WRITE_COEF_CONTINUE(val, token)                  \
@@ -78,6 +77,7 @@
     qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
                             dq[c > 0] / (1 + (tx_size == TX_32X32)); \
     INCREMENT_COUNT(token);                              \
+    token_cache[scan[c]] = vp9_pt_energy_class[token];   \
     c++;                                                 \
     continue;                                            \
   }
@@ -91,7 +91,8 @@
 static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
                         vp9_reader *r, int block_idx,
                         PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr,
-                        TX_SIZE tx_size, const int16_t *dq, int pt) {
+                        TX_SIZE tx_size, const int16_t *dq, int pt,
+                        uint8_t *token_cache) {
   const FRAME_CONTEXT *const fc = &cm->fc;
   FRAME_COUNTS *const counts = &cm->counts;
   const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
@@ -104,7 +105,6 @@
   vp9_coeff_count_model *coef_counts = counts->coef[tx_size];
   const int16_t *scan, *nb;
   const uint8_t *const band_translate = get_band_translate(tx_size);
-  uint8_t token_cache[1024];
   get_scan(xd, tx_size, type, block_idx, &scan, &nb);
 
   while (1) {
@@ -131,6 +131,7 @@
 
     if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
+      token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN];
       ++c;
       goto SKIP_START;
     }
@@ -212,7 +213,8 @@
 
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
-                            TX_SIZE tx_size, vp9_reader *r) {
+                            TX_SIZE tx_size, vp9_reader *r,
+                            uint8_t *token_cache) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
                                  tx_size);
@@ -223,7 +225,7 @@
 
   eob = decode_coefs(cm, xd, r, block,
                      pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block),
-                     tx_size, pd->dequant, pt);
+                     tx_size, pd->dequant, pt, token_cache);
 
   set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
 
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index 94dd8e4..04939ea 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -17,6 +17,7 @@
 
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
-                            TX_SIZE tx_size, vp9_reader *r);
+                            TX_SIZE tx_size, vp9_reader *r,
+                            uint8_t *token_cache);
 
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index 83ea967..7c4c9db 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -49,6 +49,8 @@
 
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   PARTITION_CONTEXT *above_seg_context;
+
+  DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
 } VP9D_COMP;
 
 #endif  // VP9_DECODER_VP9_ONYXD_INT_H_
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index c83954e..c3dbc86 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -2169,17 +2169,14 @@
       cpi->ni_av_qi = tmp_q;
       cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
 
-#ifndef ONE_SHOT_Q_ESTIMATE
       // Limit the maxq value returned subsequently.
       // This increases the risk of overspend or underspend if the initial
       // estimate for the clip is bad, but helps prevent excessive
       // variation in Q, especially near the end of a clip
       // where for example a small overspend may cause Q to crash
       adjust_maxq_qrange(cpi);
-#endif
     }
 
-#ifndef ONE_SHOT_Q_ESTIMATE
     // The last few frames of a clip almost always have to few or too many
     // bits and for the sake of over exact rate control we dont want to make
     // radical adjustments to the allowed quantizer range just to use up a
@@ -2202,7 +2199,6 @@
       cpi->active_worst_quality =
           adjust_active_maxq(cpi->active_worst_quality, tmp_q);
     }
-#endif
   }
   vp9_zero(this_frame);
   if (EOF == input_stats(cpi, &this_frame))
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 0b05cf2..1d3170a 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1180,7 +1180,6 @@
   int i;
 
   cpi->oxcf = *oxcf;
-  cpi->goldfreq = 7;
 
   cm->version = oxcf->version;
 
@@ -2851,19 +2850,11 @@
     if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
       cpi->active_best_quality = cpi->cq_target_quality;
     } else {
-#ifdef ONE_SHOT_Q_ESTIMATE
-#ifdef STRICT_ONE_SHOT_Q
-      cpi->active_best_quality = q;
-#else
-      cpi->active_best_quality = inter_minq[q];
-#endif
-#else
       cpi->active_best_quality = inter_minq[q];
       // 1-pass: for now, use the average Q for the active_best, if its lower
       // than active_worst.
       if (cpi->pass == 0 && (cpi->avg_frame_qindex < q))
         cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex];
-#endif
 
       // For the constrained quality mode we don't want
       // q to fall below the cq level.
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 44f1e26..9429c7f 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -29,11 +29,6 @@
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/encoder/vp9_lookahead.h"
 
-// Experimental rate control switches
-#if CONFIG_ONESHOTQ
-#define ONE_SHOT_Q_ESTIMATE 0
-#define STRICT_ONE_SHOT_Q 0
-#endif
 #define DISABLE_RC_LONG_TERM_MEM 0
 
 // #define MODE_TEST_HIT_STATS
@@ -506,14 +501,9 @@
   int decimation_count;
 
   // for real time encoding
-  int avg_encode_time;              // microsecond
-  int avg_pick_mode_time;            // microsecond
   int speed;
-  unsigned int cpu_freq;           // Mhz
   int compressor_speed;
 
-  int interquantizer;
-  int goldfreq;
   int auto_worst_q;
   int cpu_used;
   int pass;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 05928e0..993919e 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1092,7 +1092,7 @@
         else
           x->fwd_txm4x4(src_diff, coeff, 8);
 
-        vp9_regular_quantize_b_4x4(x, 16, block, scan, get_iscan_4x4(tx_type));
+        vp9_regular_quantize_b_4x4(x, 4, block, scan, get_iscan_4x4(tx_type));
 
         ratey += cost_coeffs(x, 0, block,
                              tempa + idx, templ + idy, TX_4X4, scan, nb);
@@ -1559,7 +1559,7 @@
       coeff = BLOCK_OFFSET(p->coeff, k);
       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                     coeff, 8);
-      vp9_regular_quantize_b_4x4(x, 16, k, get_scan_4x4(DCT_DCT),
+      vp9_regular_quantize_b_4x4(x, 4, k, get_scan_4x4(DCT_DCT),
                                  get_iscan_4x4(DCT_DCT));
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);
@@ -1871,12 +1871,14 @@
           mi_buf_restore(x, orig_src, orig_pre);
         }
 
-        if (has_second_rf && this_mode == NEWMV &&
-            mbmi->interp_filter == EIGHTTAP) {
+        if (has_second_rf) {
           if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
               seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
             continue;
+        }
 
+        if (has_second_rf && this_mode == NEWMV &&
+            mbmi->interp_filter == EIGHTTAP) {
           // adjust src pointers
           mi_buf_shift(x, i);
           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
@@ -2660,6 +2662,12 @@
   int orig_dst_stride[MAX_MB_PLANE];
   int rs = 0;
 
+  if (is_comp_pred) {
+    if (frame_mv[refs[0]].as_int == INVALID_MV ||
+        frame_mv[refs[1]].as_int == INVALID_MV)
+      return INT64_MAX;
+  }
+
   if (this_mode == NEWMV) {
     int rate_mv;
     if (is_comp_pred) {
@@ -2678,9 +2686,6 @@
                                    &mbmi->ref_mvs[refs[1]][0].as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
-      if (frame_mv[refs[0]].as_int == INVALID_MV ||
-          frame_mv[refs[1]].as_int == INVALID_MV)
-        return INT64_MAX;
       *rate2 += rate_mv;
     } else {
       int_mv tmp_mv;