Merge "Use lookup table to simplify logic"
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 3d61d40..b990bf8 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -395,8 +395,7 @@
       for (int j = 0; j < kNumCoeffs; ++j)
         coeff[j] = round(out_r[j]);
 
-      const int pitch = 32;
-      REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch));
+      REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
 
       for (int j = 0; j < kNumCoeffs; ++j) {
         const uint32_t diff = dst[j] - src[j];
@@ -421,7 +420,7 @@
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_  = GET_PARAM(2);
-    pitch_    = 32;
+    pitch_    = 16;
     fwd_txfm_ref = fdct16x16_ref;
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
@@ -431,7 +430,7 @@
     fwd_txfm_(in, out, stride);
   }
   void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride >> 1);
+    inv_txfm_(out, dst, stride);
   }
 
   fdct_t fwd_txfm_;
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index f456abc..5abb9b1 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -113,8 +113,7 @@
       test_input_block[j] = src[j] - dst[j];
     }
 
-    const int pitch = 64;
-    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
+    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
     REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
 
     for (int j = 0; j < kNumCoeffs; ++j) {
@@ -150,9 +149,9 @@
     for (int j = 0; j < kNumCoeffs; ++j)
       input_block[j] = rnd.Rand8() - rnd.Rand8();
 
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
-    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));
+    const int stride = 32;
+    vp9_short_fdct32x32_c(input_block, output_ref_block, stride);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride));
 
     if (version_ == 0) {
       for (int j = 0; j < kNumCoeffs; ++j)
@@ -188,9 +187,9 @@
       for (int j = 0; j < kNumCoeffs; ++j)
         input_extreme_block[j] = -255;
 
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
-    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));
+    const int stride = 32;
+    vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, stride);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, stride));
 
     // The minimum quant value is 4.
     for (int j = 0; j < kNumCoeffs; ++j) {
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 8ca4f5f..78e54e2 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1062,7 +1062,7 @@
     if (cpi->common.frame_type == KEY_FRAME)
     {
         /* Reset to default counts/probabilities at key frames */
-        vp8_copy(cpi->coef_counts, default_coef_counts);
+        vp8_copy(cpi->mb.coef_counts, default_coef_counts);
     }
 
     if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 0a441bd..36e7e83 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -234,31 +234,6 @@
   int q_index;
 } MACROBLOCKD;
 
-static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
-  switch (subsize) {
-    case BLOCK_64X64:
-    case BLOCK_64X32:
-    case BLOCK_32X64:
-    case BLOCK_32X32:
-      return &xd->sb_index;
-    case BLOCK_32X16:
-    case BLOCK_16X32:
-    case BLOCK_16X16:
-      return &xd->mb_index;
-    case BLOCK_16X8:
-    case BLOCK_8X16:
-    case BLOCK_8X8:
-      return &xd->b_index;
-    case BLOCK_8X4:
-    case BLOCK_4X8:
-    case BLOCK_4X4:
-      return &xd->ab_index;
-    default:
-      assert(0);
-      return NULL;
-  }
-}
-
 static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type,
                                             BLOCK_SIZE sb_size) {
   const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 5c8c03e..3111852 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -146,8 +146,6 @@
   TX_MODE tx_mode;
 
   int base_qindex;
-  int last_kf_gf_q;  /* Q used on the last GF or KF */
-
   int y_dc_delta_q;
   int uv_dc_delta_q;
   int uv_ac_delta_q;
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 526be87..6fa9e22 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -701,13 +701,13 @@
 prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct4x4 sse2
 
-prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
+prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int stride"
 specialize vp9_short_fdct32x32 sse2
 
-prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
+prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int stride"
 specialize vp9_short_fdct32x32_rd sse2
 
-prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
+prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int stride"
 specialize vp9_short_fdct16x16 sse2
 
 prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 6bfd8f8..ef30404 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -76,7 +76,7 @@
 }
 
 
-const vp9_tree_index vp9_segment_tree[14] = {
+const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
   2,  4,  6,  8, 10, 12,
   0, -1, -2, -3, -4, -5, -6, -7
 };
diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index f22239b..eb38c06 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -76,7 +76,7 @@
                     int segment_id,
                     SEG_LVL_FEATURES feature_id);
 
-extern const vp9_tree_index vp9_segment_tree[14];
+extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 
 #endif  // VP9_COMMON_VP9_SEG_COMMON_H_
 
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 5bfe7b7..16e07a5 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -421,11 +421,12 @@
   mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
 }
 
-static INLINE void assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
+static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
                              int_mv mv[2], int_mv best_mv[2],
                              int_mv nearest_mv[2], int_mv near_mv[2],
                              int is_compound, int allow_hp, vp9_reader *r) {
   int i;
+  int ret = 1;
 
   switch (mode) {
     case NEWMV:
@@ -434,6 +435,10 @@
        if (is_compound)
          read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv,
                  &cm->fc.nmvc, &cm->counts.mv, allow_hp);
+       for (i = 0; i < 1 + is_compound; ++i) {
+         ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW;
+         ret = ret && mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW;
+       }
        break;
     case NEARESTMV:
       mv[0].as_int = nearest_mv[0].as_int;
@@ -451,13 +456,9 @@
         mv[1].as_int = 0;
       break;
     default:
-      assert(!"Invalid inter mode value.");
+      return 0;
   }
-
-  for (i = 0; i < 1 + is_compound; ++i) {
-    assert(mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW);
-    assert(mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW);
-  }
+  return ret;
 }
 
 static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
@@ -553,8 +554,12 @@
                                           mi_row, mi_col);
         }
 
-        assign_mv(cm, b_mode, block, best, nearest, nearmv,
-                  is_compound, allow_hp, r);
+        if (!assign_mv(cm, b_mode, block, best, nearest, nearmv,
+                       is_compound, allow_hp, r)) {
+          xd->corrupted |= 1;
+          break;
+        }
+
 
         mi->bmi[j].as_mv[0].as_int = block[0].as_int;
         if (is_compound)
@@ -572,8 +577,9 @@
     mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   } else {
-    assign_mv(cm, mbmi->mode, mbmi->mv, best, nearest, nearmv,
-              is_compound, allow_hp, r);
+    xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv,
+                                best, nearest, nearmv,
+                                is_compound, allow_hp, r);
   }
 }
 
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index b914de7..ec310f4 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -224,14 +224,14 @@
 
 static void decode_modes_b(VP9D_COMP *pbi, int tile_col,
                            int mi_row, int mi_col,
-                           vp9_reader *r, BLOCK_SIZE bsize) {
+                           vp9_reader *r, BLOCK_SIZE bsize, int index) {
   MACROBLOCKD *const xd = &pbi->mb;
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi;
   int eobtotal;
 
   if (less8x8)
-    if (xd->ab_index > 0)
+    if (index > 0)
       return;
 
   set_offsets(pbi, bsize, tile_col, mi_row, mi_col);
@@ -271,9 +271,10 @@
   xd->corrupted |= vp9_reader_has_error(r);
 }
 
+
 static void decode_modes_sb(VP9D_COMP *pbi, int tile_col,
                             int mi_row, int mi_col,
-                            vp9_reader* r, BLOCK_SIZE bsize) {
+                            vp9_reader* r, BLOCK_SIZE bsize, int index) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
@@ -284,7 +285,7 @@
     return;
 
   if (bsize < BLOCK_8X8) {
-    if (xd->ab_index != 0)
+    if (index > 0)
       return;
   } else {
     int pl;
@@ -306,31 +307,27 @@
   }
 
   subsize = get_subsize(bsize, partition);
-  *get_sb_index(xd, subsize) = 0;
 
   switch (partition) {
     case PARTITION_NONE:
-      decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize);
+      decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize, 0);
       break;
     case PARTITION_HORZ:
-      decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize);
-      *get_sb_index(xd, subsize) = 1;
+      decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize, 0);
       if (mi_row + hbs < cm->mi_rows)
-        decode_modes_b(pbi, tile_col, mi_row + hbs, mi_col, r, subsize);
+        decode_modes_b(pbi, tile_col, mi_row + hbs, mi_col, r, subsize, 1);
       break;
     case PARTITION_VERT:
-      decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize);
-      *get_sb_index(xd, subsize) = 1;
+      decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize, 0);
       if (mi_col + hbs < cm->mi_cols)
-        decode_modes_b(pbi, tile_col, mi_row, mi_col + hbs, r, subsize);
+        decode_modes_b(pbi, tile_col, mi_row, mi_col + hbs, r, subsize, 1);
       break;
     case PARTITION_SPLIT: {
       int n;
       for (n = 0; n < 4; n++) {
         const int j = n >> 1, i = n & 1;
-        *get_sb_index(xd, subsize) = n;
         decode_modes_sb(pbi, tile_col, mi_row + j * hbs, mi_col + i * hbs,
-                        r, subsize);
+                        r, subsize, n);
       }
     } break;
     default:
@@ -611,7 +608,7 @@
     vp9_zero(cm->left_seg_context);
     for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
          mi_col += MI_BLOCK_SIZE)
-      decode_modes_sb(pbi, tile_col, mi_row, mi_col, r, BLOCK_64X64);
+      decode_modes_sb(pbi, tile_col, mi_row, mi_col, r, BLOCK_64X64, 0);
 
     if (pbi->do_loopfilter_inline) {
       // delay the loopfilter by 1 macroblock row.
@@ -802,6 +799,7 @@
                                        struct vp9_read_bit_buffer *rb) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
+  size_t sz;
   int i;
 
   cm->last_frame_type = cm->frame_type;
@@ -909,8 +907,9 @@
   setup_segmentation(&cm->seg, rb);
 
   setup_tile_info(cm, rb);
+  sz = vp9_rb_read_literal(rb, 16);
 
-  return vp9_rb_read_literal(rb, 16);
+  return sz > 0 ? sz : -1;
 }
 
 static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index bfac5a7..ed795f0 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -565,13 +565,13 @@
 
 static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
                           TOKENEXTRA **tok, TOKENEXTRA *tok_end,
-                          int mi_row, int mi_col) {
+                          int mi_row, int mi_col, int index) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   MODE_INFO *m = mi_8x8[0];
 
   if (m->mbmi.sb_type < BLOCK_8X8)
-    if (xd->ab_index > 0)
+    if (index > 0)
       return;
 
   xd->mi_8x8 = mi_8x8;
@@ -597,7 +597,8 @@
 
 static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
                            TOKENEXTRA **tok, TOKENEXTRA *tok_end,
-                           int mi_row, int mi_col, BLOCK_SIZE bsize) {
+                           int mi_row, int mi_col, BLOCK_SIZE bsize,
+                           int index) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
   const int mis = cm->mode_info_stride;
@@ -613,11 +614,10 @@
 
   partition = partition_lookup[bsl][m->mbmi.sb_type];
 
-  if (bsize < BLOCK_8X8)
-    if (xd->ab_index > 0)
+  if (bsize < BLOCK_8X8) {
+    if (index > 0)
       return;
-
-  if (bsize >= BLOCK_8X8) {
+  } else {
     int pl;
     const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols,
                                          mi_row, mi_col);
@@ -634,31 +634,28 @@
   }
 
   subsize = get_subsize(bsize, partition);
-  *(get_sb_index(xd, subsize)) = 0;
 
   switch (partition) {
     case PARTITION_NONE:
-      write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
       break;
     case PARTITION_HORZ:
-      write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
-      *(get_sb_index(xd, subsize)) = 1;
+      write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
       if ((mi_row + bs) < cm->mi_rows)
         write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs,
-                      mi_col);
+                      mi_col, 1);
       break;
     case PARTITION_VERT:
-      write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
-      *(get_sb_index(xd, subsize)) = 1;
+      write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
       if ((mi_col + bs) < cm->mi_cols)
-        write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs);
+        write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs,
+                      1);
       break;
     case PARTITION_SPLIT:
       for (n = 0; n < 4; n++) {
-        int j = n >> 1, i = n & 0x01;
-        *(get_sb_index(xd, subsize)) = n;
+        const int j = n >> 1, i = n & 1;
         write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end,
-                       mi_row + j * bs, mi_col + i * bs, subsize);
+                       mi_row + j * bs, mi_col + i * bs, subsize, n);
       }
       break;
     default:
@@ -690,7 +687,7 @@
     for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
          mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) {
       write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col,
-                     BLOCK_64X64);
+                     BLOCK_64X64, 0);
     }
   }
 }
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index b6555bc..23c652d 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -302,14 +302,13 @@
   }
 }
 
-void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
   // as the first pass results are transposed, we tranpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
-  const int stride = pitch >> 1;
   int pass;
   // We need an intermediate buffer between passes.
   int16_t intermediate[256];
@@ -1315,8 +1314,7 @@
   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
 }
 
-void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
-  int shortpitch = pitch >> 1;
+void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int stride) {
   int i, j;
   int output[32 * 32];
 
@@ -1324,7 +1322,7 @@
   for (i = 0; i < 32; ++i) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
-      temp_in[j] = input[j * shortpitch + i] * 4;
+      temp_in[j] = input[j * stride + i] * 4;
     dct32_1d(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
@@ -1344,8 +1342,7 @@
 // Note that although we use dct_32_round in dct32_1d computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
-void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
-  int shortpitch = pitch >> 1;
+void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) {
   int i, j;
   int output[32 * 32];
 
@@ -1353,7 +1350,7 @@
   for (i = 0; i < 32; ++i) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
-      temp_in[j] = input[j * shortpitch + i] * 4;
+      temp_in[j] = input[j * stride + i] * 4;
     dct32_1d(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       // TODO(cd): see quality impact of only doing
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 081f331..6e8e1d1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -50,6 +50,31 @@
 int enc_debug = 0;
 #endif
 
+static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
+  switch (subsize) {
+    case BLOCK_64X64:
+    case BLOCK_64X32:
+    case BLOCK_32X64:
+    case BLOCK_32X32:
+      return &xd->sb_index;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
+      return &xd->mb_index;
+    case BLOCK_16X8:
+    case BLOCK_8X16:
+    case BLOCK_8X8:
+      return &xd->b_index;
+    case BLOCK_8X4:
+    case BLOCK_4X8:
+    case BLOCK_4X4:
+      return &xd->ab_index;
+    default:
+      assert(0);
+      return NULL;
+  }
+}
+
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize);
 
@@ -554,7 +579,10 @@
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int orig_rdmult = x->rdmult;
-  double rdmult_ratio = 1.0;
+  double rdmult_ratio;
+
+  vp9_clear_system_state();  // __asm emms;
+  rdmult_ratio = 1.0;  // avoid uninitialized warnings
 
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
   x->use_lp32x32fdct = 1;
@@ -593,7 +621,10 @@
   if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
     vp9_activity_masking(cpi, x);
 
-  x->rdmult = round(x->rdmult * rdmult_ratio);
+  if (cpi->sf.variance_adaptive_quantization) {
+    vp9_clear_system_state();  // __asm emms;
+    x->rdmult = round(x->rdmult * rdmult_ratio);
+  }
 
   // Find best coding mode & reconstruct the MB so it is available
   // as a predictor for MBs that follow in the SB
@@ -609,9 +640,13 @@
                                     totaldist, bsize, ctx, best_rd);
   }
 
-  x->rdmult = orig_rdmult;
-  if (*totalrate != INT_MAX)
-    *totalrate = round(*totalrate * rdmult_ratio);
+  if (cpi->sf.variance_adaptive_quantization) {
+    x->rdmult = orig_rdmult;
+    if (*totalrate != INT_MAX) {
+      vp9_clear_system_state();  // __asm emms;
+      *totalrate = round(*totalrate * rdmult_ratio);
+    }
+  }
 }
 
 static void update_stats(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index c1e1a0d..13d8aa8 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -365,9 +365,9 @@
       yoff = 32 * (block >> twl);
       src_diff = p->src_diff + 4 * bw * yoff + xoff;
       if (x->use_lp32x32fdct)
-        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4);
       else
-        vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+        vp9_short_fdct32x32(src_diff, coeff, bw * 4);
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -379,7 +379,7 @@
       xoff = 16 * (block & twmask);
       yoff = 16 * (block >> twl);
       src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      vp9_short_fdct16x16(src_diff, coeff, bw * 8);
+      vp9_short_fdct16x16(src_diff, coeff, bw * 4);
       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -532,9 +532,9 @@
       vp9_subtract_block(32, 32, src_diff, bw * 4,
                          src, p->src.stride, dst, pd->dst.stride);
       if (x->use_lp32x32fdct)
-        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4);
       else
-        vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+        vp9_short_fdct32x32(src_diff, coeff, bw * 4);
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -559,7 +559,7 @@
       if (tx_type != DCT_DCT)
         vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
       else
-        vp9_short_fdct16x16(src_diff, coeff, bw * 8);
+        vp9_short_fdct16x16(src_diff, coeff, bw * 4);
       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 7157451..caf4162 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -554,7 +554,10 @@
       int this_error;
       int gf_motion_error = INT_MAX;
       int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-      double error_weight = 1.0;
+      double error_weight;
+
+      vp9_clear_system_state();  // __asm emms;
+      error_weight = 1.0;  // avoid uninitialized warnings
 
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
@@ -587,7 +590,11 @@
       }
 
       // do intra 16x16 prediction
-      this_error = error_weight * vp9_encode_intra(x, use_dc_pred);
+      this_error = vp9_encode_intra(x, use_dc_pred);
+      if (cpi->sf.variance_adaptive_quantization) {
+        vp9_clear_system_state();  // __asm emms;
+        this_error *= error_weight;
+      }
 
       // intrapenalty below deals with situations where the intra and inter
       // error scores are very low (eg a plain black frame).
@@ -622,7 +629,10 @@
         first_pass_motion_search(cpi, x, &best_ref_mv,
                                  &mv.as_mv, lst_yv12,
                                  &motion_error, recon_yoffset);
-        motion_error *= error_weight;
+        if (cpi->sf.variance_adaptive_quantization) {
+          vp9_clear_system_state();  // __asm emms;
+          motion_error *= error_weight;
+        }
 
         // If the current best reference mv is not centered on 0,0 then do a 0,0
         // based search as well.
@@ -630,7 +640,10 @@
           tmp_err = INT_MAX;
           first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
                                    lst_yv12, &tmp_err, recon_yoffset);
-          tmp_err *= error_weight;
+          if (cpi->sf.variance_adaptive_quantization) {
+            vp9_clear_system_state();  // __asm emms;
+            tmp_err *= error_weight;
+          }
 
           if (tmp_err < motion_error) {
             motion_error = tmp_err;
@@ -647,7 +660,10 @@
           first_pass_motion_search(cpi, x, &zero_ref_mv,
                                    &tmp_mv.as_mv, gld_yv12,
                                    &gf_motion_error, recon_yoffset);
-          gf_motion_error *= error_weight;
+          if (cpi->sf.variance_adaptive_quantization) {
+            vp9_clear_system_state();  // __asm emms;
+            gf_motion_error *= error_weight;
+          }
 
           if ((gf_motion_error < motion_error) &&
               (gf_motion_error < this_error)) {
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 2f147a0..ea4c9e8 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -61,6 +61,7 @@
     best_err = cpi->find_fractional_mv_step(
         x,
         &dst_mv->as_mv, &ref_mv->as_mv,
+        xd->allow_high_precision_mv,
         x->errorperbit, &v_fn_ptr,
         0, cpi->sf.subpel_iters_per_step, NULL, NULL,
         & distortion, &sse);
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 561c725..a52f5b1 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -275,6 +275,7 @@
 
 int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
                                       MV *bestmv, const MV *ref_mv,
+                                      int allow_hp,
                                       int error_per_bit,
                                       const vp9_variance_fn_ptr_t *vfp,
                                       int forced_stop,
@@ -348,8 +349,7 @@
     }
   }
 
-  if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
-      forced_stop == 0) {
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
     hstep >>= 1;
     while (eighthiters--) {
       FIRST_LEVEL_CHECKS;
@@ -373,6 +373,7 @@
 
 int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
                                  MV *bestmv, const MV *ref_mv,
+                                 int allow_hp,
                                  int error_per_bit,
                                  const vp9_variance_fn_ptr_t *vfp,
                                  int forced_stop,
@@ -436,8 +437,7 @@
     tc = bc;
   }
 
-  if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
-      forced_stop == 0) {
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
@@ -465,6 +465,7 @@
 
 int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
                                            MV *bestmv, const MV *ref_mv,
+                                           int allow_hp,
                                            int error_per_bit,
                                            const vp9_variance_fn_ptr_t *vfp,
                                            int forced_stop,
@@ -544,8 +545,7 @@
     }
   }
 
-  if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
-      forced_stop == 0) {
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
     hstep >>= 1;
     while (eighthiters--) {
       FIRST_LEVEL_CHECKS;
@@ -568,6 +568,7 @@
 
 int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
                                       MV *bestmv, const MV *ref_mv,
+                                      int allow_hp,
                                       int error_per_bit,
                                       const vp9_variance_fn_ptr_t *vfp,
                                       int forced_stop,
@@ -642,8 +643,7 @@
     tc = bc;
   }
 
-  if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
-      forced_stop == 0) {
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 77c157c..bcab679 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -74,6 +74,7 @@
 typedef int (fractional_mv_step_fp) (
     MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
+    int allow_hp,
     int error_per_bit,
     const vp9_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
@@ -88,6 +89,7 @@
 typedef int (fractional_mv_step_comp_fp) (
     MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
+    int allow_hp,
     int error_per_bit,
     const vp9_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 2b1caf4..54b3d43 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -3386,11 +3386,6 @@
 #if 0
   output_frame_level_debug_stats(cpi);
 #endif
-  // If this was a kf or Gf note the Q
-  if ((cm->frame_type == KEY_FRAME)
-      || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
-    cm->last_kf_gf_q = cm->base_qindex;
-
   if (cpi->refresh_golden_frame == 1)
     cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
   else
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3eb14c8..30cdb3f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1860,6 +1860,7 @@
             cpi->find_fractional_mv_step(x,
                                          &mode_mv[NEWMV].as_mv,
                                          &bsi->ref_mv->as_mv,
+                                         x->e_mbd.allow_high_precision_mv,
                                          x->errorperbit, v_fn_ptr,
                                          0, cpi->sf.subpel_iters_per_step,
                                          x->nmvjointcost, x->mvcost,
@@ -2440,6 +2441,7 @@
     int dis;  /* TODO: use dis in distortion calculation later. */
     unsigned int sse;
     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+                                 xd->allow_high_precision_mv,
                                  x->errorperbit,
                                  &cpi->fn_ptr[block_size],
                                  0, cpi->sf.subpel_iters_per_step,
@@ -2575,6 +2577,7 @@
       bestsme = cpi->find_fractional_mv_step_comp(
           x, &tmp_mv.as_mv,
           &ref_mv[id].as_mv,
+          xd->allow_high_precision_mv,
           x->errorperbit,
           &cpi->fn_ptr[block_size],
           0, cpi->sf.subpel_iters_per_step,
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 6ff0de4..5cf8143 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -166,6 +166,7 @@
     // Ignore mv costing by sending NULL pointer instead of cost array
     bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv,
                                            &best_ref_mv1.as_mv,
+                                           xd->allow_high_precision_mv,
                                            x->errorperbit,
                                            &cpi->fn_ptr[BLOCK_16X16],
                                            0, cpi->sf.subpel_iters_per_step,
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c
index 3d3b4b0..3179ae3 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_vaq.c
@@ -37,25 +37,36 @@
 
 unsigned int vp9_vaq_segment_id(int energy) {
   ENERGY_IN_BOUNDS(energy);
+
   return SEGMENT_ID(energy);
 }
 
 double vp9_vaq_rdmult_ratio(int energy) {
   ENERGY_IN_BOUNDS(energy);
+
+  vp9_clear_system_state();  // __asm emms;
+
   return RDMULT_RATIO(energy);
 }
 
 double vp9_vaq_inv_q_ratio(int energy) {
   ENERGY_IN_BOUNDS(energy);
+
+  vp9_clear_system_state();  // __asm emms;
+
   return Q_RATIO(-energy);
 }
 
 void vp9_vaq_init() {
   int i;
-  double base_ratio = 1.8;
+  double base_ratio;
 
   assert(ENERGY_SPAN <= MAX_SEGMENTS);
 
+  vp9_clear_system_state();  // __asm emms;
+
+  base_ratio = 1.8;
+
   for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
     Q_RATIO(i) = pow(base_ratio, i/3.0);
   }
@@ -74,6 +85,8 @@
 
   seg->abs_delta = SEGMENT_DELTADATA;
 
+  vp9_clear_system_state();  // __asm emms;
+
   for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
     int qindex_delta, segment_rdmult;
 
@@ -89,6 +102,7 @@
 
     segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
                                          cm->y_dc_delta_q);
+
     RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
   }
 }
@@ -120,9 +134,14 @@
 }
 
 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  double energy;
+  unsigned int var = block_variance(cpi, x, bs);
+
+  vp9_clear_system_state();  // __asm emms;
+
   // if (var <= 1000)
   //   return 0;
-  unsigned int var = block_variance(cpi, x, bs);
-  double energy = 0.9*(logf(var + 1) - 10.0);
+
+  energy = 0.9*(logf(var + 1) - 10.0);
   return clamp(round(energy), ENERGY_MIN, ENERGY_MAX);
 }
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 11eec7f..de47a5b 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -30,11 +30,11 @@
 #endif
 
 void FDCT32x32_2D(int16_t *input,
-                  int16_t *output_org, int pitch) {
+                  int16_t *output_org, int stride) {
   // Calculate pre-multiplied strides
-  const int str1 = pitch >> 1;
-  const int str2 = pitch;
-  const int str3 = pitch + str1;
+  const int str1 = stride;
+  const int str2 = 2 * stride;
+  const int str3 = 2 * stride + str1;
   // We need an intermediate buffer between passes.
   DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
   // Constants
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 5e1e5ed..457883f 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -1056,14 +1056,13 @@
   write_buffer_8x8(output, in, 8);
 }
 
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
   // as the first pass results are transposed, we tranpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
-  const int stride = pitch >> 1;
   int pass;
   // We need an intermediate buffer between passes.
   DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);