Merge "Add missing calls to emms in the adaptive quantization code"

diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index f456abc..5abb9b1 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc

@@ -113,8 +113,7 @@
       test_input_block[j] = src[j] - dst[j];
     }
 
-    const int pitch = 64;
-    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
+    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
     REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
 
     for (int j = 0; j < kNumCoeffs; ++j) {
@@ -150,9 +149,9 @@
     for (int j = 0; j < kNumCoeffs; ++j)
       input_block[j] = rnd.Rand8() - rnd.Rand8();
 
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
-    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));
+    const int stride = 32;
+    vp9_short_fdct32x32_c(input_block, output_ref_block, stride);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride));
 
     if (version_ == 0) {
       for (int j = 0; j < kNumCoeffs; ++j)
@@ -188,9 +187,9 @@
       for (int j = 0; j < kNumCoeffs; ++j)
         input_extreme_block[j] = -255;
 
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
-    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));
+    const int stride = 32;
+    vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, stride);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, stride));
 
     // The minimum quant value is 4.
     for (int j = 0; j < kNumCoeffs; ++j) {

diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 8ca4f5f..78e54e2 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c

@@ -1062,7 +1062,7 @@
     if (cpi->common.frame_type == KEY_FRAME)
     {
         /* Reset to default counts/probabilities at key frames */
-        vp8_copy(cpi->coef_counts, default_coef_counts);
+        vp8_copy(cpi->mb.coef_counts, default_coef_counts);
     }
 
     if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)

diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 5c8c03e..3111852 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h

@@ -146,8 +146,6 @@
   TX_MODE tx_mode;
 
   int base_qindex;
-  int last_kf_gf_q;  /* Q used on the last GF or KF */
-
   int y_dc_delta_q;
   int uv_dc_delta_q;
   int uv_ac_delta_q;

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 526be87..af96bb3 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -701,10 +701,10 @@
 prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct4x4 sse2
 
-prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
+prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int stride"
 specialize vp9_short_fdct32x32 sse2
 
-prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
+prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int stride"
 specialize vp9_short_fdct32x32_rd sse2
 
 prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"

diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 6bfd8f8..ef30404 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c

@@ -76,7 +76,7 @@
 }
 
 
-const vp9_tree_index vp9_segment_tree[14] = {
+const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
   2,  4,  6,  8, 10, 12,
   0, -1, -2, -3, -4, -5, -6, -7
 };

diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index f22239b..eb38c06 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h

@@ -76,7 +76,7 @@
                     int segment_id,
                     SEG_LVL_FEATURES feature_id);
 
-extern const vp9_tree_index vp9_segment_tree[14];
+extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 
 #endif  // VP9_COMMON_VP9_SEG_COMMON_H_
 

diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index b6555bc..00a2903 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c

@@ -1315,8 +1315,7 @@
   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
 }
 
-void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
-  int shortpitch = pitch >> 1;
+void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int stride) {
   int i, j;
   int output[32 * 32];
 
@@ -1324,7 +1323,7 @@
   for (i = 0; i < 32; ++i) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
-      temp_in[j] = input[j * shortpitch + i] * 4;
+      temp_in[j] = input[j * stride + i] * 4;
     dct32_1d(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
@@ -1344,8 +1343,7 @@
 // Note that although we use dct_32_round in dct32_1d computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
-void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
-  int shortpitch = pitch >> 1;
+void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) {
   int i, j;
   int output[32 * 32];
 
@@ -1353,7 +1351,7 @@
   for (i = 0; i < 32; ++i) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
-      temp_in[j] = input[j * shortpitch + i] * 4;
+      temp_in[j] = input[j * stride + i] * 4;
     dct32_1d(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       // TODO(cd): see quality impact of only doing

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index c1e1a0d..2b5451b 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c

@@ -365,9 +365,9 @@
       yoff = 32 * (block >> twl);
       src_diff = p->src_diff + 4 * bw * yoff + xoff;
       if (x->use_lp32x32fdct)
-        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4);
       else
-        vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+        vp9_short_fdct32x32(src_diff, coeff, bw * 4);
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -532,9 +532,9 @@
       vp9_subtract_block(32, 32, src_diff, bw * 4,
                          src, p->src.stride, dst, pd->dst.stride);
       if (x->use_lp32x32fdct)
-        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4);
       else
-        vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+        vp9_short_fdct32x32(src_diff, coeff, bw * 4);
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, p->zbin_extra, eob, scan, iscan);

diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 2f147a0..ea4c9e8 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c

@@ -61,6 +61,7 @@
     best_err = cpi->find_fractional_mv_step(
         x,
         &dst_mv->as_mv, &ref_mv->as_mv,
+        xd->allow_high_precision_mv,
         x->errorperbit, &v_fn_ptr,
         0, cpi->sf.subpel_iters_per_step, NULL, NULL,
         & distortion, &sse);

diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 561c725..a52f5b1 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c

@@ -275,6 +275,7 @@
 
 int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
                                       MV *bestmv, const MV *ref_mv,
+                                      int allow_hp,
                                       int error_per_bit,
                                       const vp9_variance_fn_ptr_t *vfp,
                                       int forced_stop,
@@ -348,8 +349,7 @@
     }
   }
 
-  if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
-      forced_stop == 0) {
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
     hstep >>= 1;
     while (eighthiters--) {
       FIRST_LEVEL_CHECKS;
@@ -373,6 +373,7 @@
 
 int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
                                  MV *bestmv, const MV *ref_mv,
+                                 int allow_hp,
                                  int error_per_bit,
                                  const vp9_variance_fn_ptr_t *vfp,
                                  int forced_stop,
@@ -436,8 +437,7 @@
     tc = bc;
   }
 
-  if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
-      forced_stop == 0) {
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
@@ -465,6 +465,7 @@
 
 int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
                                            MV *bestmv, const MV *ref_mv,
+                                           int allow_hp,
                                            int error_per_bit,
                                            const vp9_variance_fn_ptr_t *vfp,
                                            int forced_stop,
@@ -544,8 +545,7 @@
     }
   }
 
-  if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
-      forced_stop == 0) {
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
     hstep >>= 1;
     while (eighthiters--) {
       FIRST_LEVEL_CHECKS;
@@ -568,6 +568,7 @@
 
 int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
                                       MV *bestmv, const MV *ref_mv,
+                                      int allow_hp,
                                       int error_per_bit,
                                       const vp9_variance_fn_ptr_t *vfp,
                                       int forced_stop,
@@ -642,8 +643,7 @@
     tc = bc;
   }
 
-  if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
-      forced_stop == 0) {
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {

diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 77c157c..bcab679 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h

@@ -74,6 +74,7 @@
 typedef int (fractional_mv_step_fp) (
     MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
+    int allow_hp,
     int error_per_bit,
     const vp9_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
@@ -88,6 +89,7 @@
 typedef int (fractional_mv_step_comp_fp) (
     MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
+    int allow_hp,
     int error_per_bit,
     const vp9_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only

diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 2b1caf4..54b3d43 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c

@@ -3386,11 +3386,6 @@
 #if 0
   output_frame_level_debug_stats(cpi);
 #endif
-  // If this was a kf or Gf note the Q
-  if ((cm->frame_type == KEY_FRAME)
-      || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
-    cm->last_kf_gf_q = cm->base_qindex;
-
   if (cpi->refresh_golden_frame == 1)
     cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
   else

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f192968..7add494 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -1870,6 +1870,7 @@
             cpi->find_fractional_mv_step(x,
                                          &mode_mv[NEWMV].as_mv,
                                          &bsi->ref_mv->as_mv,
+                                         x->e_mbd.allow_high_precision_mv,
                                          x->errorperbit, v_fn_ptr,
                                          0, cpi->sf.subpel_iters_per_step,
                                          x->nmvjointcost, x->mvcost,
@@ -2450,6 +2451,7 @@
     int dis;  /* TODO: use dis in distortion calculation later. */
     unsigned int sse;
     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+                                 xd->allow_high_precision_mv,
                                  x->errorperbit,
                                  &cpi->fn_ptr[block_size],
                                  0, cpi->sf.subpel_iters_per_step,
@@ -2585,6 +2587,7 @@
       bestsme = cpi->find_fractional_mv_step_comp(
           x, &tmp_mv.as_mv,
           &ref_mv[id].as_mv,
+          xd->allow_high_precision_mv,
           x->errorperbit,
           &cpi->fn_ptr[block_size],
           0, cpi->sf.subpel_iters_per_step,

diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 6ff0de4..5cf8143 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c

@@ -166,6 +166,7 @@
     // Ignore mv costing by sending NULL pointer instead of cost array
     bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv,
                                            &best_ref_mv1.as_mv,
+                                           xd->allow_high_precision_mv,
                                            x->errorperbit,
                                            &cpi->fn_ptr[BLOCK_16X16],
                                            0, cpi->sf.subpel_iters_per_step,

diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 11eec7f..de47a5b 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c

@@ -30,11 +30,11 @@
 #endif
 
 void FDCT32x32_2D(int16_t *input,
-                  int16_t *output_org, int pitch) {
+                  int16_t *output_org, int stride) {
   // Calculate pre-multiplied strides
-  const int str1 = pitch >> 1;
-  const int str2 = pitch;
-  const int str3 = pitch + str1;
+  const int str1 = stride;
+  const int str2 = 2 * stride;
+  const int str3 = 2 * stride + str1;
   // We need an intermediate buffer between passes.
   DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
   // Constants