Merge "Some rate control adjustments to control overshoot"

diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index dea29c0..d5cd46c 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c

@@ -677,7 +677,7 @@
       vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
       vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
       vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
-      vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
+      vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
       vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
       if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0)) {
         die_codec(&codec, "Failed to set SVC");
@@ -712,6 +712,8 @@
                         layer_id.temporal_layer_id);
     }
     flags = layer_flags[frame_cnt % flag_periodicity];
+    if (layering_mode == 0)
+      flags = 0;
     frame_avail = vpx_img_read(&raw, infile);
     if (frame_avail)
       ++rc.layer_input_frames[layer_id.temporal_layer_id];

diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c
index 7094a01..3cd9f44 100644
--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c

@@ -50,39 +50,25 @@
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
   const int step = 1 << (tx_size << 1);
-  int i;
+  int i = 0, r, c;
 
   // If mb_to_right_edge is < 0 we are in a situation in which
   // the current block size extends into the UMV and we won't
   // visit the sub blocks that are wholly within the UMV.
-  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
-    int r, c;
+  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
-    int max_blocks_wide = num_4x4_w;
-    int max_blocks_high = num_4x4_h;
-
-    // xd->mb_to_right_edge is in units of pixels * 8.  This converts
-    // it to 4x4 block sizes.
-    if (xd->mb_to_right_edge < 0)
-      max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-
-    if (xd->mb_to_bottom_edge < 0)
-      max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
-    i = 0;
-    // Unlike the normal case - in here we have to keep track of the
-    // row and column of the blocks we use so that we know if we are in
-    // the unrestricted motion border.
-    for (r = 0; r < num_4x4_h; r += (1 << tx_size)) {
-      for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
-        if (r < max_blocks_high && c < max_blocks_wide)
-          visit(plane, i, plane_bsize, tx_size, arg);
-        i += step;
-      }
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
+    for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
+      // Skip visiting the sub blocks that are wholly within the UMV.
+      if (c < max_blocks_wide)
+        visit(plane, i, plane_bsize, tx_size, arg);
+      i += step;
     }
-  } else {
-    for (i = 0; i < num_4x4_w * num_4x4_h; i += step)
-      visit(plane, i, plane_bsize, tx_size, arg);
   }
 }
 

diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index 048a0ed..4df1b58 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h

@@ -65,7 +65,7 @@
 void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi);
 
 // Set golden frame update interval, for non-svc 1 pass CBR mode.
-void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *cpi);
+void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi);
 
 // Check if we should not update golden reference, based on past refresh stats.
 void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi);

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5c841cf..cf8ac0a 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -514,12 +514,6 @@
   }
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-#define GLOBAL_MOTION 0
-#else
-#define GLOBAL_MOTION 1
-#endif
-
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for down-sampled inputs.
 static void choose_partitioning(VP9_COMP *cpi,
@@ -564,7 +558,7 @@
     MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
     unsigned int uv_sad;
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-#if GLOBAL_MOTION
+
     const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
     unsigned int y_sad, y_sad_g;
     BLOCK_SIZE bsize;
@@ -576,9 +570,9 @@
       bsize = BLOCK_64X32;
     else
       bsize = BLOCK_32X32;
-#endif
+
     assert(yv12 != NULL);
-#if GLOBAL_MOTION
+
     if (yv12_g && yv12_g != yv12) {
       vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            &cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -589,7 +583,7 @@
     } else {
       y_sad_g = UINT_MAX;
     }
-#endif
+
     vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
                          &cm->frame_refs[LAST_FRAME - 1].sf);
     mbmi->ref_frame[0] = LAST_FRAME;
@@ -597,7 +591,7 @@
     mbmi->sb_type = BLOCK_64X64;
     mbmi->mv[0].as_int = 0;
     mbmi->interp_filter = BILINEAR;
-#if GLOBAL_MOTION
+
     y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
     if (y_sad_g < y_sad) {
       vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
@@ -608,29 +602,21 @@
     } else {
       x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
     }
-#endif
 
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
     for (i = 1; i <= 2; ++i) {
       struct macroblock_plane  *p = &x->plane[i];
       struct macroblockd_plane *pd = &xd->plane[i];
-#if GLOBAL_MOTION
       const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-#else
-      const BLOCK_SIZE bs = get_plane_block_size(BLOCK_64X64, pd);
-#endif
+
       if (bs == BLOCK_INVALID)
-        uv_sad = INT_MAX;
+        uv_sad = UINT_MAX;
       else
         uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
                                      pd->dst.buf, pd->dst.stride);
 
-#if GLOBAL_MOTION
-      x->color_sensitivity[i - 1] = uv_sad * 4 > y_sad;
-#else
-      x->color_sensitivity[i - 1] = (uv_sad > 512);
-#endif
+      x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
     }
 
     d = xd->plane[0].dst.buf;

diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 0730467..12882e4 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c

@@ -1807,6 +1807,13 @@
   int best_sad;
   MV this_mv;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  tmp_mv->row = 0;
+  tmp_mv->col = 0;
+  return cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                xd->plane[0].pre[0].buf, ref_stride);
+#endif
+
   // Set up prediction 1-D reference set
   ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
   for (idx = 0; idx < search_width; idx += 16) {

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index dedec5f..f1b19e0 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -137,10 +137,6 @@
   int cost_list[5];
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
-  if (cpi->common.show_frame &&
-      (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME])
-    return rv;
-
   if (scaled_ref_frame) {
     int i;
     // Swap out the reference frame for a version that's been scaled to
@@ -784,15 +780,43 @@
         continue;
 
       if (this_mode == NEWMV) {
-        if (ref_frame > LAST_FRAME)
-          continue;
         if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
             best_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
-        if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                    &frame_mv[NEWMV][ref_frame],
-                                    &rate_mv, best_rdc.rdcost))
+
+        if (ref_frame > LAST_FRAME) {
+          int tmp_sad;
+          int dis, cost_list[5];
+
+          if (bsize < BLOCK_16X16)
+            continue;
+
+          tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+          if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
+            continue;
+
+          frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
+          rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+                                    &mbmi->ref_mvs[ref_frame][0].as_mv,
+                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+          frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+          frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+          cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
+                                       &mbmi->ref_mvs[ref_frame][0].as_mv,
+                                       cpi->common.allow_high_precision_mv,
+                                       x->errorperbit,
+                                       &cpi->fn_ptr[bsize],
+                                       cpi->sf.mv.subpel_force_stop,
+                                       cpi->sf.mv.subpel_iters_per_step,
+                                       cond_cost_list(cpi, cost_list),
+                                       x->nmvjointcost, x->mvcost, &dis,
+                                       &x->pred_sse[ref_frame], NULL, 0, 0);
+        } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                           &frame_mv[NEWMV][ref_frame],
+                                           &rate_mv, best_rdc.rdcost)) {
           continue;
+        }
       }
 
       if (this_mode != NEARESTMV &&
@@ -817,7 +841,7 @@
       }
 
       if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
-          pred_filter_search &&
+          pred_filter_search && (ref_frame == LAST_FRAME) &&
           ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
            (mbmi->mv[0].as_mv.col & 0x07) != 0)) {
         int pf_rate[3];
@@ -1052,6 +1076,7 @@
         mode_idx[INTRA_FRAME][mbmi->mode];
     PREDICTION_MODE this_mode;
     for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+      if (best_ref_frame != ref_frame) continue;
       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
         THR_MODES thr_mode_idx = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
         int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx];

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index c6273f1..7783f7b 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c

@@ -377,7 +377,7 @@
       rcf = rc->rate_correction_factors[INTER_NORMAL];
   }
   rcf *= rcf_mult[rc->frame_size_selector];
-  return rcf > MAX_BPB_FACTOR ? MAX_BPB_FACTOR : rcf;
+  return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 }
 
 static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
@@ -386,6 +386,8 @@
   // Normalize RCF to account for the size-dependent scaling factor.
   factor /= rcf_mult[cpi->rc.frame_size_selector];
 
+  factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
   if (cpi->common.frame_type == KEY_FRAME) {
     rc->rate_correction_factors[KF_STD] = factor;
   } else if (cpi->oxcf.pass == 2) {

diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index f499499..618b5f7 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c

@@ -61,7 +61,7 @@
                           const int ref_stride, const int height) {
   int idx;
   __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
   __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
   __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
   __m128i t0, t1;
@@ -69,14 +69,14 @@
   ref += ref_stride;
 
   for (idx = 1; idx < height_1; idx += 2) {
-    src_line = _mm_load_si128((const __m128i *)ref);
+    src_line = _mm_loadu_si128((const __m128i *)ref);
     t0 = _mm_unpacklo_epi8(src_line, zero);
     t1 = _mm_unpackhi_epi8(src_line, zero);
     s0 = _mm_adds_epu16(s0, t0);
     s1 = _mm_adds_epu16(s1, t1);
     ref += ref_stride;
 
-    src_line = _mm_load_si128((const __m128i *)ref);
+    src_line = _mm_loadu_si128((const __m128i *)ref);
     t0 = _mm_unpacklo_epi8(src_line, zero);
     t1 = _mm_unpackhi_epi8(src_line, zero);
     s0 = _mm_adds_epu16(s0, t0);
@@ -84,7 +84,7 @@
     ref += ref_stride;
   }
 
-  src_line = _mm_load_si128((const __m128i *)ref);
+  src_line = _mm_loadu_si128((const __m128i *)ref);
   t0 = _mm_unpacklo_epi8(src_line, zero);
   t1 = _mm_unpackhi_epi8(src_line, zero);
   s0 = _mm_adds_epu16(s0, t0);
@@ -101,9 +101,9 @@
     s1 = _mm_srai_epi16(s1, 3);
   }
 
-  _mm_store_si128((__m128i *)hbuf, s0);
+  _mm_storeu_si128((__m128i *)hbuf, s0);
   hbuf += 8;
-  _mm_store_si128((__m128i *)hbuf, s1);
+  _mm_storeu_si128((__m128i *)hbuf, s1);
 }
 
 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {