Merge "Simplify the inner loop of duplicate_mode_info_in_sb."
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 821a9ec..d6b1b52 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -957,7 +957,7 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
           << " The datarate for the file exceeds the target by too much!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.17)
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
       << " The datarate for the file is lower than the target by too much!";
   EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
 }
@@ -1035,7 +1035,7 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
           << " The datarate for the file exceeds the target by too much!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.17)
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
       << " The datarate for the file is lower than the target by too much!";
   EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
 }
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 14ef88f..6d6915c 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -410,6 +410,9 @@
   memset(&cpi->svc.scaled_frames[0], 0,
          MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
 
+  vpx_free_frame_buffer(&cpi->svc.scaled_temp);
+  memset(&cpi->svc.scaled_temp, 0, sizeof(cpi->svc.scaled_temp));
+
   vpx_free_frame_buffer(&cpi->svc.empty_frame.img);
   memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
 
@@ -3357,11 +3360,22 @@
   vpx_clear_system_state();
 
   set_frame_size(cpi);
-  cpi->Source = vp9_scale_if_required(cm,
-                                      cpi->un_scaled_source,
-                                      &cpi->scaled_source,
-                                      (cpi->oxcf.pass == 0));
 
+  if (is_one_pass_cbr_svc(cpi) &&
+      cpi->un_scaled_source->y_width == cm->width << 2 &&
+      cpi->un_scaled_source->y_height == cm->height << 2 &&
+      cpi->svc.scaled_temp.y_width == cm->width << 1 &&
+      cpi->svc.scaled_temp.y_height == cm->height << 1) {
+    cpi->Source = vp9_svc_twostage_scale(cm,
+                                         cpi->un_scaled_source,
+                                         &cpi->scaled_source,
+                                         &cpi->svc.scaled_temp);
+  } else {
+    cpi->Source = vp9_scale_if_required(cm,
+                                        cpi->un_scaled_source,
+                                        &cpi->scaled_source,
+                                        (cpi->oxcf.pass == 0));
+  }
   // Avoid scaling last_source unless its needed.
   // Last source is needed if vp9_avg_source_sad() is used, or if
   // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise
@@ -3780,6 +3794,25 @@
   }
 }
 
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
+                                           YV12_BUFFER_CONFIG *unscaled,
+                                           YV12_BUFFER_CONFIG *scaled,
+                                           YV12_BUFFER_CONFIG *scaled_temp) {
+  if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+      cm->mi_rows * MI_SIZE != unscaled->y_height) {  // Size mismatch: rescale.
+#if CONFIG_VP9_HIGHBITDEPTH  // Either variant scales twice, via scaled_temp.
+    scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth);
+    scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth);
+#else  // !CONFIG_VP9_HIGHBITDEPTH
+    vp9_scale_and_extend_frame(unscaled, scaled_temp);  // Stage 1.
+    vp9_scale_and_extend_frame(scaled_temp, scaled);    // Stage 2.
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return scaled;  // Result of unscaled -> scaled_temp -> scaled.
+  } else {
+    return unscaled;  // Size already matches; the input is returned as is.
+  }
+}
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled,
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index c486ac2..2def941 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -623,6 +623,11 @@
 
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
+                                           YV12_BUFFER_CONFIG *unscaled,
+                                           YV12_BUFFER_CONFIG *scaled,
+                                           YV12_BUFFER_CONFIG *scaled_temp);
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 193c9d3..508c596 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -387,47 +387,70 @@
     cost = token_costs[0][0][pt][EOB_TOKEN];
     c = 0;
   } else {
-    int band_left = *band_count++;
+    if (use_fast_coef_costing) {
+      int band_left = *band_count++;
 
-    // dc token
-    int v = qcoeff[0];
-    int16_t prev_t;
-    EXTRABIT e;
-    vp9_get_token_extra(v, &prev_t, &e);
-    cost = (*token_costs)[0][pt][prev_t] +
-        vp9_get_cost(prev_t, e, cat6_high_cost);
+      // dc token
+      int v = qcoeff[0];
+      int16_t prev_t;
+      cost = vp9_get_token_cost(v, &prev_t, cat6_high_cost);
+      cost += (*token_costs)[0][pt][prev_t];
 
-    token_cache[0] = vp9_pt_energy_class[prev_t];
-    ++token_costs;
+      token_cache[0] = vp9_pt_energy_class[prev_t];
+      ++token_costs;
 
-    // ac tokens
-    for (c = 1; c < eob; c++) {
-      const int rc = scan[c];
-      int16_t t;
+      // ac tokens
+      for (c = 1; c < eob; c++) {
+        const int rc = scan[c];
+        int16_t t;
 
-      v = qcoeff[rc];
-      vp9_get_token_extra(v, &t, &e);
-      if (use_fast_coef_costing) {
-        cost += (*token_costs)[!prev_t][!prev_t][t] +
-            vp9_get_cost(t, e, cat6_high_cost);
-      } else {
-        pt = get_coef_context(nb, token_cache, c);
-        cost += (*token_costs)[!prev_t][pt][t] +
-            vp9_get_cost(t, e, cat6_high_cost);
-        token_cache[rc] = vp9_pt_energy_class[t];
+        v = qcoeff[rc];
+        cost += vp9_get_token_cost(v, &t, cat6_high_cost);
+        cost += (*token_costs)[!prev_t][!prev_t][t];
+        prev_t = t;
+        if (!--band_left) {
+          band_left = *band_count++;
+          ++token_costs;
+        }
       }
-      prev_t = t;
-      if (!--band_left) {
-        band_left = *band_count++;
-        ++token_costs;
-      }
-    }
 
-    // eob token
-    if (band_left) {
-      if (use_fast_coef_costing) {
+      // eob token
+      if (band_left)
         cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
-      } else {
+
+    } else {  // !use_fast_coef_costing
+      int band_left = *band_count++;
+
+      // dc token
+      int v = qcoeff[0];
+      int16_t tok;
+      unsigned int (*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+      cost = vp9_get_token_cost(v, &tok, cat6_high_cost);
+      cost += (*token_costs)[0][pt][tok];
+
+      token_cache[0] = vp9_pt_energy_class[tok];
+      ++token_costs;
+
+      tok_cost_ptr = &((*token_costs)[!tok]);
+
+      // ac tokens
+      for (c = 1; c < eob; c++) {
+        const int rc = scan[c];
+
+        v = qcoeff[rc];
+        cost += vp9_get_token_cost(v, &tok, cat6_high_cost);
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*tok_cost_ptr)[pt][tok];
+        token_cache[rc] = vp9_pt_energy_class[tok];
+        if (!--band_left) {
+          band_left = *band_count++;
+          ++token_costs;
+        }
+        tok_cost_ptr = &((*token_costs)[!tok]);
+      }
+
+      // eob token
+      if (band_left) {
         pt = get_coef_context(nb, token_cache, c);
         cost += (*token_costs)[0][pt][EOB_TOKEN];
       }
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 1d56154..79e5049 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -43,6 +43,26 @@
     cpi->svc.ext_alt_fb_idx[sl] = 2;
   }
 
+  // For 1 pass cbr: allocate scaled_temp, which may be used as an intermediate
+  // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
+  // target of 1/4x1/4.
+  if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) {
+    if (vpx_realloc_frame_buffer(&cpi->svc.scaled_temp,
+                                 cpi->common.width >> 1,
+                                 cpi->common.height >> 1,
+                                 cpi->common.subsampling_x,
+                                 cpi->common.subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cpi->common.use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 cpi->common.byte_alignment,
+                                 NULL, NULL, NULL))
+      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate scaled_frame for svc ");
+  }
+
+
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
     if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img,
                                  SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT,
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 4e18640..f1b8556 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -70,6 +70,8 @@
   // Store scaled source frames to be used for temporal filter to generate
   // a alt ref frame.
   YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS];
+  // Temp buffer used for 2-stage down-sampling, for real-time mode.
+  YV12_BUFFER_CONFIG scaled_temp;
 
   // Layer context used for rate control in one pass temporal CBR mode or
   // two pass spatial mode.
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 93be6d7..ee1d08a 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -50,6 +50,35 @@
 const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
     (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
     / 2;
+// Costs of the extrabits for the tokens in the table above. Each value is
+// obtained by looking up the specified extrabits in the cost table that
+// corresponds to the token (the cost element of vp9_extra_bits),
+// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1].
+// vp9_dct_cat_lt_10_value_cost points at the middle entry (the single 0).
+static const int dct_cat_lt_10_value_cost[] = {
+  3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531,
+  3432, 3409, 3363, 3340, 3282, 3259, 3213, 3190,
+  3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894,
+  2795, 2772, 2726, 2703, 2645, 2622, 2576, 2553,
+  3197, 3116, 3058, 2977, 2881, 2800,
+  2742, 2661, 2615, 2534, 2476, 2395,
+  2299, 2218, 2160, 2079,
+  2566, 2427, 2334, 2195, 2023, 1884, 1791, 1652,
+  1893, 1696, 1453, 1256, 1229, 864,
+  512, 512, 512, 512, 0,
+  512, 512, 512, 512,
+  864, 1229, 1256, 1453, 1696, 1893,
+  1652, 1791, 1884, 2023, 2195, 2334, 2427, 2566,
+  2079, 2160, 2218, 2299, 2395, 2476, 2534, 2615,
+  2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197,
+  2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795,
+  2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+  3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432,
+  3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773,
+};
+const int *vp9_dct_cat_lt_10_value_cost = dct_cat_lt_10_value_cost +
+    (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost))
+    / 2;
 
 // Array indices are identical to previously-existing CONTEXT_NODE indices
 const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index df979b2..fad7988 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -74,6 +74,7 @@
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
 extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int *vp9_dct_cat_lt_10_value_cost;
 extern const int16_t vp9_cat6_low_cost[256];
 extern const int vp9_cat6_high_cost[64];
 extern const int vp9_cat6_high10_high_cost[256];
@@ -117,6 +118,18 @@
   return vp9_dct_cat_lt_10_value_tokens[v].token;
 }
 
+static INLINE int vp9_get_token_cost(int v, int16_t *token,
+                                     const int *cat6_high_table) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {  // |v| >= CAT6_MIN_VAL
+    EXTRABIT extrabits;
+    *token = CATEGORY6_TOKEN;
+    extrabits = abs(v) - CAT6_MIN_VAL;  // Offset of |v| past CAT6_MIN_VAL.
+    return vp9_cat6_low_cost[extrabits & 0xff] +  // Cost from the low 8 bits.
+           cat6_high_table[extrabits >> 8];  // Cost from the remaining bits.
+  }
+  *token = vp9_dct_cat_lt_10_value_tokens[v].token;  // v may be negative.
+  return vp9_dct_cat_lt_10_value_cost[v];  // Table pointer is centred on 0.
+}
 
 #ifdef __cplusplus
 }  // extern "C"