Merge "Port "cost_coeff speed improvements" to vp9."
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index 0248ede..fc775ef 100644
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -347,8 +347,7 @@
     double               psnr_totals[NUM_ENCODERS][4] = {{0,0}};
     int                  psnr_count[NUM_ENCODERS] = {0};
 
-    double               cx_time = 0;
-    struct  timeval      tv1, tv2, difftv;
+    int64_t              cx_time = 0;
 
     /* Set the required target bitrates for each resolution level.
      * If target bitrate for highest-resolution level is set to 0,
@@ -582,6 +581,7 @@
 
     while(frame_avail || got_data)
     {
+        struct vpx_usec_timer timer;
         vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
         const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
 
@@ -636,18 +636,18 @@
             vpx_codec_control(&codec[i], VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
         }
 
-        gettimeofday(&tv1, NULL);
         /* Encode each frame at multi-levels */
         /* Note the flags must be set to 0 in the encode call if they are set
            for each frame with the vpx_codec_control(), as done above. */
+        vpx_usec_timer_start(&timer);
         if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
             frame_cnt, 1, 0, arg_deadline))
         {
             die_codec(&codec[0], "Failed to encode frame");
         }
-        gettimeofday(&tv2, NULL);
-        timersub(&tv2, &tv1, &difftv);
-        cx_time += (double)(difftv.tv_sec * 1000000 + difftv.tv_usec);
+        vpx_usec_timer_mark(&timer);
+        cx_time += vpx_usec_timer_elapsed(&timer);
+
         for (i=NUM_ENCODERS-1; i>=0 ; i--)
         {
             got_data = 0;
@@ -686,8 +686,10 @@
         frame_cnt++;
     }
     printf("\n");
-    printf("FPS for encoding %d %f %f \n", frame_cnt, (float)cx_time / 1000000,
-           1000000 * (double)frame_cnt / (double)cx_time);
+    printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
+            frame_cnt,
+            1000 * (float)cx_time / (double)(frame_cnt * 1000000),
+            1000000 * (double)frame_cnt / (double)cx_time);
 
     fclose(infile);
 
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index e94df20..d6b1b52 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -519,6 +519,9 @@
   cfg_.rc_end_usage = VPX_CBR;
   cfg_.rc_target_bitrate = 200;
   cfg_.g_lag_in_frames = 0;
+  // TODO(marpan): Investigate datarate target failures with a smaller keyframe
+  // interval (128).
+  cfg_.kf_max_dist = 9999;
 
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 140);
@@ -774,10 +777,6 @@
         svc_params_.max_quantizers[i] = 63;
         svc_params_.min_quantizers[i] = 0;
       }
-      svc_params_.scaling_factor_num[0] = 144;
-      svc_params_.scaling_factor_den[0] = 288;
-      svc_params_.scaling_factor_num[1] = 288;
-      svc_params_.scaling_factor_den[1] = 288;
       encoder->Control(VP9E_SET_SVC, 1);
       encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
       encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
@@ -814,8 +813,6 @@
     if (bits_total_) {
       const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
       duration_ = (last_pts_ + 1) * timebase_;
-      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0
-          / (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
       file_datarate_ = file_size_in_kb / duration_;
     }
   }
@@ -839,7 +836,6 @@
   int64_t bits_total_;
   double duration_;
   double file_datarate_;
-  double effective_datarate_;
   size_t bits_in_last_frame_;
   vpx_svc_extra_cfg_t svc_params_;
   int speed_setting_;
@@ -884,7 +880,7 @@
 
 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
 // 3 temporal layers. Run CIF clip with 1 thread.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
@@ -918,7 +914,7 @@
     assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
         cfg_.ts_number_layers, cfg_.temporal_layering_mode);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
             << " The datarate for the file exceeds the target by too much!";
     ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
         << " The datarate for the file is lower than the target by too much!";
@@ -926,9 +922,49 @@
   }
 }
 
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run CIF clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 3;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 72;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 144;
+  svc_params_.scaling_factor_den[1] = 288;
+  svc_params_.scaling_factor_num[2] = 288;
+  svc_params_.scaling_factor_den[2] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+     cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+          << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
+      << " The datarate for the file is lower than the target by too much!";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+}
+
 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
 // 3 temporal layers. Run HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc4threads) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers4threads) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
@@ -949,6 +985,7 @@
   svc_params_.scaling_factor_num[1] = 288;
   svc_params_.scaling_factor_den[1] = 288;
   cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
   ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
                                        30, 1, 0, 300);
   cfg_.rc_target_bitrate = 800;
@@ -956,19 +993,59 @@
   assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
       cfg_.ts_number_layers, cfg_.temporal_layering_mode);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
           << " The datarate for the file exceeds the target by too much!";
   ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
       << " The datarate for the file is lower than the target by too much!";
   EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
 }
 
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers4threads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 3;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 4;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 72;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 144;
+  svc_params_.scaling_factor_den[1] = 288;
+  svc_params_.scaling_factor_num[2] = 288;
+  svc_params_.scaling_factor_den[2] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+      cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+          << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
+      << " The datarate for the file is lower than the target by too much!";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+}
+
 VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
                           ::testing::Values(::libvpx_test::kOnePassGood,
                                             ::libvpx_test::kRealTime),
-                          ::testing::Range(2, 7));
+                          ::testing::Range(2, 9));
 VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc,
                           ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(5, 8));
+                          ::testing::Range(5, 9));
 }  // namespace
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 113865f..5ae44e8 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -23,7 +23,7 @@
  */
 static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
 static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
-static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
+static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 80;
 
 /*
  * The filter function was modified to reduce the computational complexity.
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index 9a379a6..f13d52f 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -18,8 +18,8 @@
 extern "C" {
 #endif
 
-#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
-#define SUM_DIFF_THRESHOLD_HIGH (600)  // ~(16 * 16 * 1.5)
+#define SUM_DIFF_THRESHOLD 384
+#define SUM_DIFF_THRESHOLD_HIGH 512
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)
 
 #define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 4109c19..8953f0f 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -770,6 +770,7 @@
       x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
     }
 
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
     // Check if most of the superblock is skin content, and if so, force split
@@ -2439,7 +2440,8 @@
   PARTITION_CONTEXT sl[8], sa[8];
   TOKENEXTRA *tp_orig = *tp;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
-  int i, pl;
+  int i;
+  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
   BLOCK_SIZE subsize;
   RD_COST this_rdc, sum_rdc, best_rdc;
   int do_split = bsize >= BLOCK_8X8;
@@ -2587,7 +2589,6 @@
                      &this_rdc, bsize, ctx, best_rdc.rdcost);
     if (this_rdc.rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
-        pl = partition_plane_context(xd, mi_row, mi_col, bsize);
         this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
         this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                  this_rdc.rate, this_rdc.dist);
@@ -2706,7 +2707,6 @@
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                               sum_rdc.rate, sum_rdc.dist);
@@ -2772,7 +2772,6 @@
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
       if (sum_rdc.rdcost < best_rdc.rdcost) {
@@ -2824,7 +2823,6 @@
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                               sum_rdc.rate, sum_rdc.dist);
@@ -4271,13 +4269,9 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO **mi_8x8 = xd->mi;
-  MODE_INFO *mi = mi_8x8[0];
+  MODE_INFO *mi = xd->mi[0];
   const int seg_skip = segfeature_active(&cm->seg, mi->segment_id,
                                          SEG_LVL_SKIP);
-  const int mis = cm->mi_stride;
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
 
   x->skip_recode = !x->select_tx_size && mi->sb_type >= BLOCK_8X8 &&
                    cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
@@ -4333,20 +4327,14 @@
       ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
                       &td->counts->tx)[mi->tx_size];
     } else {
-      int x, y;
-      TX_SIZE tx_size;
       // The new intra coding scheme requires no change of transform size
       if (is_inter_block(mi)) {
-        tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
-                         max_txsize_lookup[bsize]);
+        mi->tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+                             max_txsize_lookup[bsize]);
       } else {
-        tx_size = (bsize >= BLOCK_8X8) ? mi->tx_size : TX_4X4;
+        mi->tx_size = (bsize >= BLOCK_8X8) ? mi->tx_size : TX_4X4;
       }
 
-      for (y = 0; y < mi_height; y++)
-        for (x = 0; x < mi_width; x++)
-          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
-            mi_8x8[mis * y + x]->tx_size = tx_size;
     }
     ++td->counts->tx.tx_totals[mi->tx_size];
     ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index f8f681a..6d6915c 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -410,6 +410,9 @@
   memset(&cpi->svc.scaled_frames[0], 0,
          MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
 
+  vpx_free_frame_buffer(&cpi->svc.scaled_temp);
+  memset(&cpi->svc.scaled_temp, 0, sizeof(cpi->svc.scaled_temp));
+
   vpx_free_frame_buffer(&cpi->svc.empty_frame.img);
   memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
 
@@ -3357,11 +3360,22 @@
   vpx_clear_system_state();
 
   set_frame_size(cpi);
-  cpi->Source = vp9_scale_if_required(cm,
-                                      cpi->un_scaled_source,
-                                      &cpi->scaled_source,
-                                      (cpi->oxcf.pass == 0));
 
+  if (is_one_pass_cbr_svc(cpi) &&
+      cpi->un_scaled_source->y_width == cm->width << 2 &&
+      cpi->un_scaled_source->y_height == cm->height << 2 &&
+      cpi->svc.scaled_temp.y_width == cm->width << 1 &&
+      cpi->svc.scaled_temp.y_height == cm->height << 1) {
+    cpi->Source = vp9_svc_twostage_scale(cm,
+                                         cpi->un_scaled_source,
+                                         &cpi->scaled_source,
+                                         &cpi->svc.scaled_temp);
+  } else {
+    cpi->Source = vp9_scale_if_required(cm,
+                                        cpi->un_scaled_source,
+                                        &cpi->scaled_source,
+                                        (cpi->oxcf.pass == 0));
+  }
   // Avoid scaling last_source unless its needed.
   // Last source is needed if vp9_avg_source_sad() is used, or if
   // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise
@@ -3387,12 +3401,10 @@
        cpi->oxcf.rc_mode == VPX_VBR))
     vp9_avg_source_sad(cpi);
 
-  // TODO(wonkap/marpan): For 1 pass SVC, since only ZERMOV is allowed for
-  // upsampled reference frame (i.e, svc->force_zero_mode_spatial_ref = 0),
-  // we should be able to avoid this frame-level upsampling.
-  // Keeping it for now as there is an asan error in the multi-threaded SVC
-  // rate control test if this upsampling is removed.
-  if (frame_is_intra_only(cm) == 0) {
+  // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference
+  // frame (i.e, svc->force_zero_mode_spatial_ref = 0), we can avoid this
+  // frame-level upsampling.
+  if (frame_is_intra_only(cm) == 0 && !is_one_pass_cbr_svc(cpi)) {
     vp9_scale_references(cpi);
   }
 
@@ -3782,6 +3794,25 @@
   }
 }
 
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
+                                           YV12_BUFFER_CONFIG *unscaled,
+                                           YV12_BUFFER_CONFIG *scaled,
+                                           YV12_BUFFER_CONFIG *scaled_temp) {
+  if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+      cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth);
+    scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth);
+#else
+    vp9_scale_and_extend_frame(unscaled, scaled_temp);
+    vp9_scale_and_extend_frame(scaled_temp, scaled);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return scaled;
+  } else {
+    return unscaled;
+  }
+}
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled,
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index c486ac2..2def941 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -623,6 +623,11 @@
 
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
+                                           YV12_BUFFER_CONFIG *unscaled,
+                                           YV12_BUFFER_CONFIG *scaled,
+                                           YV12_BUFFER_CONFIG *scaled_temp);
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled,
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 1d56154..79e5049 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -43,6 +43,26 @@
     cpi->svc.ext_alt_fb_idx[sl] = 2;
   }
 
+  // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
+  // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
+  // target of 1/4x1/4.
+  if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) {
+    if (vpx_realloc_frame_buffer(&cpi->svc.scaled_temp,
+                                 cpi->common.width >> 1,
+                                 cpi->common.height >> 1,
+                                 cpi->common.subsampling_x,
+                                 cpi->common.subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cpi->common.use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 cpi->common.byte_alignment,
+                                 NULL, NULL, NULL))
+      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate scaled_frame for svc ");
+  }
+
+
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
     if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img,
                                  SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT,
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 4e18640..f1b8556 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -70,6 +70,8 @@
   // Store scaled source frames to be used for temporal filter to generate
   // a alt ref frame.
   YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS];
+  // Temp buffer used for 2-stage down-sampling, for real-time mode.
+  YV12_BUFFER_CONFIG scaled_temp;
 
   // Layer context used for rate control in one pass temporal CBR mode or
   // two pass spatial mode.
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 82f566b..ebe28b8 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -45,8 +45,7 @@
                                             int x, int y) {
   const int which_mv = 0;
   const MV mv = { mv_row, mv_col };
-  const InterpKernel *const kernel =
-    vp9_filter_kernels[xd->mi[0]->interp_filter];
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP];
 
   enum mv_precision mv_precision_uv;
   int uv_stride;
@@ -86,6 +85,7 @@
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  (void)xd;
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
                             &mv,
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index b6fbfcf..7e43eb7 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -33,7 +33,7 @@
                                     int w, int h) { \
   assert(filter[3] != 128); \
   assert(step_q4 == 16); \
-  if (filter[0] || filter[1] || filter[2]) { \
+  if (filter[0] | filter[1] | filter[2]) { \
     while (w >= 16) { \
       vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
                                                src_stride, \
@@ -45,27 +45,20 @@
       dst += 16; \
       w -= 16; \
     } \
-    while (w >= 8) { \
+    if (w == 8) { \
       vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 8; \
-      dst += 8; \
-      w -= 8; \
-    } \
-    while (w >= 4) { \
+    } else if (w == 4) { \
       vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 4; \
-      dst += 4; \
-      w -= 4; \
     } \
   } else { \
     while (w >= 16) { \
@@ -79,27 +72,20 @@
       dst += 16; \
       w -= 16; \
     } \
-    while (w >= 8) { \
+    if (w == 8) { \
       vpx_filter_block1d8_##dir##2_##avg##opt(src, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 8; \
-      dst += 8; \
-      w -= 8; \
-    } \
-    while (w >= 4) { \
+    } else if (w == 4) { \
       vpx_filter_block1d4_##dir##2_##avg##opt(src, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 4; \
-      dst += 4; \
-      w -= 4; \
     } \
   } \
 }
@@ -116,8 +102,7 @@
   assert(h <= 64); \
   assert(x_step_q4 == 16); \
   assert(y_step_q4 == 16); \
-  if (filter_x[0] || filter_x[1] || filter_x[2]|| \
-      filter_y[0] || filter_y[1] || filter_y[2]) { \
+  if (filter_x[0] | filter_x[1] | filter_x[2]) { \
     DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
     vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
@@ -161,7 +146,7 @@
   if (step_q4 == 16 && filter[3] != 128) { \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
-    if (filter[0] || filter[1] || filter[2]) { \
+    if (filter[0] | filter[1] | filter[2]) { \
       while (w >= 16) { \
         vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
                                                         src_stride, \
@@ -253,8 +238,7 @@
   assert(w <= 64); \
   assert(h <= 64); \
   if (x_step_q4 == 16 && y_step_q4 == 16) { \
-    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
-        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+    if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
       DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
       vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
                                        CONVERT_TO_BYTEPTR(fdata2), 64, \
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
index b0e99ed..d2cb8ea 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -16,6 +16,11 @@
 ; %define USE_PMULHRSW
 ; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
 ; when using this instruction.
+;
+; The add order below (based on ffvp9) must be followed to prevent outranges.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
 
 SECTION .text
 %if ARCH_X86_64
@@ -77,17 +82,12 @@
 
     pmaddubsw %2, k0k1k4k5
     pmaddubsw m3, k2k3k6k7
-
-    mova      m4, %2
-    mova      m5, m3
-    psrldq    %2, 8
-    psrldq    m3, 8
-    mova      m6, m5
-
-    paddsw    m4, m3
-    pmaxsw    m5, %2
-    pminsw    %2, m6
+    mova      m4, %2        ;k0k1
+    mova      m5, m3        ;k2k3
+    psrldq    %2, 8         ;k4k5
+    psrldq    m3, 8         ;k6k7
     paddsw    %2, m4
+    paddsw    m5, m3
     paddsw    %2, m5
     paddsw    %2, krd
     psraw     %2, 7
@@ -157,27 +157,20 @@
     pmaddubsw           m7, k0k1k4k5
     palignr             m3, m2,  5
     pmaddubsw           m3, k2k3k6k7
-    mova                m0, m4
-    mova                m5, m1
-    mova                m2, m7
-    psrldq              m4, 8
-    psrldq              m1, 8
-    mova                m6, m5
-    paddsw              m0, m1
-    mova                m1, m3
-    psrldq              m7, 8
-    psrldq              m3, 8
-    paddsw              m2, m3
-    mova                m3, m1
-    pmaxsw              m5, m4
-    pminsw              m4, m6
+    mova                m0, m4                  ;k0k1
+    mova                m5, m1                  ;k2k3
+    mova                m2, m7                  ;k0k1 upper
+    psrldq              m4, 8                   ;k4k5
+    psrldq              m1, 8                   ;k6k7
     paddsw              m4, m0
-    paddsw              m4, m5
-    pmaxsw              m1, m7
-    pminsw              m7, m3
+    paddsw              m5, m1
+    mova                m1, m3                  ;k2k3 upper
+    psrldq              m7, 8                   ;k4k5 upper
+    psrldq              m3, 8                   ;k6k7 upper
     paddsw              m7, m2
+    paddsw              m4, m5
+    paddsw              m1, m3
     paddsw              m7, m1
-
     paddsw              m4, krd
     psraw               m4, 7
     packuswb            m4, m4
@@ -240,16 +233,13 @@
     pmaddubsw   %3, k2k3
     pmaddubsw   %4, k4k5
     pmaddubsw   %5, k6k7
-
+    paddsw      %2, %4
+    paddsw      %5, %3
     paddsw      %2, %5
-    mova        %1, %3
-    pminsw      %3, %4
-    pmaxsw      %1, %4
-    paddsw      %2, %3
-    paddsw      %1, %2
-    paddsw      %1, krd
-    psraw       %1, 7
-    packuswb    %1, %1
+    paddsw      %2, krd
+    psraw       %2, 7
+    packuswb    %2, %2
+    SWAP        %1, %2
 %endm
 
 ;-------------------------------------------------------------------------------
@@ -293,39 +283,33 @@
     pmaddubsw            m3, k4k5
 
     palignr              m7, m4, 13
-    paddsw               m1, m5
-    mova                 m5, m6
-    mova                 m0, m2
-    palignr              m5, m4, 5
-    pminsw               m2, m3
-    pmaddubsw            m7, k6k7
-    pmaxsw               m3, m0
-    paddsw               m1, m2
     mova                 m0, m6
-    palignr              m6, m4, 1
-    pmaddubsw            m5, k2k3
+    palignr              m0, m4, 5
+    pmaddubsw            m7, k6k7
     paddsw               m1, m3
+    paddsw               m2, m5
+    paddsw               m1, m2
+    mova                 m5, m6
+    palignr              m6, m4, 1
+    pmaddubsw            m0, k2k3
     pmaddubsw            m6, k0k1
-    palignr              m0, m4, 9
+    palignr              m5, m4, 9
     paddsw               m1, krd
-    pmaddubsw            m0, k4k5
-    mova                 m4, m5
+    pmaddubsw            m5, k4k5
     psraw                m1, 7
-    pminsw               m5, m0
-    paddsw               m6, m7
+    paddsw               m0, m7
+%ifidn %1, h8_avg
+    movh                 m7, [dstq]
+    movh                 m2, [dstq + dstrideq]
+%endif
     packuswb             m1, m1
-
     paddsw               m6, m5
-    pmaxsw               m0, m4
     paddsw               m6, m0
     paddsw               m6, krd
     psraw                m6, 7
     packuswb             m6, m6
-
 %ifidn %1, h8_avg
-    movh                 m0, [dstq]
-    movh                 m2, [dstq + dstrideq]
-    pavgb                m1, m0
+    pavgb                m1, m7
     pavgb                m6, m2
 %endif
     movh             [dstq], m1
@@ -388,7 +372,7 @@
     pmaddubsw     m1, k2k3
     palignr       m2, m7, 9
     pmaddubsw     m2, k4k5
-    paddsw        m0, m3
+    paddsw        m1, m3
     mova          m3, m4
     punpckhbw     m4, m4
     mova          m5, m4
@@ -403,17 +387,13 @@
     pmaddubsw     m6, k4k5
     palignr       m7, m3, 13
     pmaddubsw     m7, k6k7
-
-    mova          m3, m1
-    pmaxsw        m1, m2
-    pminsw        m2, m3
     paddsw        m0, m2
     paddsw        m0, m1
-    paddsw        m4, m7
-    mova          m7, m5
-    pmaxsw        m5, m6
-    pminsw        m6, m7
+%ifidn %1, h8_avg
+    mova          m1, [dstq]
+%endif
     paddsw        m4, m6
+    paddsw        m5, m7
     paddsw        m4, m5
     paddsw        m0, krd
     paddsw        m4, krd
@@ -421,7 +401,6 @@
     psraw         m4, 7
     packuswb      m0, m4
 %ifidn %1, h8_avg
-    mova          m1, [dstq]
     pavgb         m0, m1
 %endif
     lea         srcq, [srcq + sstrideq]
@@ -488,27 +467,21 @@
     movx         m7, [src1q + sstride6q   ]     ;H
     punpcklbw    m6, m7                         ;G H
     pmaddubsw    m6, k6k7
-    mova        tmp, m2
     pmaddubsw    m3, k2k3
     pmaddubsw    m1, k0k1
-    pmaxsw       m2, m4
-    paddsw       m0, m6
+    paddsw       m0, m4
+    paddsw       m2, m6
     movx         m6, [srcq + sstrideq * 8 ]     ;H next iter
     punpcklbw    m7, m6
     pmaddubsw    m7, k6k7
-    pminsw       m4, tmp
-    paddsw       m0, m4
-    mova         m4, m3
     paddsw       m0, m2
-    pminsw       m3, m5
-    pmaxsw       m5, m4
     paddsw       m0, krd
     psraw        m0, 7
-    paddsw       m1, m7
+    paddsw       m1, m5
     packuswb     m0, m0
 
+    paddsw       m3, m7
     paddsw       m1, m3
-    paddsw       m1, m5
     paddsw       m1, krd
     psraw        m1, 7
     lea        srcq, [srcq + sstrideq * 2 ]
@@ -550,10 +523,7 @@
     punpcklbw    m4, m5                         ;E F
     pmaddubsw    m2, k2k3
     pmaddubsw    m4, k4k5
-    paddsw       m0, m6
-    mova         m1, m2
-    pmaxsw       m2, m4
-    pminsw       m4, m1
+    paddsw       m2, m6
     paddsw       m0, m4
     paddsw       m0, m2
     paddsw       m0, krd
@@ -572,7 +542,6 @@
 %macro SUBPIX_VFILTER16 1
 cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
                              src, sstride, dst, dstride, height, filter
-
     mova          m4, [filterq]
     SETUP_LOCAL_VARS
 %if ARCH_X86_64
@@ -611,12 +580,9 @@
     punpcklbw     m3, m5                         ;A B
     movh          m7, [srcq + sstrideq * 2 + 8]  ;C
     pmaddubsw     m6, k6k7
-    mova          m1, m2
     movh          m5, [src1q + sstrideq * 2 + 8] ;D
-    pmaxsw        m2, m4
     punpcklbw     m7, m5                         ;C D
-    pminsw        m4, m1
-    paddsw        m0, m6
+    paddsw        m2, m6
     pmaddubsw     m3, k0k1
     movh          m1, [srcq + sstrideq * 4 + 8]  ;E
     paddsw        m0, m4
@@ -630,30 +596,24 @@
     movh          m5, [src1q + sstride6q + 8]    ;H
     psraw         m0, 7
     punpcklbw     m2, m5                         ;G H
-    packuswb      m0, m0
     pmaddubsw     m2, k6k7
 %ifidn %1, v8_avg
-    movh          m4, [dstq]
-    pavgb         m0, m4
+    mova          m4, [dstq]
 %endif
     movh      [dstq], m0
-    mova          m6, m7
-    pmaxsw        m7, m1
-    pminsw        m1, m6
-    paddsw        m3, m2
+    paddsw        m7, m2
     paddsw        m3, m1
     paddsw        m3, m7
     paddsw        m3, krd
     psraw         m3, 7
-    packuswb      m3, m3
+    packuswb      m0, m3
 
     add         srcq, sstrideq
     add        src1q, sstrideq
 %ifidn %1, v8_avg
-    movh          m1, [dstq + 8]
-    pavgb         m3, m1
+    pavgb         m0, m4
 %endif
-    movh  [dstq + 8], m3
+    mova      [dstq], m0
     add         dstq, dst_stride
     dec      heightd
     jnz        .loop