Merge "vp9_denoiser_sse2.c: improve code style."
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 5377c1e..c7042fe 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -625,6 +625,20 @@
 
 #if HAVE_AVX2
 #if CONFIG_VP9_ENCODER
+const SadMxNVp9Func sad_64x64_avx2_vp9 = vp9_sad64x64_avx2;
+const SadMxNVp9Func sad_64x32_avx2_vp9 = vp9_sad64x32_avx2;
+const SadMxNVp9Func sad_32x64_avx2_vp9 = vp9_sad32x64_avx2;
+const SadMxNVp9Func sad_32x32_avx2_vp9 = vp9_sad32x32_avx2;
+const SadMxNVp9Func sad_32x16_avx2_vp9 = vp9_sad32x16_avx2;
+const SadMxNVp9Param avx2_vp9_tests[] = {
+  make_tuple(64, 64, sad_64x64_avx2_vp9),
+  make_tuple(64, 32, sad_64x32_avx2_vp9),
+  make_tuple(32, 64, sad_32x64_avx2_vp9),
+  make_tuple(32, 32, sad_32x32_avx2_vp9),
+  make_tuple(32, 16, sad_32x16_avx2_vp9),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADVP9Test, ::testing::ValuesIn(avx2_vp9_tests));
+
 const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
 const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
 INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
diff --git a/test/test-data.mk b/test/test-data.mk
index e4dae3a..e2da193 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -653,8 +653,30 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-02.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-02.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv422.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv422.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv440.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv440.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-10bit-yuv420.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-10bit-yuv420.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-12bit-yuv420.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-12bit-yuv420.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv422.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv422.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv422.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv422.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv440.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv440.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv440.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv440.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv444.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5
+endif  # CONFIG_VP9_HIGHBITDEPTH
 
 # Invalid files for testing libvpx error checking.
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 69d1d2f..3d1cd65 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -670,8 +670,28 @@
 5661b0168752969f055eec37b05fa9fa947dc7eb  vp90-2-16-intra-only.webm.md5
 c01bb7938f9a9f25e0c37afdec2f2fb73b6cc7fa  vp90-2-17-show-existing-frame.webm
 cc75f351818b9a619818f5cc77b9bc013d0c1e11  vp90-2-17-show-existing-frame.webm.md5
+013708bd043f0821a3e56fb8404d82e7a0c7af6c  vp91-2-04-yuv422.webm
+1e58a7d23adad830a672f1733c9d2ae17890d59c  vp91-2-04-yuv422.webm.md5
+25d78f28948789d159a9453ebc13048b818251b1  vp91-2-04-yuv440.webm
+81b3870b27a7f695ef6a43e87ab04bbdb5aee2f5  vp91-2-04-yuv440.webm.md5
 0321d507ce62dedc8a51b4e9011f7a19aed9c3dc  vp91-2-04-yuv444.webm
 367e423dd41fdb49aa028574a2cfec5c2f325c5c  vp91-2-04-yuv444.webm.md5
+f77673b566f686853adefe0c578ad251b7241281  vp92-2-20-10bit-yuv420.webm
+abdedfaddacbbe1a15ac7a54e86360f03629fb7a  vp92-2-20-10bit-yuv420.webm.md5
+0c2c355a1b17b28537c5a3b19997c8783b69f1af  vp92-2-20-12bit-yuv420.webm
+afb2c2798703e039189b0a15c8ac5685aa51d33f  vp92-2-20-12bit-yuv420.webm.md5
+0d661bc6e83da33238981481efd1b1802d323d88  vp93-2-20-10bit-yuv422.webm
+10318907063db22eb02fad332556edbbecd443cc  vp93-2-20-10bit-yuv422.webm.md5
+ebc6be2f7511a0bdeac0b18c67f84ba7168839c7  vp93-2-20-12bit-yuv422.webm
+235232267c6a1dc8a11e45d600f1c99d2f8b42d4  vp93-2-20-12bit-yuv422.webm.md5
+f76b11b26d4beaceac7a7e7729dd5054d095164f  vp93-2-20-10bit-yuv440.webm
+757b33b5ac969c5999999488a731a3d1e6d9fb88  vp93-2-20-10bit-yuv440.webm.md5
+df8807dbd29bec795c2db9c3c18e511fbb988101  vp93-2-20-12bit-yuv440.webm
+ea4100930c3f59a1c23fbb33ab0ea01151cae159  vp93-2-20-12bit-yuv440.webm.md5
+189c1b5f404ff41a50a7fc96341085ad541314a9  vp93-2-20-10bit-yuv444.webm
+2dd0177c2f9d970b6e698892634c653630f91f40  vp93-2-20-10bit-yuv444.webm.md5
+bd44cf6e1c27343e3639df9ac21346aedd5d6973  vp93-2-20-12bit-yuv444.webm
+f36e5bdf5ec3213f32c0ddc82f95d82c5133bf27  vp93-2-20-12bit-yuv444.webm.md5
 eb438c6540eb429f74404eedfa3228d409c57874  desktop_640_360_30.yuv
 89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab  kirland_640_480_30.yuv
 33c533192759e5bb4f07abfbac389dc259db4686  macmarcomoving_640_480_30.yuv
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 7efa8c0..432522c 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -184,6 +184,13 @@
   "vp90-2-18-resize.ivf", "vp90-2-19-skip.webm",
   "vp90-2-19-skip-01.webm", "vp90-2-19-skip-02.webm",
   "vp91-2-04-yuv444.webm",
+  "vp91-2-04-yuv422.webm", "vp91-2-04-yuv440.webm",
+#if CONFIG_VP9_HIGHBITDEPTH
+  "vp92-2-20-10bit-yuv420.webm", "vp92-2-20-12bit-yuv420.webm",
+  "vp93-2-20-10bit-yuv422.webm", "vp93-2-20-12bit-yuv422.webm",
+  "vp93-2-20-10bit-yuv440.webm", "vp93-2-20-12bit-yuv440.webm",
+  "vp93-2-20-10bit-yuv444.webm", "vp93-2-20-12bit-yuv444.webm",
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
 #endif  // CONFIG_VP9_DECODER
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 12f9734..75b2a3b 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -390,9 +390,9 @@
     denoiser->denoise_pars.scale_motion_thresh = 16;
     denoiser->denoise_pars.scale_increase_filter = 1;
     denoiser->denoise_pars.denoise_mv_bias = 60;
-    denoiser->denoise_pars.pickmode_mv_bias = 60;
-    denoiser->denoise_pars.qp_thresh = 100;
-    denoiser->denoise_pars.consec_zerolast = 10;
+    denoiser->denoise_pars.pickmode_mv_bias = 75;
+    denoiser->denoise_pars.qp_thresh = 85;
+    denoiser->denoise_pars.consec_zerolast = 15;
     denoiser->denoise_pars.spatial_blur = 20;
   }
 }
@@ -453,17 +453,17 @@
     // Bitrate thresholds and noise metric (nmse) thresholds for switching to
     // aggressive mode.
     // TODO(marpan): Adjust thresholds, including effect on resolution.
-    denoiser->bitrate_threshold = 200000;  // (bits/sec).
+    denoiser->bitrate_threshold = 300000;  // (bits/sec).
     denoiser->threshold_aggressive_mode = 35;
-    if (width * height > 640 * 480) {
-      denoiser->bitrate_threshold = 500000;
-      denoiser->threshold_aggressive_mode = 100;
+    if (width * height > 1280 * 720) {
+      denoiser->bitrate_threshold = 2000000;
+      denoiser->threshold_aggressive_mode = 1400;
     } else if (width * height > 960 * 540) {
       denoiser->bitrate_threshold = 800000;
       denoiser->threshold_aggressive_mode = 150;
-    } else if (width * height > 1280 * 720) {
-      denoiser->bitrate_threshold = 2000000;
-      denoiser->threshold_aggressive_mode = 1400;
+    } else if (width * height > 640 * 480) {
+      denoiser->bitrate_threshold = 500000;
+      denoiser->threshold_aggressive_mode = 100;
     }
     return 0;
 }
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index d8eff66..010c01a 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -3334,11 +3334,6 @@
       int index = block_index_row + (j >> 4);
       if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
         unsigned int sse;
-        const unsigned int mse = vp8_mse16x16(src + j,
-                                              ystride,
-                                              dst + j,
-                                              ystride,
-                                              &sse);
         const unsigned int var = vp8_variance16x16(src + j,
                                                    ystride,
                                                    dst + j,
@@ -3347,14 +3342,15 @@
         // Only consider this block as valid for noise measurement
         // if the sum_diff average of the current and previous frame
         // is small (to avoid effects from lighting change).
-        if ((mse - var) < 256) {
+        if ((sse - var) < 256) {
+          unsigned int sse2;
           const unsigned int act = vp8_variance16x16(src + j,
                                                      ystride,
                                                      const_source,
                                                      0,
-                                                     &sse);
+                                                     &sse2);
           if (act > 0)
-            total += mse / act;
+            total += sse / act;
           num_blocks++;
         }
       }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index de389e7..0530f3a 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -931,22 +931,22 @@
 specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad64x64 neon/, "$sse2_x86inc";
+specialize qw/vp9_sad64x64 neon avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x64/, "$sse2_x86inc";
+specialize qw/vp9_sad32x64 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad64x32/, "$sse2_x86inc";
+specialize qw/vp9_sad64x32 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x16/, "$sse2_x86inc";
+specialize qw/vp9_sad32x16 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vp9_sad16x32/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-specialize qw/vp9_sad32x32 neon/, "$sse2_x86inc";
+specialize qw/vp9_sad32x32 neon avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
 specialize qw/vp9_sad16x16 neon/, "$sse2_x86inc";
@@ -970,22 +970,22 @@
 specialize qw/vp9_sad4x4/, "$sse_x86inc";
 
 add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc";
+specialize qw/vp9_sad64x64_avg avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x64_avg/, "$sse2_x86inc";
+specialize qw/vp9_sad32x64_avg avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad64x32_avg/, "$sse2_x86inc";
+specialize qw/vp9_sad64x32_avg avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x16_avg/, "$sse2_x86inc";
+specialize qw/vp9_sad32x16_avg avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x32_avg/, "$sse2_x86inc";
+specialize qw/vp9_sad32x32_avg avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc";
@@ -1114,6 +1114,11 @@
 add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
 specialize qw/vp9_avg_8x8 sse2/;
 
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vp9_highbd_avg_8x8/;
+}
+
 # ENCODEMB INVOKE
 
 add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 3c9469c..baf6ab7 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -232,6 +232,8 @@
       cm->frame_refs[0].buf->corrupted = 1;
   }
 
+  pbi->ready_for_new_data = 0;
+
   // Check if the previous frame was a frame without any references to it.
   if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
     cm->release_fb_cb(cm->cb_priv,
@@ -279,8 +281,6 @@
     cm->current_video_frame++;
   }
 
-  pbi->ready_for_new_data = 0;
-
   cm->error.setjmp = 0;
   return retcode;
 }
@@ -296,12 +296,12 @@
   if (pbi->ready_for_new_data == 1)
     return ret;
 
+  pbi->ready_for_new_data = 1;
+
   /* no raw frame to show!!! */
   if (!cm->show_frame)
     return ret;
 
-  pbi->ready_for_new_data = 1;
-
 #if CONFIG_VP9_POSTPROC
   if (!cm->show_existing_frame) {
     ret = vp9_post_proc_frame(cm, sd, flags);
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index 22c6cc4..e9810c8 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "vp9/common/vp9_common.h"
 #include "vpx_ports/mem.h"
 
 unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
@@ -17,3 +18,16 @@
 
   return (sum + 32) >> 6;
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+  // Rounded mean of an 8x8 block of 16-bit samples; p is the row stride.
+  const uint16_t *row = CONVERT_TO_SHORTPTR(s8);
+  int row_idx, col_idx, total = 0;
+  for (row_idx = 0; row_idx < 8; ++row_idx, row += p) {
+    for (col_idx = 0; col_idx < 8; ++col_idx) total += row[col_idx];
+  }
+  return (total + 32) >> 6;  // +32 rounds the divide-by-64 to nearest.
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 3954fe6..421e049 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -935,26 +935,27 @@
   size_t total_size = 0;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
+  TileInfo tile[4][1 << 6];
+  TOKENEXTRA *pre_tok = cpi->tok;
+  int tile_tok = 0;
 
   vpx_memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) *
              mi_cols_aligned_to_sb(cm->mi_cols));
 
-  tok[0][0] = cpi->tok;
-  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-    if (tile_row)
-      tok[tile_row][0] = tok[tile_row - 1][tile_cols - 1] +
-                         cpi->tok_count[tile_row - 1][tile_cols - 1];
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col);
 
-    for (tile_col = 1; tile_col < tile_cols; tile_col++)
-      tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] +
-                                cpi->tok_count[tile_row][tile_col - 1];
+      tok[tile_row][tile_col] = pre_tok + tile_tok;
+      pre_tok = tok[tile_row][tile_col];
+      tile_tok = allocated_tokens(tile[tile_row][tile_col]);
+    }
   }
 
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-      TileInfo tile;
+      const TileInfo * const ptile = &tile[tile_row][tile_col];
 
-      vp9_tile_init(&tile, cm, tile_row, tile_col);
       tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
 
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
@@ -962,7 +963,7 @@
       else
         vp9_start_encode(&residual_bc, data_ptr + total_size);
 
-      write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col], tok_end);
+      write_modes(cpi, ptile, &residual_bc, &tok[tile_row][tile_col], tok_end);
       assert(tok[tile_row][tile_col] == tok_end);
       vp9_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index b488261..4f0e46f 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -29,7 +29,7 @@
           (is_two_pass_svc(cpi) &&
            cpi->svc.spatial_layer_id == 0 &&
            cpi->svc.layer_context[0].gold_ref_idx >=0 &&
-           cpi->oxcf.ss_play_alternate[0]));
+           cpi->oxcf.ss_enable_auto_arf[0]));
 }
 
 #ifdef __cplusplus
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index 6b28ee5..47d9580 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -34,6 +34,7 @@
   int is_coded;
   int num_4x4_blk;
   int skip;
+  int pred_pixel_ready;
   // For current partition, only if all Y, U, and V transform blocks'
   // coefficients are quantized to 0, skippable is set to 0.
   int skippable;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index b4fe70a..a7370bf 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -533,8 +533,19 @@
         int sum = 0;
 
         if (x_idx < pixels_wide && y_idx < pixels_high) {
-          int s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
-          int d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+          int s_avg, d_avg;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            s_avg = vp9_highbd_avg_8x8(s + y_idx * sp + x_idx, sp);
+            d_avg = vp9_highbd_avg_8x8(d + y_idx * dp + x_idx, dp);
+          } else {
+            s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
+            d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+          }
+#else
+          s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
+          d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+#endif
           sum = s_avg - d_avg;
           sse = sum * sum;
         }
@@ -739,8 +750,8 @@
                      x->e_mbd.plane[i].subsampling_y);
 }
 
-static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,
-                                   int64_t *dist, BLOCK_SIZE bsize) {
+static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+                                   RD_COST *rd_cost, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   INTERP_FILTER filter_ref;
@@ -766,8 +777,7 @@
   xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = 0;
   x->skip = 1;
 
-  *rate = 0;
-  *dist = 0;
+  vp9_rd_cost_init(rd_cost);
 }
 
 static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
@@ -802,6 +812,7 @@
   }
   ctx->is_coded = 0;
   ctx->skippable = 0;
+  ctx->pred_pixel_ready = 0;
   x->skip_recode = 0;
 
   // Set to zero to make sure we do not use the previous encoded frame stats
@@ -1150,79 +1161,6 @@
   }
 }
 
-static void copy_partitioning(VP9_COMMON *cm, MODE_INFO *mi_8x8,
-  MODE_INFO *prev_mi_8x8) {
-  const int mis = cm->mi_stride;
-  int block_row, block_col;
-
-  for (block_row = 0; block_row < 8; ++block_row) {
-    for (block_col = 0; block_col < 8; ++block_col) {
-      MODE_INFO *const prev_mi =
-          prev_mi_8x8[block_row * mis + block_col].src_mi;
-      const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
-
-      if (prev_mi) {
-        const ptrdiff_t offset = prev_mi - cm->prev_mi;
-        mi_8x8[block_row * mis + block_col].src_mi = cm->mi + offset;
-        mi_8x8[block_row * mis + block_col].src_mi->mbmi.sb_type = sb_type;
-      }
-    }
-  }
-}
-
-static void constrain_copy_partitioning(VP9_COMP *const cpi,
-                                        const TileInfo *const tile,
-                                        MODE_INFO *mi_8x8,
-                                        MODE_INFO *prev_mi_8x8,
-                                        int mi_row, int mi_col,
-                                        BLOCK_SIZE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mi_stride;
-  const int row8x8_remaining = tile->mi_row_end - mi_row;
-  const int col8x8_remaining = tile->mi_col_end - mi_col;
-  MODE_INFO *const mi_upper_left = cm->mi + mi_row * mis + mi_col;
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  int block_row, block_col;
-
-  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
-
-  // If the SB64 if it is all "in image".
-  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
-      (row8x8_remaining >= MI_BLOCK_SIZE)) {
-    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
-      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
-        const int index = block_row * mis + block_col;
-        MODE_INFO *prev_mi = prev_mi_8x8[index].src_mi;
-        const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
-        // Use previous partition if block size is not larger than bsize.
-        if (prev_mi && sb_type <= bsize) {
-          int block_row2, block_col2;
-          for (block_row2 = 0; block_row2 < bh; ++block_row2) {
-            for (block_col2 = 0; block_col2 < bw; ++block_col2) {
-              const int index2 = (block_row + block_row2) * mis +
-                  block_col + block_col2;
-              prev_mi = prev_mi_8x8[index2].src_mi;
-              if (prev_mi) {
-                const ptrdiff_t offset = prev_mi - cm->prev_mi;
-                mi_8x8[index2].src_mi = cm->mi + offset;
-                mi_8x8[index2].src_mi->mbmi.sb_type = prev_mi->mbmi.sb_type;
-              }
-            }
-          }
-        } else {
-          // Otherwise, use fixed partition of size bsize.
-          mi_8x8[index].src_mi = mi_upper_left + index;
-          mi_8x8[index].src_mi->mbmi.sb_type = bsize;
-        }
-      }
-    }
-  } else {
-    // Else this is a partial SB64, copy previous partition.
-    copy_partitioning(cm, mi_8x8, prev_mi_8x8);
-  }
-}
-
 const struct {
   int row;
   int col;
@@ -1353,27 +1291,6 @@
   return this_sad < 2 * threshold;
 }
 
-static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO *prev_mi_8x8,
-                         const int motion_thresh) {
-  const int mis = cm->mi_stride;
-  int block_row, block_col;
-
-  if (cm->prev_mi) {
-    for (block_row = 0; block_row < 8; ++block_row) {
-      for (block_col = 0; block_col < 8; ++block_col) {
-        const MODE_INFO *prev_mi =
-            prev_mi_8x8[block_row * mis + block_col].src_mi;
-        if (prev_mi) {
-          if (abs(prev_mi->mbmi.mv[0].as_mv.row) > motion_thresh ||
-              abs(prev_mi->mbmi.mv[0].as_mv.col) > motion_thresh)
-            return 1;
-        }
-      }
-    }
-  }
-  return 0;
-}
-
 static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
                             int mi_row, int mi_col, int bsize) {
   VP9_COMMON *const cm = &cpi->common;
@@ -2542,10 +2459,6 @@
 
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO *mi = cm->mi + idx_str;
-    MODE_INFO *prev_mi = NULL;
-
-    if (cm->frame_type != KEY_FRAME)
-      prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi;
 
     if (sf->adaptive_pred_interp_filter) {
       for (i = 0; i < 64; ++i)
@@ -2562,9 +2475,6 @@
     vp9_zero(cpi->mb.pred_mv);
     cpi->pc_root->index = 0;
 
-    // TODO(yunqingwang): use_lastframe_partitioning is no longer used in good-
-    // quality encoding. Need to evaluate it in real-time encoding later to
-    // decide if it can be removed too. And then, do the code cleanup.
     cpi->mb.source_variance = UINT_MAX;
     if (sf->partition_search_type == FIXED_PARTITION) {
       set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
@@ -2572,38 +2482,18 @@
                              sf->always_this_block_size);
       rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
                        &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-    } else if (cpi->partition_search_skippable_frame ||
-               sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
+    } else if (cpi->partition_search_skippable_frame) {
       BLOCK_SIZE bsize;
       set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
       bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
       set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
                        &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-      } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
-                 cm->frame_type != KEY_FRAME ) {
+    } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
+               cm->frame_type != KEY_FRAME ) {
       choose_partitioning(cpi, tile, mi_row, mi_col);
       rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
                        &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-    } else if (sf->partition_search_type == SEARCH_PARTITION &&
-               sf->use_lastframe_partitioning &&
-               (cpi->rc.frames_since_key %
-                   sf->last_partitioning_redo_frequency) &&
-               cm->prev_mi &&
-               cm->show_frame &&
-               cm->frame_type != KEY_FRAME &&
-               !cpi->rc.is_src_frame_alt_ref &&
-               ((sf->use_lastframe_partitioning !=
-                   LAST_FRAME_PARTITION_LOW_MOTION) ||
-                   !sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) {
-      if (sf->constrain_copy_partition &&
-          sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))
-        constrain_copy_partitioning(cpi, tile, mi, prev_mi,
-                                    mi_row, mi_col, BLOCK_16X16);
-      else
-        copy_partitioning(cm, mi, prev_mi);
-      rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                       &dummy_rate, &dummy_dist, 1, cpi->pc_root);
     } else {
       // If required set upper and lower partition size limits
       if (sf->auto_min_max_partition_size) {
@@ -2686,8 +2576,7 @@
 }
 
 static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                                int mi_row, int mi_col,
-                                int *rate, int64_t *dist,
+                                int mi_row, int mi_col, RD_COST *rd_cost,
                                 BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
@@ -2702,11 +2591,14 @@
       x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
 
   if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
-    set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize);
+    set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
   else
-    vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize, ctx);
+    vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rd_cost, bsize, ctx);
 
   duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+
+  if (rd_cost->rate == INT_MAX)
+    vp9_rd_cost_reset(rd_cost);
 }
 
 static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
@@ -2768,8 +2660,8 @@
 
 static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
                                  TOKENEXTRA **tp, int mi_row,
-                                 int mi_col, BLOCK_SIZE bsize, int *rate,
-                                 int64_t *dist, int do_recon, int64_t best_rd,
+                                 int mi_col, BLOCK_SIZE bsize, RD_COST *rd_cost,
+                                 int do_recon, int64_t best_rd,
                                  PC_TREE *pc_tree) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -2781,9 +2673,7 @@
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
   int i;
   BLOCK_SIZE subsize = bsize;
-  int this_rate, sum_rate = 0, best_rate = INT_MAX;
-  int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
-  int64_t sum_rd = 0;
+  RD_COST this_rdc, sum_rdc, best_rdc;
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
   // Override skipping rectangular partition operations for edge blocks
@@ -2802,6 +2692,10 @@
   assert(num_8x8_blocks_wide_lookup[bsize] ==
              num_8x8_blocks_high_lookup[bsize]);
 
+  vp9_rd_cost_init(&sum_rdc);
+  vp9_rd_cost_reset(&best_rdc);
+  best_rdc.rdcost = best_rd;
+
   // Determine partition types in search according to the speed features.
   // The threshold set here has to be of square block size.
   if (sf->auto_min_max_partition_size) {
@@ -2823,33 +2717,33 @@
   // PARTITION_NONE
   if (partition_none_allowed) {
     nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
-                        &this_rate, &this_dist, bsize, ctx);
+                        &this_rdc, bsize, ctx);
     ctx->mic.mbmi = xd->mi[0].src_mi->mbmi;
     ctx->skip_txfm[0] = x->skip_txfm[0];
     ctx->skip = x->skip;
+    ctx->pred_pixel_ready = 0;
 
-    if (this_rate != INT_MAX) {
+    if (this_rdc.rate != INT_MAX) {
       int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      this_rate += cpi->partition_cost[pl][PARTITION_NONE];
-      sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
-      if (sum_rd < best_rd) {
-        int64_t stop_thresh = 4096;
-        int64_t stop_thresh_rd;
+      this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              this_rdc.rate, this_rdc.dist);
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr;
+        int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr;
 
-        best_rate = this_rate;
-        best_dist = this_dist;
-        best_rd = sum_rd;
+        dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
+            b_height_log2_lookup[bsize]);
+
+        rate_breakout_thr *= num_pels_log2_lookup[bsize];
+
+        best_rdc = this_rdc;
         if (bsize >= BLOCK_8X8)
           pc_tree->partitioning = PARTITION_NONE;
 
-        // Adjust threshold according to partition size.
-        stop_thresh >>= 8 - (b_width_log2_lookup[bsize] +
-            b_height_log2_lookup[bsize]);
-
-        stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
-        // If obtained distortion is very small, choose current partition
-        // and stop splitting.
-        if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
+        if (!x->e_mbd.lossless &&
+            this_rdc.rate < rate_breakout_thr &&
+            this_rdc.dist < dist_breakout_thr) {
           do_split = 0;
           do_rect = 0;
         }
@@ -2861,12 +2755,12 @@
   store_pred_mv(x, ctx);
 
   // PARTITION_SPLIT
-  sum_rd = 0;
   if (do_split) {
     int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-    sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+    sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+    sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
     subsize = get_subsize(bsize, PARTITION_SPLIT);
-    for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+    for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
       const int x_idx = (i & 1) * ms;
       const int y_idx = (i >> 1) * ms;
 
@@ -2874,22 +2768,20 @@
         continue;
       load_pred_mv(x, ctx);
       nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
-                           subsize, &this_rate, &this_dist, 0,
-                           best_rd - sum_rd, pc_tree->split[i]);
+                           subsize, &this_rdc, 0,
+                           best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
 
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
       } else {
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
       }
     }
 
-    if (sum_rd < best_rd) {
-      best_rate = sum_rate;
-      best_dist = sum_dist;
-      best_rd = sum_rd;
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_SPLIT;
     } else {
       // skip rectangular partition test when larger block size
@@ -2905,40 +2797,38 @@
     if (sf->adaptive_motion_search)
       load_pred_mv(x, ctx);
 
-    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
-                        &this_rate, &this_dist, subsize,
+    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->horizontal[0]);
 
     pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
     pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
     pc_tree->horizontal[0].skip = x->skip;
+    pc_tree->horizontal[0].pred_pixel_ready = 0;
 
-    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-
-    if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + ms < cm->mi_rows) {
       load_pred_mv(x, ctx);
-      nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col,
-                          &this_rate, &this_dist, subsize,
+      nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rdc, subsize,
                           &pc_tree->horizontal[1]);
 
       pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->horizontal[1].skip = x->skip;
+      pc_tree->horizontal[1].pred_pixel_ready = 0;
 
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
       } else {
         int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-        this_rate += cpi->partition_cost[pl][PARTITION_HORZ];
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        this_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                sum_rdc.rate, sum_rdc.dist);
       }
     }
-    if (sum_rd < best_rd) {
-      best_rd = sum_rd;
-      best_rate = sum_rate;
-      best_dist = sum_dist;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_HORZ;
     }
   }
@@ -2950,55 +2840,53 @@
     if (sf->adaptive_motion_search)
       load_pred_mv(x, ctx);
 
-    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
-                        &this_rate, &this_dist, subsize,
+    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->vertical[0]);
     pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
     pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
     pc_tree->vertical[0].skip = x->skip;
-    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-    if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
+    pc_tree->vertical[0].pred_pixel_ready = 0;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + ms < cm->mi_cols) {
       load_pred_mv(x, ctx);
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms,
-                          &this_rate, &this_dist, subsize,
+      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rdc, subsize,
                           &pc_tree->vertical[1]);
       pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->vertical[1].skip = x->skip;
-      if (this_rate == INT_MAX) {
-        sum_rd = INT64_MAX;
+      pc_tree->vertical[1].pred_pixel_ready = 0;
+
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
       } else {
         int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-        this_rate += cpi->partition_cost[pl][PARTITION_VERT];
-        sum_rate += this_rate;
-        sum_dist += this_dist;
-        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                sum_rdc.rate, sum_rdc.dist);
       }
     }
-    if (sum_rd < best_rd) {
-      best_rate = sum_rate;
-      best_dist = sum_dist;
-      best_rd = sum_rd;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
       pc_tree->partitioning = PARTITION_VERT;
     }
   }
-  // TODO(JBB): The following line is here just to avoid a static warning
-  // that occurs because at this point we never again reuse best_rd
-  // despite setting it here.  The code should be refactored to avoid this.
-  (void) best_rd;
 
-  *rate = best_rate;
-  *dist = best_dist;
+  *rd_cost = best_rdc;
 
-  if (best_rate == INT_MAX)
+  if (best_rdc.rate == INT_MAX) {
+    vp9_rd_cost_reset(rd_cost);
     return;
+  }
 
   // update mode info array
   subsize = get_subsize(bsize, pc_tree->partitioning);
   fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, subsize,
                     pc_tree);
 
-  if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) {
     int output_enabled = (bsize == BLOCK_64X64);
 
     // Check the projected output rate for this SB against it's target
@@ -3006,33 +2894,32 @@
     // closer to the target.
     if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
       vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
-                                    best_rate);
+                                    best_rdc.rate);
     }
 
     if (oxcf->aq_mode == CYCLIC_REFRESH_AQ)
       vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
-                                              best_rate, best_dist);
+                                              best_rdc.rate, best_rdc.dist);
 
     encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
   }
 
   if (bsize == BLOCK_64X64) {
     assert(tp_orig < *tp);
-    assert(best_rate < INT_MAX);
-    assert(best_dist < INT64_MAX);
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
   } else {
     assert(tp_orig == *tp);
   }
 }
 
-static void nonrd_use_partition(VP9_COMP *cpi,
-                                const TileInfo *const tile,
-                                MODE_INFO *mi,
-                                TOKENEXTRA **tp,
-                                int mi_row, int mi_col,
-                                BLOCK_SIZE bsize, int output_enabled,
-                                int *totrate, int64_t *totdist,
-                                PC_TREE *pc_tree) {
+static void nonrd_select_partition(VP9_COMP *cpi,
+                                   const TileInfo *const tile,
+                                   MODE_INFO *mi,
+                                   TOKENEXTRA **tp,
+                                   int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize, int output_enabled,
+                                   RD_COST *rd_cost, PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -3040,9 +2927,150 @@
   const int mis = cm->mi_stride;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-  int rate = INT_MAX;
-  int64_t dist = INT64_MAX;
+  RD_COST this_rdc;
 
+  vp9_rd_cost_reset(&this_rdc);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  subsize = (bsize >= BLOCK_8X8) ? mi[0].src_mi->mbmi.sb_type : BLOCK_4X4;
+  partition = partition_lookup[bsl][subsize];
+
+  // A 32x32 block (non-NONE partition, subsize >= 16x16) or a 16x16 block
+  // (non-NONE partition) is searched with nonrd_pick_partition() after
+  // overwriting sf.min/max_partition_size (not restored in this function).
+  // All other cases follow the partition derived from mi[0] above.
+  if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
+      subsize >= BLOCK_16X16) {
+    cpi->sf.max_partition_size = BLOCK_32X32;
+    cpi->sf.min_partition_size = BLOCK_8X8;
+    nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else if (bsize == BLOCK_16X16 && partition != PARTITION_NONE) {
+    cpi->sf.max_partition_size = BLOCK_16X16;
+    cpi->sf.min_partition_size = BLOCK_8X8;
+    nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->none);
+        pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi;
+        pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->none.skip = x->skip;
+        pc_tree->none.pred_pixel_ready = 1;
+        break;
+      case PARTITION_VERT:
+        nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->vertical[0]);
+        pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+        pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->vertical[0].skip = x->skip;
+        pc_tree->vertical[0].pred_pixel_ready = 1;
+        if (mi_col + hbs < cm->mi_cols) {
+          nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
+                              &this_rdc, subsize, &pc_tree->vertical[1]);
+          pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+          pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+          pc_tree->vertical[1].skip = x->skip;
+          pc_tree->vertical[1].pred_pixel_ready = 1;
+          // Add the second half's cost only if neither side holds an
+          // INT_MAX / INT64_MAX marker value.
+          if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+              rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+            rd_cost->rate += this_rdc.rate;
+            rd_cost->dist += this_rdc.dist;
+          }
+        }
+        break;
+      case PARTITION_HORZ:
+        nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->horizontal[0]);
+        pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+        pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->horizontal[0].skip = x->skip;
+        pc_tree->horizontal[0].pred_pixel_ready = 1;
+        if (mi_row + hbs < cm->mi_rows) {
+          // The half at mi_row + hbs gets its own context, horizontal[1],
+          // just as the vertical case uses vertical[1].
+          nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
+                              &this_rdc, subsize, &pc_tree->horizontal[1]);
+          pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+          pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+          pc_tree->horizontal[1].skip = x->skip;
+          pc_tree->horizontal[1].pred_pixel_ready = 1;
+          if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+              rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+            rd_cost->rate += this_rdc.rate;
+            rd_cost->dist += this_rdc.dist;
+          }
+        }
+        break;
+      case PARTITION_SPLIT:
+        subsize = get_subsize(bsize, PARTITION_SPLIT);
+        nonrd_select_partition(cpi, tile, mi, tp, mi_row, mi_col,
+                               subsize, output_enabled, rd_cost,
+                               pc_tree->split[0]);
+        nonrd_select_partition(cpi, tile, mi + hbs, tp,
+                               mi_row, mi_col + hbs, subsize, output_enabled,
+                               &this_rdc, pc_tree->split[1]);
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
+        }
+        nonrd_select_partition(cpi, tile, mi + hbs * mis, tp,
+                               mi_row + hbs, mi_col, subsize, output_enabled,
+                               &this_rdc, pc_tree->split[2]);
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
+        }
+        nonrd_select_partition(cpi, tile, mi + hbs * mis + hbs, tp,
+                               mi_row + hbs, mi_col + hbs, subsize,
+                               output_enabled, &this_rdc, pc_tree->split[3]);
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
+        }
+        break;
+      default:
+        // A bare string literal is always true; "0 &&" makes this fire.
+        assert(0 && "Invalid partition type.");
+        break;
+    }
+  }
+
+  // Only the 64x64 call with output_enabled set encodes the superblock.
+  if (bsize == BLOCK_64X64 && output_enabled) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
+                                              rd_cost->rate, rd_cost->dist);
+    encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize, pc_tree);
+  }
+}
+
+
+static void nonrd_use_partition(VP9_COMP *cpi,
+                                const TileInfo *const tile,
+                                MODE_INFO *mi,
+                                TOKENEXTRA **tp,
+                                int mi_row, int mi_col,
+                                BLOCK_SIZE bsize, int output_enabled,
+                                RD_COST *rd_cost, PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  const int mis = cm->mi_stride;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  RD_COST this_rdc;
+
+  vp9_rd_cost_reset(&this_rdc);
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
@@ -3051,78 +3069,80 @@
 
   switch (partition) {
     case PARTITION_NONE:
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, rd_cost,
                           subsize, &pc_tree->none);
       pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
       pc_tree->none.skip = x->skip;
       break;
     case PARTITION_VERT:
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, rd_cost,
                           subsize, &pc_tree->vertical[0]);
       pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->vertical[0].skip = x->skip;
       if (mi_col + hbs < cm->mi_cols) {
         nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
-                            &rate, &dist, subsize, &pc_tree->vertical[1]);
+                            &this_rdc, subsize, &pc_tree->vertical[1]);
         pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
         pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->vertical[1].skip = x->skip;
-        if (rate != INT_MAX && dist != INT64_MAX &&
-            *totrate != INT_MAX && *totdist != INT64_MAX) {
-          *totrate += rate;
-          *totdist += dist;
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
         }
       }
       break;
     case PARTITION_HORZ:
-      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, rd_cost,
                           subsize, &pc_tree->horizontal[0]);
       pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
       pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->horizontal[0].skip = x->skip;
       if (mi_row + hbs < cm->mi_rows) {
+        // The half at mi_row + hbs gets its own context, horizontal[1],
+        // just as the vertical case uses vertical[1].
         nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
-                            &rate, &dist, subsize, &pc_tree->horizontal[0]);
+                            &this_rdc, subsize, &pc_tree->horizontal[1]);
         pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
         pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->horizontal[1].skip = x->skip;
-        if (rate != INT_MAX && dist != INT64_MAX &&
-            *totrate != INT_MAX && *totdist != INT64_MAX) {
-          *totrate += rate;
-          *totdist += dist;
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
         }
       }
       break;
     case PARTITION_SPLIT:
       subsize = get_subsize(bsize, PARTITION_SPLIT);
       nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
-                          subsize, output_enabled, totrate, totdist,
+                          subsize, output_enabled, rd_cost,
                           pc_tree->split[0]);
       nonrd_use_partition(cpi, tile, mi + hbs, tp,
                           mi_row, mi_col + hbs, subsize, output_enabled,
-                          &rate, &dist, pc_tree->split[1]);
-      if (rate != INT_MAX && dist != INT64_MAX &&
-          *totrate != INT_MAX && *totdist != INT64_MAX) {
-        *totrate += rate;
-        *totdist += dist;
+                          &this_rdc, pc_tree->split[1]);
+      if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+          rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+        rd_cost->rate += this_rdc.rate;
+        rd_cost->dist += this_rdc.dist;
       }
       nonrd_use_partition(cpi, tile, mi + hbs * mis, tp,
                           mi_row + hbs, mi_col, subsize, output_enabled,
-                          &rate, &dist, pc_tree->split[2]);
-      if (rate != INT_MAX && dist != INT64_MAX &&
-          *totrate != INT_MAX && *totdist != INT64_MAX) {
-        *totrate += rate;
-        *totdist += dist;
+                          &this_rdc, pc_tree->split[2]);
+      if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+          rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+        rd_cost->rate += this_rdc.rate;
+        rd_cost->dist += this_rdc.dist;
       }
       nonrd_use_partition(cpi, tile, mi + hbs * mis + hbs, tp,
                           mi_row + hbs, mi_col + hbs, subsize, output_enabled,
-                          &rate, &dist, pc_tree->split[3]);
-      if (rate != INT_MAX && dist != INT64_MAX &&
-          *totrate != INT_MAX && *totdist != INT64_MAX) {
-        *totrate += rate;
-        *totdist += dist;
+                          &this_rdc, pc_tree->split[3]);
+      if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+          rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+        rd_cost->rate += this_rdc.rate;
+        rd_cost->dist += this_rdc.dist;
       }
       break;
     default:
@@ -3133,7 +3151,7 @@
   if (bsize == BLOCK_64X64 && output_enabled) {
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
       vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
-                                              *totrate, *totdist);
+                                              rd_cost->rate, rd_cost->dist);
     encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize, pc_tree);
   }
 }
@@ -3153,52 +3171,53 @@
   // Code each SB in the row
   for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
        mi_col += MI_BLOCK_SIZE) {
-    int dummy_rate = 0;
-    int64_t dummy_dist = 0;
+    RD_COST dummy_rdc;
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO *mi = cm->mi + idx_str;
     BLOCK_SIZE bsize;
     x->in_static_area = 0;
     x->source_variance = UINT_MAX;
     vp9_zero(x->pred_mv);
+    vp9_rd_cost_init(&dummy_rdc);
 
     // Set the partition type of the 64X64 block
     switch (sf->partition_search_type) {
       case VAR_BASED_PARTITION:
         choose_partitioning(cpi, tile, mi_row, mi_col);
         nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                            1, &dummy_rate, &dummy_dist, cpi->pc_root);
+                            1, &dummy_rdc, cpi->pc_root);
         break;
       case SOURCE_VAR_BASED_PARTITION:
         set_source_var_based_partition(cpi, tile, mi, mi_row, mi_col);
         nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                            1, &dummy_rate, &dummy_dist, cpi->pc_root);
+                            1, &dummy_rdc, cpi->pc_root);
         break;
-      case VAR_BASED_FIXED_PARTITION:
       case FIXED_PARTITION:
         bsize = sf->partition_search_type == FIXED_PARTITION ?
                 sf->always_this_block_size :
                 get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col);
         set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
         nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                            1, &dummy_rate, &dummy_dist, cpi->pc_root);
+                            1, &dummy_rdc, cpi->pc_root);
         break;
       case REFERENCE_PARTITION:
-        if (sf->partition_check ||
-            !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) {
-          set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+        x->in_static_area = is_background(cpi, tile, mi_row, mi_col);
+
+        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+            xd->mi[0].src_mi->mbmi.segment_id && x->in_static_area) {
           auto_partition_range(cpi, tile, mi_row, mi_col,
                                &sf->min_partition_size,
                                &sf->max_partition_size);
           nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                               &dummy_rate, &dummy_dist, 1, INT64_MAX,
-                               cpi->pc_root);
+                               &dummy_rdc, 1,
+                               INT64_MAX, cpi->pc_root);
         } else {
           choose_partitioning(cpi, tile, mi_row, mi_col);
-          nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
-                              BLOCK_64X64, 1, &dummy_rate, &dummy_dist,
-                              cpi->pc_root);
+          nonrd_select_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                                 1, &dummy_rdc, cpi->pc_root);
         }
+
         break;
       default:
         assert(0);
@@ -3338,25 +3357,39 @@
   const VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
+
   int tile_col, tile_row;
-  TOKENEXTRA *tok = cpi->tok;
+  TileInfo tile[4][1 << 6];
+  TOKENEXTRA *tok[4][1 << 6];
+  TOKENEXTRA *pre_tok = cpi->tok;
+  int tile_tok = 0;
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      TileInfo tile;
-      TOKENEXTRA *old_tok = tok;
+      vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col);
+
+      tok[tile_row][tile_col] = pre_tok + tile_tok;
+      pre_tok = tok[tile_row][tile_col];
+      tile_tok = allocated_tokens(tile[tile_row][tile_col]);
+    }
+  }
+
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      const TileInfo * const ptile = &tile[tile_row][tile_col];
+      TOKENEXTRA * const old_tok = tok[tile_row][tile_col];
       int mi_row;
 
-      vp9_tile_init(&tile, cm, tile_row, tile_col);
-      for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
+      for (mi_row = ptile->mi_row_start; mi_row < ptile->mi_row_end;
            mi_row += MI_BLOCK_SIZE) {
         if (cpi->sf.use_nonrd_pick_mode && !frame_is_intra_only(cm))
-          encode_nonrd_sb_row(cpi, &tile, mi_row, &tok);
+          encode_nonrd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]);
         else
-          encode_rd_sb_row(cpi, &tile, mi_row, &tok);
+          encode_rd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]);
       }
-      cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok - old_tok);
-      assert(tok - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
+      cpi->tok_count[tile_row][tile_col] =
+          (unsigned int)(tok[tile_row][tile_col] - old_tok);
+      assert(tok[tile_row][tile_col] - old_tok <= allocated_tokens(*ptile));
     }
   }
 }
@@ -3714,7 +3747,7 @@
       vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                            &xd->block_refs[ref]->sf);
     }
-    if (!cpi->sf.reuse_inter_pred_sby || seg_skip)
+    if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip)
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
 
     vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 1758e3f..c5e8726 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1436,25 +1436,6 @@
 
   cpi->refresh_alt_ref_frame = 0;
 
-  // Note that at the moment multi_arf will not work with svc.
-  // For the current check in all the execution paths are defaulted to 0
-  // pending further tuning and testing. The code is left in place here
-  // as a place holder in regard to the required paths.
-  cpi->multi_arf_last_grp_enabled = 0;
-  if (oxcf->pass == 2) {
-    if (cpi->use_svc) {
-      cpi->multi_arf_allowed = 0;
-      cpi->multi_arf_enabled = 0;
-    } else {
-      // Disable by default for now.
-      cpi->multi_arf_allowed = 0;
-      cpi->multi_arf_enabled = 0;
-    }
-  } else {
-    cpi->multi_arf_allowed = 0;
-    cpi->multi_arf_enabled = 0;
-  }
-
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
   cpi->b_calculate_ssimg = 0;
@@ -3431,6 +3412,16 @@
 
   vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
 
+  // Decide whether multi-arf is allowed.
+  // Note that at the moment multi_arf is only configured for 2 pass VBR and
+  // will not work properly with svc.
+  if ((oxcf->pass == 2) && !cpi->use_svc &&
+      (cpi->oxcf.enable_auto_arf > 1) && (cpi->oxcf.rc_mode == VPX_VBR))
+    cpi->multi_arf_allowed = 1;
+  else
+    cpi->multi_arf_allowed = 0;
+  cpi->multi_arf_last_grp_enabled = 0;
+
   // Normal defaults
   cm->reset_frame_context = 0;
   cm->refresh_frame_context = 1;
@@ -3456,7 +3447,7 @@
         int i;
         // Reference a hidden frame from a lower layer
         for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) {
-          if (oxcf->ss_play_alternate[i]) {
+          if (oxcf->ss_enable_auto_arf[i]) {
             cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx;
             break;
           }
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 6091ae2..1e60474 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -178,13 +178,12 @@
   int ts_number_layers;  // Number of temporal layers.
   // Bitrate allocation for spatial layers.
   int ss_target_bitrate[VPX_SS_MAX_LAYERS];
-  int ss_play_alternate[VPX_SS_MAX_LAYERS];
+  int ss_enable_auto_arf[VPX_SS_MAX_LAYERS];
   // Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
   int ts_target_bitrate[VPX_TS_MAX_LAYERS];
   int ts_rate_decimator[VPX_TS_MAX_LAYERS];
 
-  // these parameters aren't to be used in final build don't use!!!
-  int play_alternate;
+  int enable_auto_arf;
 
   int encode_breakout;  // early breakout : for video conf recommend 800
 
@@ -481,6 +480,15 @@
   return mb_rows * mb_cols * (16 * 16 * 3 + 4);
 }
 
+// Returns the token allocation for one tile by applying the frame-level
+// formula (get_token_alloc) to the tile's own row and column extents.
+static INLINE int allocated_tokens(TileInfo tile) {
+  // Halve each mi extent, rounding up, to get the macroblock counts.
+  const int mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1;
+  const int mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1;
+  return get_token_alloc(mb_rows, mb_cols);
+}
+
 int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
 #if CONFIG_VP9_HIGHBITDEPTH
 int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
@@ -511,9 +519,9 @@
 
 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
   return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 &&
-         (cpi->oxcf.play_alternate &&
+         (cpi->oxcf.enable_auto_arf &&
           (!is_two_pass_svc(cpi) ||
-           cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]));
+           cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]));
 }
 
 static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 96c3e0a..f1baf83 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -66,13 +66,6 @@
   *b = temp;
 }
 
-static int gfboost_qadjust(int qindex, vpx_bit_depth_t bit_depth) {
-  const double q = vp9_convert_qindex_to_q(qindex, bit_depth);
-  return (int)((0.00000828 * q * q * q) +
-               (-0.0055 * q * q) +
-               (1.32 * q) + 79.3);
-}
-
 // Resets the first pass file to the given position using a relative seek from
 // the current position.
 static void reset_fpf_position(TWO_PASS *p,
@@ -1317,14 +1310,15 @@
                                double this_frame_mv_in_out,
                                double max_boost) {
   double frame_boost;
-  const double lq = vp9_convert_qindex_to_q(cpi->rc.last_q[INTER_FRAME],
-                                            cpi->common.bit_depth);
-  const double q_correction = MIN((0.8 + (lq * 0.001)), 1.0);
+  const double lq =
+    vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
+                            cpi->common.bit_depth);
+  const double boost_correction = MIN((0.5 + (lq * 0.015)), 1.5);
 
   // Underlying boost factor is based on inter error ratio.
   frame_boost = (BASELINE_ERR_PER_MB * cpi->common.MBs) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
-  frame_boost = frame_boost * BOOST_FACTOR * q_correction;
+  frame_boost = frame_boost * BOOST_FACTOR * boost_correction;
 
   // Increase boost for frames where new data coming into frame (e.g. zoom out).
   // Slightly reduce boost if there is a net balance of motion out of the frame
@@ -1335,7 +1329,7 @@
   else
     frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
 
-  return MIN(frame_boost, max_boost * q_correction);
+  return MIN(frame_boost, max_boost * boost_correction);
 }
 
 static int calc_arf_boost(VP9_COMP *cpi, int offset,
@@ -1874,19 +1868,8 @@
   gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
 
   // Calculate the extra bits to be used for boosted frame(s)
-  {
-    int q = rc->last_q[INTER_FRAME];
-    int boost =
-        (rc->gfu_boost * gfboost_qadjust(q, cpi->common.bit_depth)) / 100;
-
-    // Set max and minimum boost and hence minimum allocation.
-    boost = clamp(boost, MIN_ARF_GF_BOOST,
-                  (rc->baseline_gf_interval + 1) * 200);
-
-    // Calculate the extra bits to be used for boosted frame(s)
-    gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
-                                       boost, gf_group_bits);
-  }
+  gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+                                     rc->gfu_boost, gf_group_bits);
 
   // Adjust KF group bits and error remaining.
   twopass->kf_group_error_left -= (int64_t)gf_group_err;
@@ -2380,7 +2363,11 @@
                                                 section_target_bandwidth);
     twopass->active_worst_quality = tmp_q;
     rc->ni_av_qi = tmp_q;
+    rc->last_q[INTER_FRAME] = tmp_q;
     rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth);
+    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
   }
   vp9_zero(this_frame);
   if (EOF == input_stats(twopass, &this_frame))
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index b74b2dd..9d20bae 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -460,11 +460,8 @@
 // this needs various further optimizations. to be continued..
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                          const TileInfo *const tile,
-                         int mi_row, int mi_col,
-                         int *returnrate,
-                         int64_t *returndistortion,
-                         BLOCK_SIZE bsize,
-                         PICK_MODE_CONTEXT *ctx) {
+                         int mi_row, int mi_col, RD_COST *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
@@ -478,11 +475,8 @@
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
-  int64_t best_rd = INT64_MAX;
-  int64_t this_rd = INT64_MAX;
+  RD_COST this_rdc, best_rdc;
   uint8_t skip_txfm = 0;
-  int rate = INT_MAX;
-  int64_t dist = INT64_MAX;
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
   unsigned int sse_y = UINT_MAX;
@@ -543,8 +537,9 @@
   x->skip = 0;
 
   // initialize mode decisions
-  *returnrate = INT_MAX;
-  *returndistortion = INT64_MAX;
+  vp9_rd_cost_reset(&best_rdc);
+  vp9_rd_cost_reset(&this_rdc);
+  vp9_rd_cost_reset(rd_cost);
   vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
   mbmi->sb_type = bsize;
   mbmi->ref_frame[0] = NONE;
@@ -614,17 +609,17 @@
       mode_rd_thresh =
           rd_threshes[mode_idx[ref_frame -
                                LAST_FRAME][INTER_OFFSET(this_mode)]];
-      if (rd_less_than_thresh(best_rd, mode_rd_thresh,
+      if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
                               rd_thresh_freq_fact[this_mode]))
         continue;
 
       if (this_mode == NEWMV) {
         if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
-            this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
+            this_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
         if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
                                     &frame_mv[NEWMV][ref_frame],
-                                    &rate_mv, best_rd))
+                                    &rate_mv, best_rdc.rdcost))
           continue;
       }
 
@@ -697,30 +692,34 @@
 
         mbmi->interp_filter = best_filter;
         mbmi->tx_size = pf_tx_size[mbmi->interp_filter];
-        rate = pf_rate[mbmi->interp_filter];
-        dist = pf_dist[mbmi->interp_filter];
+        this_rdc.rate = pf_rate[mbmi->interp_filter];
+        this_rdc.dist = pf_dist[mbmi->interp_filter];
         var_y = pf_var[mbmi->interp_filter];
         sse_y = pf_sse[mbmi->interp_filter];
         x->skip_txfm[0] = skip_txfm;
       } else {
         mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref;
         vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-        model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                          &var_y, &sse_y);
       }
 
-      rate += rate_mv;
-      rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+      this_rdc.rate += rate_mv;
+      this_rdc.rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
                                   [INTER_OFFSET(this_mode)];
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                               this_rdc.rate, this_rdc.dist);
 
       // Skipping checking: test to see if this block can be reconstructed by
       // prediction only.
       if (cpi->allow_encode_breakout) {
         encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame,
-                             this_mode, var_y, sse_y, yv12_mb, &rate, &dist);
+                             this_mode, var_y, sse_y, yv12_mb,
+                             &this_rdc.rate, &this_rdc.dist);
         if (x->skip) {
-          rate += rate_mv;
-          this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+          this_rdc.rate += rate_mv;
+          this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                   this_rdc.rate, this_rdc.dist);
         }
       }
 
@@ -732,10 +731,8 @@
       (void)ctx;
 #endif
 
-      if (this_rd < best_rd || x->skip) {
-        best_rd = this_rd;
-        *returnrate = rate;
-        *returndistortion = dist;
+      if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
+        best_rdc = this_rdc;
         best_mode = this_mode;
         best_pred_filter = mbmi->interp_filter;
         best_tx_size = mbmi->tx_size;
@@ -757,7 +754,7 @@
     }
     // If the current reference frame is valid and we found a usable mode,
     // we are done.
-    if (best_rd < INT64_MAX)
+    if (best_rdc.rdcost < INT64_MAX)
       break;
   }
 
@@ -790,7 +787,7 @@
 
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
-  if (!x->skip && best_rd > inter_mode_thresh &&
+  if (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
       bsize <= cpi->sf.max_intra_bsize) {
     PREDICTION_MODE this_mode;
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
@@ -812,16 +809,15 @@
       vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                              estimate_block_intra, &args);
       mbmi->tx_size = saved_tx_size;
-      rate = args.rate;
-      dist = args.dist;
-      rate += cpi->mbmode_cost[this_mode];
-      rate += intra_cost_penalty;
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      this_rdc.rate = args.rate;
+      this_rdc.dist = args.dist;
+      this_rdc.rate += cpi->mbmode_cost[this_mode];
+      this_rdc.rate += intra_cost_penalty;
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                               this_rdc.rate, this_rdc.dist);
 
-      if (this_rd + intra_mode_cost < best_rd) {
-        best_rd = this_rd;
-        *returnrate = rate;
-        *returndistortion = dist;
+      if (this_rdc.rdcost + intra_mode_cost < best_rdc.rdcost) {
+        best_rdc = this_rdc;
         mbmi->mode = this_mode;
         mbmi->tx_size = intra_tx_size;
         mbmi->ref_frame[0] = INTRA_FRAME;
@@ -834,4 +830,6 @@
     if (cpi->sf.reuse_inter_pred_sby)
       pd->dst = orig_dst;
   }
+
+  *rd_cost = best_rdc;
 }
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index 97aeca7..27a319d 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -19,9 +19,7 @@
 
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                          const struct TileInfo *const tile,
-                         int mi_row, int mi_col,
-                         int *returnrate,
-                         int64_t *returndistortion,
+                         int mi_row, int mi_col, RD_COST *rd_cost,
                          BLOCK_SIZE bsize,
                          PICK_MODE_CONTEXT *ctx);
 
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index bec77d7..e148bf9 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -142,8 +142,6 @@
 
   if (speed >= 5) {
     int i;
-
-    sf->partition_search_type = FIXED_PARTITION;
     sf->optimize_coefficients = 0;
     sf->mv.search_method = HEX;
     sf->disable_filter_search_var_thresh = 500;
@@ -151,8 +149,7 @@
       sf->intra_y_mode_mask[i] = INTRA_DC;
       sf->intra_uv_mode_mask[i] = INTRA_DC;
     }
-  }
-  if (speed >= 6) {
+    sf->partition_search_breakout_rate_thr = 500;
     sf->mv.reduce_first_step_size = 1;
   }
 }
@@ -205,7 +202,6 @@
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
     sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -217,8 +213,6 @@
   if (speed >= 3) {
     sf->use_square_partition_only = 1;
     sf->disable_filter_search_var_thresh = 100;
-    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-    sf->constrain_copy_partition = 1;
     sf->use_uv_intra_rd_estimate = 1;
     sf->skip_encode_sb = 1;
     sf->mv.subpel_iters_per_step = 1;
@@ -275,6 +269,15 @@
     sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
     sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
     sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+
+    // This feature is only enabled when partition search is disabled.
+    sf->reuse_inter_pred_sby = 1;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->partition_search_breakout_dist_thr = (1 << 25);
+    else
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+    sf->partition_search_breakout_rate_thr = 200;
   }
 
   if (speed >= 6) {
@@ -292,9 +295,6 @@
 
     sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
 
-    // This feature is only enabled when partition search is disabled.
-    sf->reuse_inter_pred_sby = 1;
-
     // Increase mode checking threshold for NEWMV.
     sf->elevate_newmv_thresh = 1000;
 
@@ -342,7 +342,6 @@
   sf->mv.fullpel_search_step_param = 6;
   sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   sf->adaptive_rd_thresh = 0;
-  sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
   sf->tx_size_search_method = USE_FULL_RD;
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
@@ -362,7 +361,6 @@
   sf->min_partition_size = BLOCK_4X4;
   sf->adjust_partitioning_from_last_frame = 0;
   sf->last_partitioning_redo_frequency = 4;
-  sf->constrain_copy_partition = 0;
   sf->disable_split_mask = 0;
   sf->mode_search_skip_flags = 0;
   sf->force_frame_boost = 0;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index cc6c2e5..1712f87 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -93,12 +93,6 @@
 } MOTION_THRESHOLD;
 
 typedef enum {
-  LAST_FRAME_PARTITION_OFF = 0,
-  LAST_FRAME_PARTITION_LOW_MOTION = 1,
-  LAST_FRAME_PARTITION_ALL = 2
-} LAST_FRAME_PARTITION_METHOD;
-
-typedef enum {
   USE_FULL_RD = 0,
   USE_LARGESTALL,
   USE_TX_8X8
@@ -149,16 +143,12 @@
 
 typedef enum {
   // Search partitions using RD/NONRD criterion
-  SEARCH_PARTITION = 0,
+  SEARCH_PARTITION,
 
   // Always use a fixed size partition
-  FIXED_PARTITION = 1,
+  FIXED_PARTITION,
 
-  // Use a fixed size partition in every 64X64 SB, where the size is
-  // determined based on source variance
-  VAR_BASED_FIXED_PARTITION = 2,
-
-  REFERENCE_PARTITION = 3,
+  REFERENCE_PARTITION,
 
   // Use an arbitrary partitioning scheme based on source variance within
   // a 64X64 SB
@@ -246,15 +236,6 @@
   // level within a frame.
   int allow_skip_recode;
 
-  // This variable allows us to reuse the last frames partition choices
-  // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
-  // frame as a starting point in low motion scenes or always use it. If set
-  // we use last partitioning_redo frequency to determine how often to redo
-  // the partitioning from scratch. Adjust_partitioning_from_last_frame
-  // enables us to adjust up or down one partitioning from the last frames
-  // partitioning.
-  LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
-
   // The threshold is to determine how slow the motino is, it is used when
   // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
   MOTION_THRESHOLD lf_motion_threshold;
@@ -268,8 +249,6 @@
   // precise but significantly faster than the non lp version.
   int use_lp32x32fdct;
 
-  // TODO(JBB): remove this as its no longer used.
-
   // After looking at the first set of modes (set by index here), skip
   // checking modes for reference frames that don't match the reference frame
   // of the best so far.
@@ -307,12 +286,6 @@
   // use_lastframe_partitioning is set.
   int last_partitioning_redo_frequency;
 
-  // This enables constrained copy partitioning, which, given an input block
-  // size bsize, will copy previous partition for partitions less than bsize,
-  // otherwise bsize partition is used. bsize is currently set to 16x16.
-  // Used for the case where motion is detected in superblock.
-  int constrain_copy_partition;
-
   // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
   // it always, to allow it for only Last frame and Intra, disable it for all
   // inter modes or to enable it always.
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 1573557..8d3ca0d 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -85,7 +85,7 @@
                                           oxcf->best_allowed_q) / 2;
       lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
                                             oxcf->best_allowed_q) / 2;
-      if (oxcf->ss_play_alternate[layer])
+      if (oxcf->ss_enable_auto_arf[layer])
         lc->alt_ref_idx = alt_ref_idx++;
       else
         lc->alt_ref_idx = -1;
@@ -305,7 +305,7 @@
       cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
     }
   } else {
-    if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]) {
+    if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]) {
       cpi->alt_fb_idx = lc->alt_ref_idx;
       if (!lc->has_alt_frame)
         cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
@@ -317,7 +317,7 @@
         LAYER_CONTEXT *lc_lower =
             &cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1];
 
-        if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id - 1] &&
+        if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id - 1] &&
             lc_lower->alt_ref_source != NULL)
           cpi->alt_fb_idx = lc_lower->alt_ref_idx;
         else if (cpi->svc.spatial_layer_id >= 2)
diff --git a/vp9/encoder/x86/vp9_sad_intrin_avx2.c b/vp9/encoder/x86/vp9_sad_intrin_avx2.c
new file mode 100644
index 0000000..1131930
--- /dev/null
+++ b/vp9/encoder/x86/vp9_sad_intrin_avx2.c
@@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "vpx_ports/mem.h"
+
+#define FSAD64_H(h) /* emits vp9_sad64x<h>_avx2(): SAD of a 64-wide block */ \
+unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
+                                  int src_stride, \
+                                  const uint8_t *ref_ptr, \
+                                  int ref_stride) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  for (i = 0 ; i < h ; i++) { /* one 64-byte row per iteration */ \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, /* bytes 0..31 of the row */ \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, /* bytes 32..63 of the row */ \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref_stride; /* step both pointers to the next row */ \
+    src_ptr+= src_stride; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); /* high qword per lane */ \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); /* upper 128 bits */ \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); /* low 32 bits = final SAD */ \
+  return res; \
+}
+
+#define FSAD32_H(h) /* emits vp9_sad32x<h>_avx2(): SAD of a 32-wide block */ \
+unsigned int vp9_sad32x##h##_avx2(const uint8_t *src_ptr, \
+                                  int src_stride, \
+                                  const uint8_t *ref_ptr, \
+                                  int ref_stride) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  int ref2_stride = ref_stride << 1; /* pointer step covering two rows */ \
+  int src2_stride = src_stride << 1; \
+  int max = h >> 1; /* two rows are processed per iteration */ \
+  for (i = 0 ; i < max ; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, /* current row */ \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, /* following row */ \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref2_stride; \
+    src_ptr+= src2_stride; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); /* high qword per lane */ \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); /* upper 128 bits */ \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); /* low 32 bits = final SAD */ \
+  return res; \
+}
+
+#define FSAD64 /* expands to the 64x64 and 64x32 SAD functions */ \
+FSAD64_H(64); \
+FSAD64_H(32);
+
+#define FSAD32 /* expands to the 32x64, 32x32 and 32x16 SAD functions */ \
+FSAD32_H(64); \
+FSAD32_H(32); \
+FSAD32_H(16);
+
+FSAD64;  /* defines vp9_sad64x64_avx2 and vp9_sad64x32_avx2 */
+FSAD32;  /* defines vp9_sad32x64_avx2, vp9_sad32x32_avx2, vp9_sad32x16_avx2 */
+
+#undef FSAD64  /* the generator macros are not needed past this point */
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+
+#define FSADAVG64_H(h) /* vp9_sad64x<h>_avg_avx2(): SAD vs avg(ref, pred) */ \
+unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
+                                      int src_stride, \
+                                      const uint8_t *ref_ptr, \
+                                      int  ref_stride, \
+                                      const uint8_t *second_pred) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  for (i = 0 ; i < h ; i++) { /* one 64-byte row per iteration */ \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+    ref1_reg = _mm256_avg_epu8(ref1_reg, /* average ref with second_pred */ \
+               _mm256_loadu_si256((__m256i const *)second_pred)); \
+    ref2_reg = _mm256_avg_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(second_pred +32))); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, /* averaged ref vs. source */ \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref_stride; \
+    src_ptr+= src_stride; \
+    second_pred+= 64; /* second_pred is read as consecutive 64-byte rows */ \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); /* high qword per lane */ \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); /* upper 128 bits */ \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); /* low 32 bits = final SAD */ \
+  return res; \
+}
+
+#define FSADAVG32_H(h) /* vp9_sad32x<h>_avg_avx2(): SAD vs avg(ref, pred) */ \
+unsigned int vp9_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
+                                      int src_stride, \
+                                      const uint8_t *ref_ptr, \
+                                      int  ref_stride, \
+                                      const uint8_t *second_pred) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  int ref2_stride = ref_stride << 1; /* pointer step covering two rows */ \
+  int src2_stride = src_stride << 1; \
+  int max = h >> 1; /* two rows are processed per iteration */ \
+  for (i = 0 ; i < max ; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+    ref1_reg = _mm256_avg_epu8(ref1_reg, /* average ref with second_pred */ \
+               _mm256_loadu_si256((__m256i const *)second_pred)); \
+    ref2_reg = _mm256_avg_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(second_pred +32))); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, /* current row */ \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, /* following row */ \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+    sum_sad = _mm256_add_epi32(sum_sad, \
+              _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref2_stride; \
+    src_ptr+= src2_stride; \
+    second_pred+= 64; /* two 32-byte rows of second_pred consumed */ \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); /* high qword per lane */ \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); /* upper 128 bits */ \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); /* low 32 bits = final SAD */ \
+  return res; \
+}
+
+#define FSADAVG64 /* expands to the 64x64 and 64x32 averaging SADs */ \
+FSADAVG64_H(64); \
+FSADAVG64_H(32);
+
+#define FSADAVG32 /* expands to the 32x64, 32x32, 32x16 averaging SADs */ \
+FSADAVG32_H(64); \
+FSADAVG32_H(32); \
+FSADAVG32_H(16);
+
+FSADAVG64;  /* defines vp9_sad64x64_avg_avx2 and vp9_sad64x32_avg_avx2 */
+FSADAVG32;  /* defines vp9_sad32x{64,32,16}_avg_avx2 */
+
+#undef FSADAVG64  /* the generator macros are not needed past this point */
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index d0ca524..adae18b 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -202,7 +202,7 @@
     ERROR("kf_min_dist not supported in auto mode, use 0 "
           "or kf_max_dist instead.");
 
-  RANGE_CHECK_BOOL(extra_cfg,  enable_auto_alt_ref);
+  RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
   RANGE_CHECK(extra_cfg, cpu_used, -16, 16);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
   RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
@@ -412,7 +412,7 @@
 
   oxcf->speed                  =  abs(extra_cfg->cpu_used);
   oxcf->encode_breakout        =  extra_cfg->static_thresh;
-  oxcf->play_alternate         =  extra_cfg->enable_auto_alt_ref;
+  oxcf->enable_auto_arf        =  extra_cfg->enable_auto_alt_ref;
   oxcf->noise_sensitivity      =  extra_cfg->noise_sensitivity;
   oxcf->sharpness              =  extra_cfg->sharpness;
 
@@ -445,13 +445,13 @@
     for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
       oxcf->ss_target_bitrate[i] =  1000 * cfg->ss_target_bitrate[i];
 #if CONFIG_SPATIAL_SVC
-      oxcf->ss_play_alternate[i] =  cfg->ss_enable_auto_alt_ref[i];
+      oxcf->ss_enable_auto_arf[i] =  cfg->ss_enable_auto_alt_ref[i];
 #endif
     }
   } else if (oxcf->ss_number_layers == 1) {
     oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
 #if CONFIG_SPATIAL_SVC
-    oxcf->ss_play_alternate[0] = extra_cfg->enable_auto_alt_ref;
+    oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref;
 #endif
   }
 
@@ -493,7 +493,7 @@
   printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
   printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
   printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
-  printf("play_alternate: %d\n", oxcf->play_alternate);
+  printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf);
   printf("Version: %d\n", oxcf->Version);
   printf("encode_breakout: %d\n", oxcf->encode_breakout);
   printf("error resilient: %d\n", oxcf->error_resilient_mode);
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index ad76722..e72cb00 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -118,6 +118,7 @@
 endif
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad_intrin_avx2.c
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
diff --git a/vpxdec.c b/vpxdec.c
index 2afdb71..c4d2a9e 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -276,7 +276,8 @@
     const int plane = planes[i];
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane);
+    const int w = vpx_img_plane_width(img, plane) *
+                ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
     const int h = vpx_img_plane_height(img, plane);
 
     for (y = 0; y < h; ++y) {