Merge "Speed up rd selection in OBMC experiment" into nextgenv2
diff --git a/configure b/configure
index e7eb152..97366e4 100755
--- a/configure
+++ b/configure
@@ -284,7 +284,6 @@
     ext_partition
     ext_tile
     obmc
-    affine_motion
 "
 CONFIG_LIST="
     dependency_tracking
diff --git a/test/vp10_convolve_test.cc b/test/vp10_convolve_test.cc
index 07b0dda..122a8e5 100644
--- a/test/vp10_convolve_test.cc
+++ b/test/vp10_convolve_test.cc
@@ -5,6 +5,7 @@
 #include "vp10/common/filter.h"
 #include "vp10/common/vp10_convolve.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
 
 using libvpx_test::ACMRandom;
 
@@ -270,4 +271,132 @@
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#define CONVOLVE_SPEED_TEST 0
+#if CONVOLVE_SPEED_TEST
+// Speed test: 100000 calls of the high-bitdepth convolve |func| on
+// block_size-square blocks stepped across a frame_size-square frame.
+#define highbd_convolve_speed(func, block_size, frame_size)                  \
+  TEST(VP10ConvolveTest, func##_speed_##block_size##_##frame_size) {         \
+    ACMRandom rnd(ACMRandom::DeterministicSeed());                           \
+    INTERP_FILTER interp_filter = EIGHTTAP;                                  \
+    InterpFilterParams filter_params =                                       \
+        vp10_get_interp_filter_params(interp_filter);                        \
+    ptrdiff_t filter_size = filter_params.tap;                               \
+    int filter_center = filter_size / 2 - 1;                                 \
+    DECLARE_ALIGNED(16, uint16_t,                                            \
+                    src[(frame_size + 7) * (frame_size + 7)]) = {0};         \
+    int src_stride = frame_size + 7;                                         \
+    DECLARE_ALIGNED(16, uint16_t, dst[frame_size * frame_size]) = {0};       \
+    int dst_stride = frame_size;                                             \
+    int x_step_q4 = 16;                                                      \
+    int y_step_q4 = 16;                                                      \
+    int subpel_x_q4 = 8;                                                     \
+    int subpel_y_q4 = 6;                                                     \
+    int bd = 10;                                                             \
+                                                                             \
+    int w = block_size;                                                      \
+    int h = block_size;                                                      \
+                                                                             \
+    const int16_t* filter_x =                                                \
+        vp10_get_interp_filter_kernel(filter_params, subpel_x_q4);           \
+    const int16_t* filter_y =                                                \
+        vp10_get_interp_filter_kernel(filter_params, subpel_y_q4);           \
+                                                                             \
+    for (int i = 0; i < src_stride * src_stride; i++)                        \
+      src[i] = rnd.Rand16() % (1 << bd);                                     \
+                                                                             \
+    int offset = filter_center * src_stride + filter_center;                 \
+    int row_offset = 0;                                                      \
+    int col_offset = 0;                                                      \
+    for (int i = 0; i < 100000; i++) {                                       \
+      int src_total_offset = offset + col_offset * src_stride + row_offset;  \
+      int dst_total_offset = col_offset * dst_stride + row_offset;           \
+      func(CONVERT_TO_BYTEPTR(src + src_total_offset), src_stride,           \
+           CONVERT_TO_BYTEPTR(dst + dst_total_offset), dst_stride, filter_x, \
+           x_step_q4, filter_y, y_step_q4, w, h, bd);                        \
+      /* Step right one block; at the frame edge wrap to the next row. */    \
+      if (row_offset + w + w < frame_size) {                                 \
+        row_offset += w;                                                     \
+      } else {                                                               \
+        row_offset = 0;                                                      \
+        col_offset += h;                                                     \
+      }                                                                      \
+      if (col_offset + h >= frame_size) col_offset = 0;                      \
+    }                                                                        \
+  }
+
+// Speed test: 100000 calls of |func| on blocks stepped across a frame.
+#define lowbd_convolve_speed(func, block_size, frame_size)                  \
+  TEST(VP10ConvolveTest, func##_speed_l_##block_size##_##frame_size) {      \
+    ACMRandom rnd(ACMRandom::DeterministicSeed());                          \
+    INTERP_FILTER interp_filter = EIGHTTAP;                                 \
+    InterpFilterParams filter_params =                                      \
+        vp10_get_interp_filter_params(interp_filter);                       \
+    ptrdiff_t filter_size = filter_params.tap;                              \
+    int filter_center = filter_size / 2 - 1;                                \
+    DECLARE_ALIGNED(16, uint8_t, src[(frame_size + 7) * (frame_size + 7)]); \
+    int src_stride = frame_size + 7;                                        \
+    DECLARE_ALIGNED(16, uint8_t, dst[frame_size * frame_size]);             \
+    int dst_stride = frame_size;                                            \
+    int x_step_q4 = 16;                                                     \
+    int y_step_q4 = 16;                                                     \
+    int subpel_x_q4 = 8;                                                    \
+    int subpel_y_q4 = 6;                                                    \
+    int bd = 8;                                                             \
+    int w = block_size;                                                     \
+    int h = block_size;                                                     \
+                                                                            \
+    const int16_t* filter_x =                                               \
+        vp10_get_interp_filter_kernel(filter_params, subpel_x_q4);          \
+    const int16_t* filter_y =                                               \
+        vp10_get_interp_filter_kernel(filter_params, subpel_y_q4);          \
+                                                                            \
+    for (int i = 0; i < src_stride * src_stride; i++)                       \
+      src[i] = rnd.Rand16() % (1 << bd);                                    \
+                                                                            \
+    int offset = filter_center * src_stride + filter_center;                \
+    int row_offset = 0;                                                     \
+    int col_offset = 0;                                                     \
+    for (int i = 0; i < 100000; i++) {                                      \
+      int src_total_offset = offset + col_offset * src_stride + row_offset; \
+      int dst_total_offset = col_offset * dst_stride + row_offset;          \
+      func(src + src_total_offset, src_stride, dst + dst_total_offset,      \
+           dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h);     \
+      /* Step right one block; at the frame edge wrap to the next row. */   \
+      if (row_offset + w + w < frame_size) {                                \
+        row_offset += w;                                                    \
+      } else {                                                              \
+        row_offset = 0;                                                     \
+        col_offset += h;                                                    \
+      }                                                                     \
+      if (col_offset + h >= frame_size) col_offset = 0;                     \
+    }                                                                       \
+  }
+
+// This experiment shows that when the frame size is 64x64,
+// vpx_highbd_convolve8_sse2 and vpx_convolve8_sse2 run at similar speeds.
+// However, when the frame size becomes 1024x1024,
+// vpx_highbd_convolve8_sse2 is around 50% slower than vpx_convolve8_sse2.
+// We think the bottleneck is memory I/O.
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 8, 64);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 16, 64);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 32, 64);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 64, 64);
+
+lowbd_convolve_speed(vpx_convolve8_sse2, 8, 64);
+lowbd_convolve_speed(vpx_convolve8_sse2, 16, 64);
+lowbd_convolve_speed(vpx_convolve8_sse2, 32, 64);
+lowbd_convolve_speed(vpx_convolve8_sse2, 64, 64);
+
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 8, 1024);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 16, 1024);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 32, 1024);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 64, 1024);
+
+lowbd_convolve_speed(vpx_convolve8_sse2, 8, 1024);
+lowbd_convolve_speed(vpx_convolve8_sse2, 16, 1024);
+lowbd_convolve_speed(vpx_convolve8_sse2, 32, 1024);
+lowbd_convolve_speed(vpx_convolve8_sse2, 64, 1024);
+#endif  // CONVOLVE_SPEED_TEST
 }  // namespace
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index b939424..d48679e 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -383,7 +383,7 @@
 };
 
 
-const vpx_prob default_uv_palette_mode_prob[2] = {
+const vpx_prob vp10_default_palette_uv_mode_prob[2] = {
     253, 229
 };
 
diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h
index ba36ddb..2443d60 100644
--- a/vp10/common/entropymode.h
+++ b/vp10/common/entropymode.h
@@ -176,6 +176,7 @@
                                         [INTRA_MODES - 1];
 extern const vpx_prob
 vp10_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS];
+extern const vpx_prob vp10_default_palette_uv_mode_prob[2];
 extern const vpx_prob
 vp10_default_palette_y_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1];
 extern const vpx_prob
diff --git a/vp10/common/restoration.h b/vp10/common/restoration.h
index 43b140e..980fe72 100644
--- a/vp10/common/restoration.h
+++ b/vp10/common/restoration.h
@@ -42,6 +42,9 @@
 #define WIENER_FILT_TAP1_BITS     4
 #define WIENER_FILT_TAP2_BITS     5
 
+#define WIENER_FILT_BITS \
+  ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2)
+
 #define WIENER_FILT_TAP0_MAXV \
   (WIENER_FILT_TAP0_MINV -1 + (1 << WIENER_FILT_TAP0_BITS))
 #define WIENER_FILT_TAP1_MAXV \
diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index e26b40d..6dc5604 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c
@@ -19,6 +19,7 @@
   7, 14, 11, 15,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
   0, 4, 8, 12,
   1, 5, 9, 13,
@@ -32,6 +33,7 @@
   8, 9, 10, 11,
   12, 13, 14, 15,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = {
   0,  4,  8,  1,
@@ -58,6 +60,7 @@
   46, 39, 61, 54, 47, 62, 55, 63,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = {
   0,   8,  16,  24,  32,  40,  48,  56,
   1,   9,  17,  25,  33,  41,  49,  57,
@@ -79,6 +82,7 @@
   48,  49,  50,  51,  52,  53,  54,  55,
   56,  57,  58,  59,  60,  61,  62,  63,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = {
   0,  8, 16,  1, 24,  9, 32, 17,
@@ -123,6 +127,7 @@
   255,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = {
   0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
   1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
@@ -169,6 +174,7 @@
   240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
   253, 254, 255,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = {
   0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
@@ -213,6 +219,7 @@
   255,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = {
   0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
   512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960,
@@ -407,6 +414,7 @@
     1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014,
     1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
   0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
@@ -789,6 +797,7 @@
   3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
   0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1, 1, 2, 5, 6, 9, 10, 13,
@@ -800,6 +809,7 @@
   0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, 4, 5, 8, 6, 9, 7, 10, 8,
   8, 9, 12, 10, 13, 11, 14, 0, 0,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
@@ -824,6 +834,7 @@
   47, 47, 55, 55, 0, 0,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
   0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1, 8, 9,
@@ -845,6 +856,7 @@
   41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56,
   50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t,
                 row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
@@ -868,6 +880,7 @@
   46, 54, 61, 47, 54, 55, 62, 0, 0,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
   0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, 96,
@@ -975,6 +988,7 @@
   253, 239, 254,
   0, 0,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
@@ -1087,6 +1101,7 @@
   238, 239, 254, 0, 0,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
   0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, 192,
@@ -1474,6 +1489,7 @@
   987, 1018, 988, 1019, 989, 1020, 990, 1021, 991, 1022,
   0, 0,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
@@ -2143,6 +2159,7 @@
   0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_4x4[16]) = {
   0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
 };
@@ -2150,6 +2167,7 @@
 DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_4x4[16]) = {
   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_4x4[16]) = {
   0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
@@ -2159,6 +2177,7 @@
   0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_8x8[64]) = {
   0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, 2, 10,
   18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20,
@@ -2172,6 +2191,7 @@
   37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
   54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_8x8[64]) = {
   0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
@@ -2194,6 +2214,7 @@
   25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_16x16[256]) = {
   0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
   1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
@@ -2240,6 +2261,7 @@
   240, 241, 242, 243, 244, 245, 246, 247,
   248, 249, 250, 251, 252, 253, 254, 255,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_16x16[256]) = {
   0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
@@ -2300,6 +2322,7 @@
   249, 253, 255,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_32x32[1024]) = {
   0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
   448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832,
@@ -2497,6 +2520,7 @@
   1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019,
   1020, 1021, 1022, 1023,
 };
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_32x32[1024]) = {
   0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, 170, 193, 204,
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 320e66e..ce6317c 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -1978,6 +1978,10 @@
     }
     if (!is_inter_block(mbmi)) {
       int plane;
+      for (plane = 0; plane <= 1; ++plane) {
+        if (mbmi->palette_mode_info.palette_size[plane])
+          vp10_decode_palette_tokens(xd, plane, r);
+      }
       for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
         const struct macroblockd_plane *const pd = &xd->plane[plane];
         const TX_SIZE tx_size =
@@ -1994,9 +1998,6 @@
             (xd->mb_to_bottom_edge >= 0 ?
              0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
-        if (plane <= 1 && mbmi->palette_mode_info.palette_size[plane])
-          vp10_decode_palette_tokens(xd, plane, r);
-
         for (row = 0; row < max_blocks_high; row += step)
           for (col = 0; col < max_blocks_wide; col += step)
             predict_and_reconstruct_intra_block(xd,
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index 4cd6d1d..eb336be 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -419,30 +419,48 @@
                                    vpx_reader *r) {
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const MODE_INFO *above_mi = xd->above_mi;
-  const MODE_INFO *left_mi  = xd->left_mi;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi  = xd->left_mi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  int i, palette_ctx = 0;
+  int i, n, palette_ctx = 0;
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
 
-  if (above_mi)
-    palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  if (left_mi)
-    palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  if (vpx_read(r, vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
-                                                   [palette_ctx])) {
-    int n;
-    PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-
-    pmi->palette_size[0] =
+  if (mbmi->mode == DC_PRED) {
+    if (above_mi)
+      palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    if (left_mi)
+      palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    if (vpx_read(r, vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                     [palette_ctx])) {
+      pmi->palette_size[0] =
         vpx_read_tree(r, vp10_palette_size_tree,
                       vp10_default_palette_y_size_prob[bsize - BLOCK_8X8]) + 2;
-    n = pmi->palette_size[0];
+      n = pmi->palette_size[0];
+      for (i = 0; i < n; ++i)
+        pmi->palette_colors[i] = vpx_read_literal(r, cm->bit_depth);
 
-    for (i = 0; i < n; ++i)
-      pmi->palette_colors[i] = vpx_read_literal(r, cm->bit_depth);
+      xd->plane[0].color_index_map[0] = read_uniform(r, n);
+      assert(xd->plane[0].color_index_map[0] < n);
+    }
+  }
 
-    xd->plane[0].color_index_map[0] = read_uniform(r, n);
-    assert(xd->plane[0].color_index_map[0] < n);
+  if (mbmi->uv_mode == DC_PRED) {
+    if (vpx_read(r,
+                 vp10_default_palette_uv_mode_prob[pmi->palette_size[0] > 0])) {
+      pmi->palette_size[1] =
+          vpx_read_tree(r, vp10_palette_size_tree,
+                        vp10_default_palette_uv_size_prob[bsize - BLOCK_8X8])
+                        + 2;
+      n = pmi->palette_size[1];
+      for (i = 0; i < n; ++i) {
+        pmi->palette_colors[PALETTE_MAX_SIZE + i] =
+            vpx_read_literal(r, cm->bit_depth);
+        pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
+            vpx_read_literal(r, cm->bit_depth);
+      }
+      xd->plane[1].color_index_map[0] = read_uniform(r, n);
+      assert(xd->plane[1].color_index_map[0] < n);
+    }
   }
 }
 
@@ -456,7 +474,8 @@
 #if !ALLOW_FILTER_INTRA_MODES
   return;
 #endif
-  if (mbmi->mode == DC_PRED) {
+  if (mbmi->mode == DC_PRED &&
+      mbmi->palette_mode_info.palette_size[0] == 0) {
     mbmi->ext_intra_mode_info.use_ext_intra_mode[0] =
         vpx_read(r, cm->fc->ext_intra_probs[0]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
@@ -466,7 +485,8 @@
     if (counts)
       ++counts->ext_intra[0][mbmi->ext_intra_mode_info.use_ext_intra_mode[0]];
   }
-  if (mbmi->uv_mode == DC_PRED) {
+  if (mbmi->uv_mode == DC_PRED &&
+      mbmi->palette_mode_info.palette_size[1] == 0) {
     mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
         vpx_read(r, cm->fc->ext_intra_probs[1]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1]) {
@@ -555,8 +575,7 @@
 
   mbmi->palette_mode_info.palette_size[0] = 0;
   mbmi->palette_mode_info.palette_size[1] = 0;
-  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools &&
-      mbmi->mode == DC_PRED)
+  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
     read_palette_mode_info(cm, xd, r);
 
   if (!FIXED_TX_TYPE) {
@@ -868,9 +887,10 @@
     mbmi->angle_delta[1] =
         read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
 #endif  // CONFIG_EXT_INTRA
-
   mbmi->palette_mode_info.palette_size[0] = 0;
   mbmi->palette_mode_info.palette_size[1] = 0;
+  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+    read_palette_mode_info(cm, xd, r);
 #if CONFIG_EXT_INTRA
   mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
   mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
@@ -1151,6 +1171,9 @@
   int16_t mode_ctx = 0;
   MV_REFERENCE_FRAME ref_frame;
 
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
   is_compound = has_second_ref(mbmi);
 
diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index c5dec87..c1153d2 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c
@@ -429,13 +429,15 @@
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
-  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
-  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+      (xd->plane[plane != 0].subsampling_y);
+  const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+      (xd->plane[plane != 0].subsampling_x);
   int color_idx, color_ctx, color_order[PALETTE_MAX_SIZE];
   int n = mbmi->palette_mode_info.palette_size[plane != 0];
   int i, j;
-  uint8_t *color_map = xd->plane[plane].color_index_map;
-  const vpx_prob (* prob)[PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] =
+  uint8_t *color_map = xd->plane[plane != 0].color_index_map;
+  const vpx_prob (* const prob)[PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] =
       plane ? vp10_default_palette_uv_color_prob :
           vp10_default_palette_y_color_prob;
 
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 24a9366..1ef2ea5 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -509,13 +509,11 @@
 #endif  // CONFIG_EXT_TX
 
 static void pack_palette_tokens(vpx_writer *w, TOKENEXTRA **tp,
-                                BLOCK_SIZE bsize, int n) {
-  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
-  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+                                int n, int num) {
   int i;
   TOKENEXTRA *p = *tp;
 
-  for (i = 0; i < rows * cols -1; ++i) {
+  for (i = 0; i < num; ++i) {
     vp10_write_token(w, vp10_palette_color_tree[n - 2], p->context_tree,
                      &palette_color_encodings[n - 2][p->token]);
     ++p;
@@ -842,7 +840,8 @@
 #if !ALLOW_FILTER_INTRA_MODES
   return;
 #endif
-  if (mbmi->mode == DC_PRED) {
+  if (mbmi->mode == DC_PRED &&
+      mbmi->palette_mode_info.palette_size[0] == 0) {
     vpx_write(w, mbmi->ext_intra_mode_info.use_ext_intra_mode[0],
               cm->fc->ext_intra_probs[0]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
@@ -850,7 +849,8 @@
       write_uniform(w, FILTER_INTRA_MODES, mode);
     }
   }
-  if (mbmi->uv_mode == DC_PRED) {
+  if (mbmi->uv_mode == DC_PRED &&
+      mbmi->palette_mode_info.palette_size[1] == 0) {
     vpx_write(w, mbmi->ext_intra_mode_info.use_ext_intra_mode[1],
               cm->fc->ext_intra_probs[1]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1]) {
@@ -881,6 +881,55 @@
   }
 }
 
+static void write_palette_mode_info(const VP10_COMMON *cm,
+                                    const MACROBLOCKD *xd,
+                                    const MODE_INFO *const mi,
+                                    vpx_writer *w) {
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  int palette_ctx = 0;  // Number of above/left neighbors using a Y palette.
+  int n, i;  // n: palette size currently being written (0 = no palette).
+
+  if (mbmi->mode == DC_PRED) {  // Y palette: signalled only with DC_PRED.
+    n = pmi->palette_size[0];
+    if (above_mi)
+      palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    if (left_mi)
+      palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    vpx_write(w, n > 0,  // Flag: is a Y palette in use?
+              vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx]);
+    if (n > 0) {
+      vp10_write_token(w, vp10_palette_size_tree,
+                       vp10_default_palette_y_size_prob[bsize - BLOCK_8X8],
+                       &palette_size_encodings[n - 2]);
+      for (i = 0; i < n; ++i)  // One bit_depth-wide literal per color.
+        vpx_write_literal(w, pmi->palette_colors[i], cm->bit_depth);
+      write_uniform(w, n, pmi->palette_first_color_idx[0]);
+    }
+  }
+
+  if (mbmi->uv_mode == DC_PRED) {  // UV palette: only with uv DC_PRED.
+    n = pmi->palette_size[1];
+    vpx_write(w, n > 0,  // Flag; context = whether a Y palette is in use.
+              vp10_default_palette_uv_mode_prob[pmi->palette_size[0] > 0]);
+    if (n > 0) {
+      vp10_write_token(w, vp10_palette_size_tree,
+                       vp10_default_palette_uv_size_prob[bsize - BLOCK_8X8],
+                       &palette_size_encodings[n - 2]);
+      for (i = 0; i < n; ++i) {  // Two literals per entry (presumably U, V).
+        vpx_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + i],
+                          cm->bit_depth);
+        vpx_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + i],
+                          cm->bit_depth);
+      }
+      write_uniform(w, n, pmi->palette_first_color_idx[1]);
+    }
+  }
+}
+
 static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi,
 #if CONFIG_SUPERTX
                                 int supertx_enabled,
@@ -995,7 +1044,10 @@
         bsize >= BLOCK_8X8)
       write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
                     MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
-
+#endif  // CONFIG_EXT_INTRA
+    if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+      write_palette_mode_info(cm, xd, mi, w);
+#if CONFIG_EXT_INTRA
     if (bsize >= BLOCK_8X8)
       write_ext_intra_mode_info(cm, mbmi, w);
 #endif  // CONFIG_EXT_INTRA
@@ -1263,36 +1315,6 @@
     }
 }
 
-static void write_palette_mode_info(const VP10_COMMON *cm,
-                                    const MACROBLOCKD *xd,
-                                    const MODE_INFO *const mi,
-                                    vpx_writer *w) {
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const MODE_INFO *const above_mi = xd->above_mi;
-  const MODE_INFO *const left_mi = xd->left_mi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-  int palette_ctx = 0;
-  int n, i;
-
-  n = pmi->palette_size[0];
-  if (above_mi)
-    palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  if (left_mi)
-    palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  vpx_write(w, n > 0,
-            vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx]);
-  if (n > 0) {
-    vp10_write_token(w, vp10_palette_size_tree,
-                     vp10_default_palette_y_size_prob[bsize - BLOCK_8X8],
-                     &palette_size_encodings[n - 2]);
-    for (i = 0; i < n; ++i)
-      vpx_write_literal(w, pmi->palette_colors[i],
-                        cm->bit_depth);
-    write_uniform(w, n, pmi->palette_first_color_idx[0]);
-  }
-}
-
 static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
                               MODE_INFO **mi_8x8, vpx_writer *w) {
   const struct segmentation *const seg = &cm->seg;
@@ -1351,9 +1373,7 @@
     write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
                   MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
 #endif  // CONFIG_EXT_INTRA
-
-  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools &&
-      mbmi->mode == DC_PRED)
+  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
     write_palette_mode_info(cm, xd, mi, w);
 
   if (!FIXED_TX_TYPE) {
@@ -1428,11 +1448,17 @@
                         w);
   }
 
-  if (m->mbmi.palette_mode_info.palette_size[0] > 0) {
-    assert(*tok < tok_end);
-    pack_palette_tokens(w, tok, m->mbmi.sb_type,
-                        m->mbmi.palette_mode_info.palette_size[0]);
-    assert(*tok < tok_end);
+  for (plane = 0; plane <= 1; ++plane) {
+    if (m->mbmi.palette_mode_info.palette_size[plane] > 0) {
+      const int rows = (4 * num_4x4_blocks_high_lookup[m->mbmi.sb_type]) >>
+          (xd->plane[plane].subsampling_y);
+      const int cols = (4 * num_4x4_blocks_wide_lookup[m->mbmi.sb_type]) >>
+          (xd->plane[plane].subsampling_x);
+      assert(*tok < tok_end);
+      pack_palette_tokens(w, tok, m->mbmi.palette_mode_info.palette_size[plane],
+                          rows * cols - 1);
+      assert(*tok < tok_end);
+    }
   }
 
 #if CONFIG_SUPERTX
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 7f55295..c5a68a9 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -4432,9 +4432,11 @@
 #if CONFIG_EXT_INTRA
     if (output_enabled && bsize >= BLOCK_8X8) {
       FRAME_COUNTS *counts = td->counts;
-      if (mbmi->mode == DC_PRED)
+      if (mbmi->mode == DC_PRED &&
+          mbmi->palette_mode_info.palette_size[0] == 0)
         ++counts->ext_intra[0][mbmi->ext_intra_mode_info.use_ext_intra_mode[0]];
-      if (mbmi->uv_mode == DC_PRED)
+      if (mbmi->uv_mode == DC_PRED &&
+          mbmi->palette_mode_info.palette_size[1] == 0)
         ++counts->ext_intra[1][mbmi->ext_intra_mode_info.use_ext_intra_mode[1]];
       if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) {
         int p_angle;
@@ -4448,12 +4450,14 @@
 #endif  // CONFIG_EXT_INTRA
 
     if (bsize >= BLOCK_8X8 && output_enabled) {
-      if (mbmi->palette_mode_info.palette_size[0] > 0) {
-        mbmi->palette_mode_info.palette_first_color_idx[0] =
-            xd->plane[0].color_index_map[0];
-        // TODO(huisu): this increases the use of token buffer. Needs stretch
-        // test to verify.
-        vp10_tokenize_palette_sb(td, bsize, 0, t);
+      for (plane = 0; plane <= 1; ++plane) {
+        if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+          mbmi->palette_mode_info.palette_first_color_idx[plane] =
+              xd->plane[plane].color_index_map[0];
+          // TODO(huisu): this increases the use of token buffer. Needs stretch
+          // test to verify.
+          vp10_tokenize_palette_sb(td, bsize, plane, t);
+        }
       }
     }
     vp10_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 34dd8d5..fc65e72 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -358,9 +358,7 @@
 
 static void dealloc_compressor_data(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
-#if CONFIG_REF_MV
   int i;
-#endif
 
   vpx_free(cpi->mbmi_ext_base);
   cpi->mbmi_ext_base = NULL;
@@ -413,14 +411,9 @@
   vpx_free(cpi->active_map.map);
   cpi->active_map.map = NULL;
 
-#if CONFIG_AFFINE_MOTION
-  {
-    // Free up-sampled reference buffers.
-    int i;
-    for (i = 0; i < MAX_REF_FRAMES; i++)
-      vpx_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
-  }
-#endif
+  // Free up-sampled reference buffers.
+  for (i = 0; i < MAX_REF_FRAMES; i++)
+    vpx_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
 
   vp10_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
@@ -756,26 +749,6 @@
                                NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
-
-#if CONFIG_AFFINE_MOTION
-  {
-    // Allocate up-sampled reference buffers.
-    int i;
-
-    for (i = 0; i < MAX_REF_FRAMES; i++)
-      if (vpx_realloc_frame_buffer(&cpi->upsampled_ref_bufs[i].buf,
-                                   (cm->width << 3), (cm->height << 3),
-                                   cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                   cm->use_highbitdepth,
-#endif
-                                   (VP9_ENC_BORDER_IN_PIXELS << 3),
-                                   cm->byte_alignment,
-                                   NULL, NULL, NULL))
-        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-            "Failed to allocate up-sampled reference frame buffer");
-  }
-#endif
 }
 
 
@@ -2069,6 +2042,14 @@
   } while (++i <= MV_MAX);
 }
 
+static INLINE void init_upsampled_ref_frame_bufs(VP10_COMP *cpi) {
+  int i;
+  // Zero every buffer's ref count and invalidate every index entry.
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    cpi->upsampled_ref_bufs[i].ref_count = 0;
+    cpi->upsampled_ref_idx[i] = INVALID_IDX;
+  }
+}
 
 VP10_COMP *vp10_create_compressor(VP10EncoderConfig *oxcf,
                                 BufferPool *const pool) {
@@ -2267,6 +2248,8 @@
     vp10_init_second_pass(cpi);
   }
 
+  init_upsampled_ref_frame_bufs(cpi);
+
   vp10_set_speed_features_framesize_independent(cpi);
   vp10_set_speed_features_framesize_dependent(cpi);
 
@@ -2929,7 +2912,6 @@
   return force_recode;
 }
 
-#if CONFIG_AFFINE_MOTION
 static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) {
   int i;
 
@@ -2941,50 +2923,59 @@
   return INVALID_IDX;
 }
 
-// Up-sample reference frames.
-static INLINE int upsample_ref_frame(RefCntBuffer *bufs,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                     EncRefCntBuffer *ubufs, int new_idx,
-                                     int bit_depth) {
-#else
-                                     EncRefCntBuffer *ubufs, int new_idx) {
-#endif
+// Up-sample 1 reference frame.
+static INLINE int upsample_ref_frame(VP10_COMP *cpi,
+                                     const YV12_BUFFER_CONFIG *const ref) {
+  VP10_COMMON * const cm = &cpi->common;
+  EncRefCntBuffer *ubufs = cpi->upsampled_ref_bufs;
   int new_uidx = get_free_upsampled_ref_buf(ubufs);
 
   if (new_uidx == INVALID_IDX) {
     return INVALID_IDX;
   } else {
-    const YV12_BUFFER_CONFIG *const ref = &bufs[new_idx].buf;
     YV12_BUFFER_CONFIG *upsampled_ref = &ubufs[new_uidx].buf;
 
+    // Note: a Y-plane-only allocation would suffice (U and V are unused).
+    if (upsampled_ref->buffer_alloc_sz < (ref->buffer_alloc_sz << 6))
+      if (vpx_realloc_frame_buffer(upsampled_ref,
+                                   (cm->width << 3), (cm->height << 3),
+                                   cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                   cm->use_highbitdepth,
+#endif
+                                   (VP9_ENC_BORDER_IN_PIXELS << 3),
+                                   cm->byte_alignment,
+                                   NULL, NULL, NULL))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate up-sampled frame buffer");
+
     // Currently, only Y plane is up-sampled, U, V are not used.
 #if CONFIG_VP9_HIGHBITDEPTH
-    scale_and_extend_frame(ref, upsampled_ref, 1, bit_depth);
+    scale_and_extend_frame(ref, upsampled_ref, 1, (int)cm->bit_depth);
 #else
     scale_and_extend_frame(ref, upsampled_ref, 1);
 #endif
     return new_uidx;
   }
 }
-#endif
 
 void vp10_update_reference_frames(VP10_COMP *cpi) {
   VP10_COMMON * const cm = &cpi->common;
   BufferPool *const pool = cm->buffer_pool;
+  const int use_upsampled_ref = cpi->sf.use_upsampled_references;
+  int new_uidx = 0;
+
 #if CONFIG_EXT_REFS
   int ref_frame;
 #endif  // CONFIG_EXT_REFS
 
-#if CONFIG_AFFINE_MOTION
-  // Always up-sample the current encoded frame.
-#if CONFIG_VP9_HIGHBITDEPTH
-  int new_uidx = upsample_ref_frame(pool->frame_bufs, cpi->upsampled_ref_bufs,
-                                    cm->new_fb_idx, (int)cm->bit_depth);
-#else
-  int new_uidx = upsample_ref_frame(pool->frame_bufs, cpi->upsampled_ref_bufs,
-                                    cm->new_fb_idx);
-#endif
-#endif
+  if (use_upsampled_ref) {
+    // Up-sample the current encoded frame.
+    RefCntBuffer *bufs = pool->frame_bufs;
+    const YV12_BUFFER_CONFIG *const ref = &bufs[cm->new_fb_idx].buf;
+
+    new_uidx = upsample_ref_frame(cpi, ref);
+  }
 
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
@@ -2994,12 +2985,12 @@
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
 
-#if CONFIG_AFFINE_MOTION
-    uref_cnt_fb(cpi->upsampled_ref_bufs,
-                &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
-    uref_cnt_fb(cpi->upsampled_ref_bufs,
-                &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
-#endif
+    if (use_upsampled_ref) {
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+    }
   } else if (vp10_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term in function
@@ -3013,10 +3004,10 @@
 
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
-#if CONFIG_AFFINE_MOTION
-    uref_cnt_fb(cpi->upsampled_ref_bufs,
-                &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
-#endif
+    if (use_upsampled_ref)
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
@@ -3030,10 +3021,10 @@
 
       ref_cnt_fb(pool->frame_bufs,
                  &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
-#if CONFIG_AFFINE_MOTION
-      uref_cnt_fb(cpi->upsampled_ref_bufs,
-                  &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
-#endif
+      if (use_upsampled_ref)
+        uref_cnt_fb(cpi->upsampled_ref_bufs,
+                    &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+
       memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
@@ -3042,10 +3033,10 @@
     if (cpi->refresh_golden_frame) {
       ref_cnt_fb(pool->frame_bufs,
                  &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
-#if CONFIG_AFFINE_MOTION
-      uref_cnt_fb(cpi->upsampled_ref_bufs,
-                  &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
-#endif
+      if (use_upsampled_ref)
+        uref_cnt_fb(cpi->upsampled_ref_bufs,
+                    &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+
       if (!cpi->rc.is_src_frame_alt_ref)
         memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
                cpi->interp_filter_selected[0],
@@ -3080,10 +3071,10 @@
   if (cpi->refresh_last_frame) {
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
-#if CONFIG_AFFINE_MOTION
-    uref_cnt_fb(cpi->upsampled_ref_bufs,
-                &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
-#endif
+    if (use_upsampled_ref)
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
+
     if (!cpi->rc.is_src_frame_alt_ref) {
       memcpy(cpi->interp_filter_selected[LAST_FRAME],
              cpi->interp_filter_selected[0],
@@ -3249,8 +3240,9 @@
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if CONFIG_AFFINE_MOTION
-        {
+        if (cpi->sf.use_upsampled_references && (force_scaling ||
+            new_fb_ptr->buf.y_crop_width != cm->width ||
+            new_fb_ptr->buf.y_crop_height != cm->height)) {
           const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
           EncRefCntBuffer *ubuf =
               &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[map_idx]];
@@ -3267,15 +3259,12 @@
             vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate up-sampled frame buffer");
 #if CONFIG_VP9_HIGHBITDEPTH
-          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, MAX_MB_PLANE,
+          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1,
                                  (int)cm->bit_depth);
 #else
-          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, MAX_MB_PLANE);
+          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1);
 #endif
-          cpi->scaled_ref_idx[ref_frame - LAST_FRAME] = new_fb;
-          alloc_frame_mvs(cm, new_fb);
         }
-#endif
       } else {
         const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
         RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
@@ -3610,9 +3599,28 @@
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
 }
 
+static void reset_use_upsampled_references(VP10_COMP *cpi) {
+  MV_REFERENCE_FRAME ref_frame;
+  // Rebuilds the up-sampled copy of each reference, LAST_FRAME..ALTREF_FRAME.
+  // Start by clearing all ref counts and index entries.
+  init_upsampled_ref_frame_bufs(cpi);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi,
+                                                               ref_frame);
+    int new_uidx = upsample_ref_frame(cpi, ref);
+    // NOTE(review): upsample_ref_frame() may return INVALID_IDX; it is not
+    // checked before indexing below -- confirm a free slot always exists.
+    cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)] =
+        new_uidx;  // Record which up-sampled buffer this reference maps to.
+    cpi->upsampled_ref_bufs[new_uidx].ref_count++;
+  }
+}
+
 static void encode_without_recode_loop(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
   int q = 0, bottom_index = 0, top_index = 0;  // Dummy variables.
+  const int use_upsampled_ref = cpi->sf.use_upsampled_references;
 
   vpx_clear_system_state();
 
@@ -3647,6 +3655,12 @@
   set_size_independent_vars(cpi);
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
+  // cpi->sf.use_upsampled_references can differ from frame to frame.
+  // Whenever it changes from 0 to 1, the reference frames for this frame
+  // have to be up-sampled before encoding.
+  if (!use_upsampled_ref && cpi->sf.use_upsampled_references)
+    reset_use_upsampled_references(cpi);
+
   vp10_set_quantizer(cm, q);
   vp10_set_variance_partition_thresholds(cpi, q);
 
@@ -3694,9 +3708,16 @@
   int frame_over_shoot_limit;
   int frame_under_shoot_limit;
   int q = 0, q_low = 0, q_high = 0;
+  const int use_upsampled_ref = cpi->sf.use_upsampled_references;
 
   set_size_independent_vars(cpi);
 
+  // cpi->sf.use_upsampled_references can differ from frame to frame.
+  // Whenever it changes from 0 to 1, the reference frames for this frame
+  // have to be up-sampled before encoding.
+  if (!use_upsampled_ref && cpi->sf.use_upsampled_references)
+    reset_use_upsampled_references(cpi);
+
   do {
     vpx_clear_system_state();
 
@@ -4355,17 +4376,6 @@
   }
 }
 
-#if CONFIG_AFFINE_MOTION
-static INLINE void init_upsampled_ref_frame_bufs(VP10_COMP *cpi) {
-  int i;
-
-  for (i = 0; i < MAX_REF_FRAMES; ++i) {
-    cpi->upsampled_ref_bufs[i].ref_count = 0;
-    cpi->upsampled_ref_idx[i] = INVALID_IDX;
-  }
-}
-#endif
-
 static void check_initial_width(VP10_COMP *cpi,
 #if CONFIG_VP9_HIGHBITDEPTH
                                 int use_highbitdepth,
@@ -4388,9 +4398,7 @@
     alloc_raw_frame_buffers(cpi);
     init_ref_frame_bufs(cm);
     alloc_util_frame_buffers(cpi);
-#if CONFIG_AFFINE_MOTION
-    init_upsampled_ref_frame_bufs(cpi);
-#endif
+
     init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
 
     cpi->initial_width = cm->width;
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index b2c242c..afe3292 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -286,12 +286,10 @@
   double worst;
 } ImageStat;
 
-#if CONFIG_AFFINE_MOTION
 typedef struct {
   int ref_count;
   YV12_BUFFER_CONFIG buf;
 } EncRefCntBuffer;
-#endif
 
 typedef struct VP10_COMP {
   QUANTS quants;
@@ -311,11 +309,9 @@
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
-#if CONFIG_AFFINE_MOTION
   // Up-sampled reference buffers
   EncRefCntBuffer upsampled_ref_bufs[MAX_REF_FRAMES];
   int upsampled_ref_idx[MAX_REF_FRAMES];
-#endif
 
   TileDataEnc *tile_data;
   int allocated_tiles;  // Keep track of memory allocated for tiles.
@@ -702,7 +698,6 @@
 
 #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
 
-#if CONFIG_AFFINE_MOTION
 // Update up-sampled reference frame index.
 static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx,
                                int new_uidx) {
@@ -714,7 +709,6 @@
   *uidx = new_uidx;
   ubufs[new_uidx].ref_count++;
 }
-#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/encoder/mbgraph.c b/vp10/encoder/mbgraph.c
index 32ff0fa..5e66ce5 100644
--- a/vp10/encoder/mbgraph.c
+++ b/vp10/encoder/mbgraph.c
@@ -64,11 +64,7 @@
         &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
         cond_cost_list(cpi, cost_list),
         NULL, NULL,
-#if CONFIG_AFFINE_MOTION
         &distortion, &sse, NULL, 0, 0, 0);
-#else
-        &distortion, &sse, NULL, 0, 0);
-#endif
   }
 
 #if CONFIG_EXT_INTER
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 2c93976..1f147d7 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -210,7 +210,6 @@
 
 #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
 
-#if CONFIG_AFFINE_MOTION
 static INLINE const uint8_t *upre(const uint8_t *buf, int stride,
                                   int r, int c) {
   return &buf[(r) * stride + (c)];
@@ -232,7 +231,6 @@
   } else {                                                             \
     v = INT_MAX;                                                       \
   }
-#endif
 
 #define FIRST_LEVEL_CHECKS                              \
   {                                                     \
@@ -438,11 +436,7 @@
     int *distortion,
     unsigned int *sse1,
     const uint8_t *second_pred,
-#if CONFIG_AFFINE_MOTION
     int w, int h, int use_upsampled_ref) {
-#else
-    int w, int h) {
-#endif
   SETUP_SUBPEL_SEARCH;
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                z, src_stride, y, y_stride, second_pred,
@@ -455,9 +449,7 @@
   (void) allow_hp;
   (void) forced_stop;
   (void) hstep;
-#if CONFIG_AFFINE_MOTION
   (void) use_upsampled_ref;
-#endif
 
   if (cost_list &&
       cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
@@ -524,16 +516,10 @@
                                              int *distortion,
                                              unsigned int *sse1,
                                              const uint8_t *second_pred,
-#if CONFIG_AFFINE_MOTION
                                              int w, int h,
                                              int use_upsampled_ref) {
-#else
-                                             int w, int h) {
-#endif
   SETUP_SUBPEL_SEARCH;
-#if CONFIG_AFFINE_MOTION
   (void) use_upsampled_ref;
-#endif
 
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                z, src_stride, y, y_stride, second_pred,
@@ -607,15 +593,9 @@
                                         int *distortion,
                                         unsigned int *sse1,
                                         const uint8_t *second_pred,
-#if CONFIG_AFFINE_MOTION
                                         int w, int h, int use_upsampled_ref) {
-#else
-                                        int w, int h) {
-#endif
   SETUP_SUBPEL_SEARCH;
-#if CONFIG_AFFINE_MOTION
   (void) use_upsampled_ref;
-#endif
 
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                z, src_stride, y, y_stride, second_pred,
@@ -705,9 +685,8 @@
     {0, -1}, {0, 1}, {-1, 0}, {1, 0}
 };
 
-
-#if CONFIG_AFFINE_MOTION
 #if CONFIG_VP9_HIGHBITDEPTH
+// TODO(yunqing): Optimize the following 2 functions.
 static void highbd_comp_avg_upsampled_pred(uint16_t *comp_pred,
                                            const uint8_t *pred8,
                                            int width, int height,
@@ -798,7 +777,6 @@
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
   return besterr;
 }
-#endif
 
 int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  MV *bestmv, const MV *ref_mv,
@@ -812,11 +790,7 @@
                                  int *distortion,
                                  unsigned int *sse1,
                                  const uint8_t *second_pred,
-#if CONFIG_AFFINE_MOTION
                                  int w, int h, int use_upsampled_ref) {
-#else
-                                 int w, int h) {
-#endif
   const uint8_t *const z = x->plane[0].src.buf;
   const uint8_t *const src_address = z;
   const int src_stride = x->plane[0].src.stride;
@@ -852,7 +826,6 @@
   bestmv->row *= 8;
   bestmv->col *= 8;
 
-#if CONFIG_AFFINE_MOTION
   // use_upsampled_ref can be 0 or 1
   if (use_upsampled_ref)
     besterr = upsampled_setup_center_error(xd, bestmv, ref_mv, error_per_bit,
@@ -860,7 +833,6 @@
                                            second_pred, w, h, (offset << 3),
                                            mvjcost, mvcost, sse1, distortion);
   else
-#endif
     besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                  z, src_stride, y, y_stride, second_pred,
                                  w, h, offset, mvjcost, mvcost,
@@ -876,7 +848,6 @@
       if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
         MV this_mv = {tr, tc};
 
-#if CONFIG_AFFINE_MOTION
         if (use_upsampled_ref) {
           const uint8_t *const pre_address = y + tr * y_stride + tc;
 
@@ -884,7 +855,6 @@
                                          pre_address, y_stride, second_pred,
                                          w, h, &sse);
         } else {
-#endif
           const uint8_t *const pre_address = y + (tr >> 3) * y_stride +
               (tc >> 3);
           if (second_pred == NULL)
@@ -893,9 +863,7 @@
           else
             thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
                                 src_address, src_stride, &sse, second_pred);
-#if CONFIG_AFFINE_MOTION
         }
-#endif
 
         cost_array[idx] = thismse +
             mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
@@ -920,7 +888,6 @@
     if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
       MV this_mv = {tr, tc};
 
-#if CONFIG_AFFINE_MOTION
       if (use_upsampled_ref) {
         const uint8_t *const pre_address = y + tr * y_stride + tc;
 
@@ -928,7 +895,6 @@
                                        pre_address, y_stride, second_pred,
                                        w, h, &sse);
       } else {
-#endif
         const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
 
         if (second_pred == NULL)
@@ -937,9 +903,7 @@
         else
           thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
                               src_address, src_stride, &sse, second_pred);
-#if CONFIG_AFFINE_MOTION
       }
-#endif
 
       cost_array[4] = thismse +
           mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
@@ -963,15 +927,11 @@
     }
 
     if (iters_per_step > 1 && best_idx != -1) {
-#if CONFIG_AFFINE_MOTION
       if (use_upsampled_ref) {
         SECOND_LEVEL_CHECKS_BEST(1);
       } else {
-#endif
         SECOND_LEVEL_CHECKS_BEST(0);
-#if CONFIG_AFFINE_MOTION
       }
-#endif
     }
 
     tr = br;
diff --git a/vp10/encoder/mcomp.h b/vp10/encoder/mcomp.h
index a430c76..f99cd8b 100644
--- a/vp10/encoder/mcomp.h
+++ b/vp10/encoder/mcomp.h
@@ -116,11 +116,7 @@
     int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1,
     const uint8_t *second_pred,
-#if CONFIG_AFFINE_MOTION
     int w, int h, int use_upsampled_ref);
-#else
-    int w, int h);
-#endif
 
 extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned;
diff --git a/vp10/encoder/palette.c b/vp10/encoder/palette.c
index 522e185..d413935 100644
--- a/vp10/encoder/palette.c
+++ b/vp10/encoder/palette.c
@@ -93,7 +93,7 @@
                  uint8_t *pre_indices, int n, int k, int dim, int max_itr) {
   int i = 0;
   double pre_dist, this_dist;
-  double pre_centroids[PALETTE_MAX_SIZE];
+  double pre_centroids[2 * PALETTE_MAX_SIZE];
 
   vp10_calc_indices(data, centroids, indices, n, k, dim);
   pre_dist = calc_total_dist(data, centroids, indices, n, k, dim);
diff --git a/vp10/encoder/pickrst.c b/vp10/encoder/pickrst.c
index 9982836..13f955d 100644
--- a/vp10/encoder/pickrst.c
+++ b/vp10/encoder/pickrst.c
@@ -9,6 +9,7 @@
  */
 
 #include <assert.h>
+#include <float.h>
 #include <limits.h>
 #include <math.h>
 
@@ -27,12 +28,12 @@
 #include "vp10/encoder/picklpf.h"
 #include "vp10/encoder/pickrst.h"
 
-static int try_restoration_frame(const YV12_BUFFER_CONFIG *sd,
-                                 VP10_COMP *const cpi,
-                                 RestorationInfo *rsi,
-                                 int partial_frame) {
+static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *sd,
+                                     VP10_COMP *const cpi,
+                                     RestorationInfo *rsi,
+                                     int partial_frame) {
   VP10_COMMON *const cm = &cpi->common;
-  int filt_err;
+  int64_t filt_err;
   vp10_loop_restoration_frame(cm->frame_to_show, cm,
                               rsi, 1, partial_frame);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -55,7 +56,8 @@
                                   int filter_level, int partial_frame,
                                   double *best_cost_ret) {
   VP10_COMMON *const cm = &cpi->common;
-  int i, restoration_best, err;
+  int i, restoration_best;
+  int64_t err;
   double best_cost;
   double cost;
   const int restoration_level_bits = vp10_restoration_level_bits(&cpi->common);
@@ -400,8 +402,8 @@
   }
 }
 
-static void wiener_decompose_sep_sym(double *M, double *H,
-                                     double *a, double *b) {
+static int wiener_decompose_sep_sym(double *M, double *H,
+                                    double *a, double *b) {
   static const double init_filt[RESTORATION_WIN] = {
     0.035623, -0.127154,  0.211436,  0.760190,  0.211436, -0.127154,  0.035623,
   };
@@ -424,6 +426,7 @@
     update_b_sep_sym(Mc, Hc, a, b);
     iter++;
   }
+  return 1;
 }
 
 #define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
@@ -448,7 +451,8 @@
                                 double *best_cost_ret) {
   VP10_COMMON *const cm = &cpi->common;
   RestorationInfo rsi;
-  int err, bits;
+  int64_t err;
+  int bits;
   double cost_wiener, cost_norestore;
   MACROBLOCK *x = &cpi->td.mb;
   double M[RESTORATION_WIN2];
@@ -485,7 +489,10 @@
     compute_stats(dgd->y_buffer, src->y_buffer, width, height,
                   dgd_stride, src_stride, M, H);
 
-  wiener_decompose_sep_sym(M, H, vfilterd, hfilterd);
+  if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) {
+    *best_cost_ret = DBL_MAX;
+    return 0;
+  }
   quantize_sym_filter(vfilterd, vfilter);
   quantize_sym_filter(hfilterd, hfilter);
 
@@ -493,7 +500,7 @@
   memcpy(rsi.vfilter, vfilter, sizeof(rsi.vfilter));
   memcpy(rsi.hfilter, hfilter, sizeof(rsi.hfilter));
   err = try_restoration_frame(src, cpi, &rsi, partial_frame);
-  bits = 22;
+  bits = WIENER_FILT_BITS;
   cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits << 2), err);
 
   vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
@@ -511,10 +518,10 @@
     const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi, LPF_PICK_METHOD method) {
   VP10_COMMON *const cm = &cpi->common;
   struct loopfilter *const lf = &cm->lf;
-  int wiener_success;
-  double cost_bilateral = 1e12;
-  double cost_wiener = 1e12;
-  double cost_norestore = 1e12;
+  int wiener_success = 0;
+  double cost_bilateral = DBL_MAX;
+  double cost_wiener = DBL_MAX;
+  double cost_norestore = DBL_MAX;
 
   lf->sharpness_level =
       cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness;
@@ -577,8 +584,6 @@
     wiener_success = search_wiener_filter(
         sd, cpi, lf->filter_level, method == LPF_PICK_FROM_SUBIMAGE,
         cm->rst_info.vfilter, cm->rst_info.hfilter, &cost_wiener);
-    // printf("Costs %g %g (%d) %g\n",
-    //        cost_norestore, cost_bilateral, lf->filter_level, cost_wiener);
     if (cost_bilateral < cost_wiener) {
       lf->filter_level = blf_filter_level;
       if (cm->rst_info.restoration_level != -1)
@@ -591,5 +596,8 @@
       else
         cm->rst_info.restoration_type = RESTORE_NONE;
     }
+    // printf("[%d] Costs %g %g (%d) %g (%d)\n", cm->rst_info.restoration_type,
+    //        cost_norestore, cost_bilateral, lf->filter_level, cost_wiener,
+    //        wiener_success);
   }
 }
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index cd4c209..6543b33 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1691,21 +1691,23 @@
   return 0;
 }
 
-void rd_pick_palette_intra_sby(VP10_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                               int palette_ctx, int dc_mode_cost,
-                               PALETTE_MODE_INFO *palette_mode_info,
-                               uint8_t *best_palette_color_map,
-                               TX_SIZE *best_tx, PREDICTION_MODE *mode_selected,
-                               int64_t *best_rd) {
+static int rd_pick_palette_intra_sby(VP10_COMP *cpi, MACROBLOCK *x,
+                                     BLOCK_SIZE bsize,
+                                     int palette_ctx, int dc_mode_cost,
+                                     PALETTE_MODE_INFO *palette_mode_info,
+                                     uint8_t *best_palette_color_map,
+                                     TX_SIZE *best_tx,
+                                     PREDICTION_MODE *mode_selected,
+                                     int64_t *best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mi[0];
-  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
-  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
-  int this_rate, this_rate_tokenonly, s;
+  const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+  const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  int this_rate, this_rate_tokenonly, s, colors, n;
+  int rate_overhead = 0;
   int64_t this_distortion, this_rd;
-  int colors, n;
-  int src_stride = x->plane[0].src.stride;
-  uint8_t *src = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *const src = x->plane[0].src.buf;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cpi->common.use_highbitdepth)
@@ -1715,19 +1717,23 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     colors = vp10_count_colors(src, src_stride, rows, cols);
   palette_mode_info->palette_size[0] = 0;
+#if CONFIG_EXT_INTRA
+  mic->mbmi.ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+#endif  // CONFIG_EXT_INTRA
 
   if (colors > 1 && colors <= 64 && cpi->common.allow_screen_content_tools) {
     int r, c, i, j, k;
-    int max_itr = 50;
+    const int max_itr = 50;
     int color_ctx, color_idx = 0;
     int color_order[PALETTE_MAX_SIZE];
-    double *data = x->palette_buffer->kmeans_data_buf;
-    uint8_t *indices = x->palette_buffer->kmeans_indices_buf;
-    uint8_t *pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
+    double *const data = x->palette_buffer->kmeans_data_buf;
+    uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
+    uint8_t *const pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
     double centroids[PALETTE_MAX_SIZE];
-    uint8_t *color_map;
+    uint8_t *const color_map = xd->plane[0].color_index_map;
     double lb, ub, val;
-    PALETTE_MODE_INFO *pmi = &mic->mbmi.palette_mode_info;
+    MB_MODE_INFO *const mbmi = &mic->mbmi;
+    PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
 #if CONFIG_VP9_HIGHBITDEPTH
     uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
     if (cpi->common.use_highbitdepth)
@@ -1764,7 +1770,10 @@
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    mic->mbmi.mode = DC_PRED;
+    mbmi->mode = DC_PRED;
+#if CONFIG_EXT_INTRA
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+#endif  // CONFIG_EXT_INTRA
 
     for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
         n >= 2; --n) {
@@ -1805,7 +1814,7 @@
       vp10_calc_indices(data, centroids, indices, rows * cols, k, 1);
       for (r = 0; r < rows; ++r)
         for (c = 0; c < cols; ++c)
-          xd->plane[0].color_index_map[r * cols + c] = indices[r * cols + c];
+          color_map[r * cols + c] = indices[r * cols + c];
 
       super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
                       &s, NULL, bsize, *best_rd);
@@ -1814,12 +1823,10 @@
 
       this_rate = this_rate_tokenonly + dc_mode_cost +
           cpi->common.bit_depth * k * vp10_cost_bit(128, 0) +
-          cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - 2];
-      this_rate +=
+          cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - 2] +
+          write_uniform_cost(k, color_map[0]) +
           vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
-                                                         [palette_ctx], 1);
-      color_map = xd->plane[0].color_index_map;
-      this_rate +=  write_uniform_cost(k, xd->plane[0].color_index_map[0]);
+                                                        [palette_ctx], 1);
       for (i = 0; i < rows; ++i) {
         for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
           color_ctx = vp10_get_palette_color_context(color_map, cols, i, j,
@@ -1829,7 +1836,7 @@
               color_idx = r;
               break;
             }
-          assert(color_idx < k);
+          assert(color_idx >= 0 && color_idx < k);
           this_rate +=
               cpi->palette_y_color_cost[k - 2][color_ctx][color_idx];
         }
@@ -1838,14 +1845,17 @@
 
       if (this_rd < *best_rd) {
         *best_rd = this_rd;
-        *palette_mode_info = mic->mbmi.palette_mode_info;
-        memcpy(best_palette_color_map, xd->plane[0].color_index_map,
-               rows * cols * sizeof(xd->plane[0].color_index_map[0]));
+        *palette_mode_info = *pmi;
+        memcpy(best_palette_color_map, color_map,
+               rows * cols * sizeof(color_map[0]));
         *mode_selected = DC_PRED;
-        *best_tx = mic->mbmi.tx_size;
+        *best_tx = mbmi->tx_size;
+        rate_overhead = this_rate - this_rate_tokenonly;
       }
     }
   }
+
+  return rate_overhead;
 }
 
 static int64_t rd_pick_intra4x4block(VP10_COMP *cpi, MACROBLOCK *x,
@@ -2247,6 +2257,7 @@
   vp10_zero(ext_intra_mode_info);
   mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 1;
   mbmi->mode = DC_PRED;
+  mbmi->palette_mode_info.palette_size[0] = 0;
 
   for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
     mbmi->ext_intra_mode_info.ext_intra_mode[0] = mode;
@@ -2584,6 +2595,7 @@
   TX_TYPE best_tx_type = DCT_DCT;
   int *bmode_costs;
   PALETTE_MODE_INFO palette_mode_info;
+  PALETTE_MODE_INFO *const pmi = &mic->mbmi.palette_mode_info;
   uint8_t *best_palette_color_map = cpi->common.allow_screen_content_tools ?
       x->palette_buffer->best_palette_color_map : NULL;
   const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
@@ -2630,7 +2642,7 @@
 #endif  // CONFIG_EXT_INTRA
   memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
   palette_mode_info.palette_size[0] = 0;
-  mic->mbmi.palette_mode_info.palette_size[0] = 0;
+  pmi->palette_size[0] = 0;
   if (above_mi)
     palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
   if (left_mi)
@@ -2717,7 +2729,7 @@
                               &best_tx, &mode_selected, &best_rd);
 
 #if CONFIG_EXT_INTRA
-  if (!palette_mode_info.palette_size[0] > 0 && ALLOW_FILTER_INTRA_MODES) {
+  if (ALLOW_FILTER_INTRA_MODES) {
     if (rd_pick_ext_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
                               skippable, bsize, bmode_costs[DC_PRED],
                               &best_rd)) {
@@ -2733,6 +2745,7 @@
   if (ext_intra_mode_info.use_ext_intra_mode[0]) {
     mic->mbmi.ext_intra_mode_info.ext_intra_mode[0] =
         ext_intra_mode_info.ext_intra_mode[0];
+    palette_mode_info.palette_size[0] = 0;
   }
 #endif  // CONFIG_EXT_INTRA
 
@@ -2743,11 +2756,9 @@
   mic->mbmi.intra_filter = best_filter;
 #endif  // CONFIG_EXT_INTRA
   mic->mbmi.tx_type = best_tx_type;
-  mic->mbmi.palette_mode_info.palette_size[0] =
-      palette_mode_info.palette_size[0];
+  pmi->palette_size[0] = palette_mode_info.palette_size[0];
   if (palette_mode_info.palette_size[0] > 0) {
-    memcpy(mic->mbmi.palette_mode_info.palette_colors,
-           palette_mode_info.palette_colors,
+    memcpy(pmi->palette_colors, palette_mode_info.palette_colors,
            PALETTE_MAX_SIZE * sizeof(palette_mode_info.palette_colors[0]));
     memcpy(xd->plane[0].color_index_map, best_palette_color_map,
            rows * cols * sizeof(best_palette_color_map[0]));
@@ -3498,6 +3509,195 @@
   return is_cost_valid;
 }
 
+// Searches palette-based coding of the chroma (U and V) planes of the
+// current block.
+//
+// For every palette size n from min(colors, PALETTE_MAX_SIZE) down to 2 the
+// (U, V) sample pairs are clustered with vp10_k_means(), the resulting
+// colors are stored in the block's palette_mode_info, and the candidate is
+// costed with super_block_uvrd(). dc_mode_cost is added to the rate of every
+// candidate. When a candidate's RD cost beats *best_rd, *best_rd,
+// *palette_mode_info, best_palette_color_map, *mode_selected, *rate,
+// *rate_tokenonly, *distortion and *skippable are overwritten with that
+// candidate's values; otherwise the outputs are left untouched.
+static void rd_pick_palette_intra_sbuv(VP10_COMP *cpi, MACROBLOCK *x,
+                                       PICK_MODE_CONTEXT *ctx, int dc_mode_cost,
+                                       PALETTE_MODE_INFO *palette_mode_info,
+                                       uint8_t *best_palette_color_map,
+                                       PREDICTION_MODE *mode_selected,
+                                       int64_t *best_rd, int *rate,
+                                       int *rate_tokenonly,
+                                       int64_t *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+      (xd->plane[1].subsampling_y);
+  const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+      (xd->plane[1].subsampling_x);
+  int this_rate, this_rate_tokenonly, s;
+  int64_t this_distortion, this_rd;
+  int colors_u, colors_v, colors;
+  const int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+
+#if CONFIG_EXT_INTRA
+  // Palette candidates are evaluated with the chroma ext-intra flag cleared.
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cpi->common.use_highbitdepth) {
+    colors_u = vp10_count_colors_highbd(src_u, src_stride, rows, cols,
+                                        cpi->common.bit_depth);
+    colors_v = vp10_count_colors_highbd(src_v, src_stride, rows, cols,
+                                        cpi->common.bit_depth);
+  } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    colors_u = vp10_count_colors(src_u, src_stride, rows, cols);
+    colors_v = vp10_count_colors(src_v, src_stride, rows, cols);
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  colors = colors_u > colors_v ? colors_u : colors_v;
+  if (colors > 1 && colors <= 64) {
+    int r, c, n, i, j;
+    const int max_itr = 50;
+    int color_ctx, color_idx = 0;
+    int color_order[PALETTE_MAX_SIZE];
+    int64_t this_sse;
+    double lb_u, ub_u, val_u;
+    double lb_v, ub_v, val_v;
+    double *const data = x->palette_buffer->kmeans_data_buf;
+    uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
+    uint8_t *const pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
+    double centroids[2 * PALETTE_MAX_SIZE];
+    uint8_t *const color_map = xd->plane[1].color_index_map;
+    PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+    if (cpi->common.use_highbitdepth) {
+      lb_u = src_u16[0];
+      ub_u = src_u16[0];
+      lb_v = src_v16[0];
+      ub_v = src_v16[0];
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      lb_u = src_u[0];
+      ub_u = src_u[0];
+      lb_v = src_v[0];
+      ub_v = src_v[0];
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    mbmi->uv_mode = DC_PRED;
+
+    // Gather interleaved (U, V) samples for k-means and track each plane's
+    // value range, which seeds the initial centroids below.
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cpi->common.use_highbitdepth) {
+          val_u = src_u16[r * src_stride + c];
+          val_v = src_v16[r * src_stride + c];
+        } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+          val_u = src_u[r * src_stride + c];
+          val_v = src_v[r * src_stride + c];
+#if CONFIG_VP9_HIGHBITDEPTH
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        data[(r * cols + c) * 2] = val_u;
+        data[(r * cols + c) * 2 + 1] = val_v;
+        if (val_u < lb_u)
+          lb_u = val_u;
+        else if (val_u > ub_u)
+          ub_u = val_u;
+        if (val_v < lb_v)
+          lb_v = val_v;
+        else if (val_v > ub_v)
+          ub_v = val_v;
+      }
+    }
+
+    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
+        n >= 2; --n) {
+      // Spread the n starting centroids evenly across each plane's range.
+      for (i = 0; i < n; ++i) {
+        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+        centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+      }
+      vp10_k_means(data, centroids, indices, pre_indices, rows * cols, n, 2,
+                   max_itr);
+      pmi->palette_size[1] = n;
+      // Palette rows 1 and 2 hold the U and V colors respectively.
+      for (i = 1; i < 3; ++i) {
+        for (j = 0; j < n; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cpi->common.use_highbitdepth)
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+                clip_pixel_highbd(round(centroids[j * 2 + i - 1]),
+                                  cpi->common.bit_depth);
+          else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+                clip_pixel(round(centroids[j * 2 + i - 1]));
+        }
+      }
+      for (r = 0; r < rows; ++r)
+        for (c = 0; c < cols; ++c)
+          color_map[r * cols + c] = indices[r * cols + c];
+
+      super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                       &this_distortion, &s, &this_sse, bsize, *best_rd);
+      if (this_rate_tokenonly == INT_MAX)
+        continue;
+      this_rate = this_rate_tokenonly + dc_mode_cost +
+          2 * cpi->common.bit_depth * n * vp10_cost_bit(128, 0) +
+          cpi->palette_uv_size_cost[bsize - BLOCK_8X8][n - 2] +
+          write_uniform_cost(n, color_map[0]) +
+          vp10_cost_bit(vp10_default_palette_uv_mode_prob
+                        [pmi->palette_size[0] > 0], 1);
+
+      // Add the context-coded cost of every color index after the first.
+      for (i = 0; i < rows; ++i) {
+        for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+          color_ctx = vp10_get_palette_color_context(color_map, cols, i, j, n,
+                                                     color_order);
+          for (r = 0; r < n; ++r)
+            if (color_map[i * cols + j] == color_order[r]) {
+              color_idx = r;
+              break;
+            }
+          assert(color_idx >= 0 && color_idx < n);
+          this_rate +=
+              cpi->palette_uv_color_cost[n - 2][color_ctx][color_idx];
+        }
+      }
+
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        *palette_mode_info = *pmi;
+        memcpy(best_palette_color_map, color_map,
+               rows * cols * sizeof(best_palette_color_map[0]));
+        *mode_selected = DC_PRED;
+        *rate = this_rate;
+        *distortion = this_distortion;
+        *rate_tokenonly = this_rate_tokenonly;
+        *skippable = s;
+        if (!x->select_tx_size)
+          swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
+      }
+    }
+  }
+}
+
 #if CONFIG_EXT_INTRA
 // Return 1 if an ext intra mode is selected; return 0 otherwise.
 static int rd_pick_ext_intra_sbuv(VP10_COMP *cpi, MACROBLOCK *x,
@@ -3516,6 +3704,7 @@
   vp10_zero(ext_intra_mode_info);
   mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 1;
   mbmi->uv_mode = DC_PRED;
+  mbmi->palette_mode_info.palette_size[1] = 0;
 
   for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
     mbmi->ext_intra_mode_info.ext_intra_mode[1] = mode;
@@ -3670,6 +3859,13 @@
   int64_t best_rd = INT64_MAX, this_rd;
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
+  const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+      (xd->plane[1].subsampling_y);
+  const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+      (xd->plane[1].subsampling_x);
+  PALETTE_MODE_INFO palette_mode_info;
+  PALETTE_MODE_INFO *const pmi = &xd->mi[0]->mbmi.palette_mode_info;
+  uint8_t *best_palette_color_map = NULL;
 #if CONFIG_EXT_INTRA
   int is_directional_mode, rate_overhead, best_angle_delta = 0;
   EXT_INTRA_MODE_INFO ext_intra_mode_info;
@@ -3678,7 +3874,8 @@
   mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
 #endif  // CONFIG_EXT_INTRA
   memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
-  xd->mi[0]->mbmi.palette_mode_info.palette_size[1] = 0;
+  palette_mode_info.palette_size[1] = 0;
+  pmi->palette_size[1] = 0;
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
@@ -3705,15 +3902,19 @@
       this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
                                       MAX_ANGLE_DELTAS +
                                       mbmi->angle_delta[1]);
-    if (mode == DC_PRED && 0)
+    if (mode == DC_PRED)
       this_rate += vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 0);
 #else
     if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
                           &this_distortion, &s, &this_sse, bsize, best_rd))
       continue;
     this_rate = this_rate_tokenonly +
-        cpi->intra_uv_mode_cost[xd->mi[0]->mbmi.mode][mode];
+        cpi->intra_uv_mode_cost[mbmi->mode][mode];
 #endif  // CONFIG_EXT_INTRA
+    if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 &&
+        mode == DC_PRED)
+      this_rate += vp10_cost_bit(vp10_default_palette_uv_mode_prob
+                                 [pmi->palette_size[0] > 0], 0);
 
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
@@ -3732,6 +3933,15 @@
     }
   }
 
+  if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8) {
+    best_palette_color_map = x->palette_buffer->best_palette_color_map;
+    rd_pick_palette_intra_sbuv(cpi, x, ctx,
+                               cpi->intra_uv_mode_cost[mbmi->mode][DC_PRED],
+                               &palette_mode_info, best_palette_color_map,
+                               &mode_selected, &best_rd, rate, rate_tokenonly,
+                               distortion, skippable);
+  }
+
 #if CONFIG_EXT_INTRA
   if (mbmi->sb_type >= BLOCK_8X8 && ALLOW_FILTER_INTRA_MODES) {
     if (rd_pick_ext_intra_sbuv(cpi, x, ctx, rate, rate_tokenonly, distortion,
@@ -3743,12 +3953,23 @@
 
   mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
       ext_intra_mode_info.use_ext_intra_mode[1];
-  if (ext_intra_mode_info.use_ext_intra_mode[1])
+  if (ext_intra_mode_info.use_ext_intra_mode[1]) {
     mbmi->ext_intra_mode_info.ext_intra_mode[1] =
         ext_intra_mode_info.ext_intra_mode[1];
+    palette_mode_info.palette_size[1] = 0;
+  }
   mbmi->angle_delta[1] = best_angle_delta;
 #endif  // CONFIG_EXT_INTRA
   mbmi->uv_mode = mode_selected;
+  pmi->palette_size[1] = palette_mode_info.palette_size[1];
+  if (palette_mode_info.palette_size[1] > 0) {
+    memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+           palette_mode_info.palette_colors + PALETTE_MAX_SIZE,
+           2 * PALETTE_MAX_SIZE * sizeof(palette_mode_info.palette_colors[0]));
+    memcpy(xd->plane[1].color_index_map, best_palette_color_map,
+           rows * cols * sizeof(best_palette_color_map[0]));
+  }
+
   return best_rd;
 }
 
@@ -4443,52 +4664,52 @@
     if (bestsme < INT_MAX) {
       int dis; /* TODO: use dis in distortion calculation later. */
       unsigned int sse;
-#if CONFIG_AFFINE_MOTION
-      // Use up-sampled reference frames.
-      struct macroblockd_plane *const pd = &xd->plane[0];
-      struct buf_2d backup_pred = pd->pre[0];
-      const YV12_BUFFER_CONFIG *upsampled_ref =
-          get_upsampled_ref(cpi, refs[id]);
+      if (cpi->sf.use_upsampled_references) {
+        // Use up-sampled reference frames.
+        struct macroblockd_plane *const pd = &xd->plane[0];
+        struct buf_2d backup_pred = pd->pre[0];
+        const YV12_BUFFER_CONFIG *upsampled_ref =
+            get_upsampled_ref(cpi, refs[id]);
 
-      // Set pred for Y plane
-      setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
-                       upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
-                       NULL, pd->subsampling_x, pd->subsampling_y);
+        // Set pred for Y plane
+        setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
+                         upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                         NULL, pd->subsampling_x, pd->subsampling_y);
 
-      // If bsize < BLOCK_8X8, adjust pred pointer for this block
-      if (bsize < BLOCK_8X8)
-        pd->pre[0].buf =
-            &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, block,
-            pd->pre[0].stride)) << 3];
+        // If bsize < BLOCK_8X8, adjust pred pointer for this block
+        if (bsize < BLOCK_8X8)
+          pd->pre[0].buf =
+              &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, block,
+              pd->pre[0].stride)) << 3];
 
-      bestsme = cpi->find_fractional_mv_step(
-          x, &tmp_mv,
-          &ref_mv[id].as_mv,
-          cpi->common.allow_high_precision_mv,
-          x->errorperbit,
-          &cpi->fn_ptr[bsize],
-          0, cpi->sf.mv.subpel_iters_per_step,
-          NULL,
-          x->nmvjointcost, x->mvcost,
-          &dis, &sse, second_pred,
-          pw, ph, 1);
+        bestsme = cpi->find_fractional_mv_step(
+            x, &tmp_mv,
+            &ref_mv[id].as_mv,
+            cpi->common.allow_high_precision_mv,
+            x->errorperbit,
+            &cpi->fn_ptr[bsize],
+            0, cpi->sf.mv.subpel_iters_per_step,
+            NULL,
+            x->nmvjointcost, x->mvcost,
+            &dis, &sse, second_pred,
+            pw, ph, 1);
 
-      // Restore the reference frames.
-      pd->pre[0] = backup_pred;
-#else
-      (void) block;
-      bestsme = cpi->find_fractional_mv_step(
-          x, &tmp_mv,
-          &ref_mv[id].as_mv,
-          cpi->common.allow_high_precision_mv,
-          x->errorperbit,
-          &cpi->fn_ptr[bsize],
-          0, cpi->sf.mv.subpel_iters_per_step,
-          NULL,
-          x->nmvjointcost, x->mvcost,
-          &dis, &sse, second_pred,
-          pw, ph);
-#endif
+        // Restore the reference frames.
+        pd->pre[0] = backup_pred;
+      } else {
+        (void) block;
+        bestsme = cpi->find_fractional_mv_step(
+            x, &tmp_mv,
+            &ref_mv[id].as_mv,
+            cpi->common.allow_high_precision_mv,
+            x->errorperbit,
+            &cpi->fn_ptr[bsize],
+            0, cpi->sf.mv.subpel_iters_per_step,
+            NULL,
+            x->nmvjointcost, x->mvcost,
+            &dis, &sse, second_pred,
+            pw, ph, 0);
+        }
     }
 
     // Restore the pointer to the first (possibly scaled) prediction buffer.
@@ -4769,57 +4990,57 @@
 
           if (bestsme < INT_MAX) {
             int distortion;
-#if CONFIG_AFFINE_MOTION
-            const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
-            const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
-            // Use up-sampled reference frames.
-            struct macroblockd_plane *const pd = &xd->plane[0];
-            struct buf_2d backup_pred = pd->pre[0];
-            const YV12_BUFFER_CONFIG *upsampled_ref =
-                get_upsampled_ref(cpi, mbmi->ref_frame[0]);
+            if (cpi->sf.use_upsampled_references) {
+              const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+              const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+              // Use up-sampled reference frames.
+              struct macroblockd_plane *const pd = &xd->plane[0];
+              struct buf_2d backup_pred = pd->pre[0];
+              const YV12_BUFFER_CONFIG *upsampled_ref =
+                  get_upsampled_ref(cpi, mbmi->ref_frame[0]);
 
-            // Set pred for Y plane
-            setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
-                             upsampled_ref->y_stride,
-                             (mi_row << 3), (mi_col << 3),
-                             NULL, pd->subsampling_x, pd->subsampling_y);
+              // Set pred for Y plane
+              setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
+                               upsampled_ref->y_stride,
+                               (mi_row << 3), (mi_col << 3),
+                               NULL, pd->subsampling_x, pd->subsampling_y);
 
-            // adjust pred pointer for this block
-            pd->pre[0].buf =
-                &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, i,
-                pd->pre[0].stride)) << 3];
+              // adjust pred pointer for this block
+              pd->pre[0].buf =
+                  &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, i,
+                  pd->pre[0].stride)) << 3];
 
-            cpi->find_fractional_mv_step(
-                x,
-                new_mv,
-                &bsi->ref_mv[0]->as_mv,
-                cm->allow_high_precision_mv,
-                x->errorperbit, &cpi->fn_ptr[bsize],
-                cpi->sf.mv.subpel_force_stop,
-                cpi->sf.mv.subpel_iters_per_step,
-                cond_cost_list(cpi, cost_list),
-                x->nmvjointcost, x->mvcost,
-                &distortion,
-                &x->pred_sse[mbmi->ref_frame[0]],
-                NULL, pw, ph, 1);
+              cpi->find_fractional_mv_step(
+                  x,
+                  new_mv,
+                  &bsi->ref_mv[0]->as_mv,
+                  cm->allow_high_precision_mv,
+                  x->errorperbit, &cpi->fn_ptr[bsize],
+                  cpi->sf.mv.subpel_force_stop,
+                  cpi->sf.mv.subpel_iters_per_step,
+                  cond_cost_list(cpi, cost_list),
+                  x->nmvjointcost, x->mvcost,
+                  &distortion,
+                  &x->pred_sse[mbmi->ref_frame[0]],
+                  NULL, pw, ph, 1);
 
-            // Restore the reference frames.
-            pd->pre[0] = backup_pred;
-#else
-            cpi->find_fractional_mv_step(
-                x,
-                new_mv,
-                &bsi->ref_mv[0]->as_mv,
-                cm->allow_high_precision_mv,
-                x->errorperbit, &cpi->fn_ptr[bsize],
-                cpi->sf.mv.subpel_force_stop,
-                cpi->sf.mv.subpel_iters_per_step,
-                cond_cost_list(cpi, cost_list),
-                x->nmvjointcost, x->mvcost,
-                &distortion,
-                &x->pred_sse[mbmi->ref_frame[0]],
-                NULL, 0, 0);
-#endif
+              // Restore the reference frames.
+              pd->pre[0] = backup_pred;
+            } else {
+              cpi->find_fractional_mv_step(
+                  x,
+                  new_mv,
+                  &bsi->ref_mv[0]->as_mv,
+                  cm->allow_high_precision_mv,
+                  x->errorperbit, &cpi->fn_ptr[bsize],
+                  cpi->sf.mv.subpel_force_stop,
+                  cpi->sf.mv.subpel_iters_per_step,
+                  cond_cost_list(cpi, cost_list),
+                  x->nmvjointcost, x->mvcost,
+                  &distortion,
+                  &x->pred_sse[mbmi->ref_frame[0]],
+                  NULL, 0, 0, 0);
+            }
 
             // save motion search result for use in compound prediction
 #if CONFIG_EXT_INTER
@@ -5416,43 +5637,43 @@
 
   if (bestsme < INT_MAX) {
     int dis;  /* TODO: use dis in distortion calculation later. */
-#if CONFIG_AFFINE_MOTION
-    const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
-    const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
-    // Use up-sampled reference frames.
-    struct macroblockd_plane *const pd = &xd->plane[0];
-    struct buf_2d backup_pred = pd->pre[ref_idx];
-    const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+    if (cpi->sf.use_upsampled_references) {
+      const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+      const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+      // Use up-sampled reference frames.
+      struct macroblockd_plane *const pd = &xd->plane[0];
+      struct buf_2d backup_pred = pd->pre[ref_idx];
+      const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
 
-    // Set pred for Y plane
-    setup_pred_plane(&pd->pre[ref_idx], upsampled_ref->y_buffer,
-                     upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
-                     NULL, pd->subsampling_x, pd->subsampling_y);
+      // Set pred for Y plane
+      setup_pred_plane(&pd->pre[ref_idx], upsampled_ref->y_buffer,
+                       upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                       NULL, pd->subsampling_x, pd->subsampling_y);
 
-    bestsme = cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
-                                           cm->allow_high_precision_mv,
-                                           x->errorperbit,
-                                           &cpi->fn_ptr[bsize],
-                                           cpi->sf.mv.subpel_force_stop,
-                                           cpi->sf.mv.subpel_iters_per_step,
-                                           cond_cost_list(cpi, cost_list),
-                                           x->nmvjointcost, x->mvcost,
-                                           &dis, &x->pred_sse[ref], NULL,
-                                           pw, ph, 1);
+      bestsme = cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+                                             cm->allow_high_precision_mv,
+                                             x->errorperbit,
+                                             &cpi->fn_ptr[bsize],
+                                             cpi->sf.mv.subpel_force_stop,
+                                             cpi->sf.mv.subpel_iters_per_step,
+                                             cond_cost_list(cpi, cost_list),
+                                             x->nmvjointcost, x->mvcost,
+                                             &dis, &x->pred_sse[ref], NULL,
+                                             pw, ph, 1);
 
-    // Restore the reference frames.
-    pd->pre[ref_idx] = backup_pred;
-#else
-    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
-                                 cm->allow_high_precision_mv,
-                                 x->errorperbit,
-                                 &cpi->fn_ptr[bsize],
-                                 cpi->sf.mv.subpel_force_stop,
-                                 cpi->sf.mv.subpel_iters_per_step,
-                                 cond_cost_list(cpi, cost_list),
-                                 x->nmvjointcost, x->mvcost,
-                                 &dis, &x->pred_sse[ref], NULL, 0, 0);
-#endif
+      // Restore the reference frames.
+      pd->pre[ref_idx] = backup_pred;
+    } else {
+      cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+                                   cm->allow_high_precision_mv,
+                                   x->errorperbit,
+                                   &cpi->fn_ptr[bsize],
+                                   cpi->sf.mv.subpel_force_stop,
+                                   cpi->sf.mv.subpel_iters_per_step,
+                                   cond_cost_list(cpi, cost_list),
+                                   x->nmvjointcost, x->mvcost,
+                                   &dis, &x->pred_sse[ref], NULL, 0, 0, 0);
+    }
   }
   *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
@@ -7066,6 +7287,66 @@
          vp10_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE);
 }
 
+// Recomputes the chroma color index map of the current block from the
+// palette stored in its mode info: the (U, V) source sample pairs and the
+// palette colors are handed to vp10_calc_indices(), and the indices it
+// produces are copied into xd->plane[1].color_index_map.
+static void restore_uv_color_map(VP10_COMP *cpi, MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  PALETTE_MODE_INFO *const pmi = &xd->mi[0]->mbmi.palette_mode_info;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+      (xd->plane[1].subsampling_y);
+  const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+      (xd->plane[1].subsampling_x);
+  const int num_pixels = rows * cols;
+  const int palette_size = pmi->palette_size[1];
+  const int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  double *const data = x->palette_buffer->kmeans_data_buf;
+  uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  double centroids[2 * PALETTE_MAX_SIZE];
+  int row, col, k;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  (void)cpi;  // Only referenced in high bit depth builds.
+
+  // Interleave the source samples as (U, V) pairs.
+  for (row = 0; row < rows; ++row) {
+    for (col = 0; col < cols; ++col) {
+      double *const pair = &data[(row * cols + col) * 2];
+      const int src_pos = row * src_stride + col;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cpi->common.use_highbitdepth) {
+        pair[0] = src_u16[src_pos];
+        pair[1] = src_v16[src_pos];
+      } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        pair[0] = src_u[src_pos];
+        pair[1] = src_v[src_pos];
+#if CONFIG_VP9_HIGHBITDEPTH
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+  }
+
+  // Use the stored palette colors as the (U, V) centroids.
+  for (k = 0; k < palette_size; ++k) {
+    centroids[k * 2] = pmi->palette_colors[PALETTE_MAX_SIZE + k];
+    centroids[k * 2 + 1] = pmi->palette_colors[2 * PALETTE_MAX_SIZE + k];
+  }
+
+  vp10_calc_indices(data, centroids, indices, num_pixels, palette_size, 2);
+
+  // The color map is stored densely (stride == cols), so copy it flat.
+  for (k = 0; k < num_pixels; ++k)
+    color_map[k] = indices[k];
+}
+
 void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
                                 TileDataEnc *tile_data,
                                 MACROBLOCK *x,
@@ -7082,6 +7360,7 @@
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const struct segmentation *const seg = &cm->seg;
   PREDICTION_MODE this_mode;
@@ -7126,6 +7405,7 @@
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
   PREDICTION_MODE mode_uv[TX_SIZES];
+  PALETTE_MODE_INFO pmi_uv[TX_SIZES];
 #if CONFIG_EXT_INTRA
   EXT_INTRA_MODE_INFO ext_intra_mode_info_uv[TX_SIZES];
   int8_t uv_angle_delta[TX_SIZES];
@@ -7153,6 +7433,11 @@
   int64_t mask_filter = 0;
   int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  int palette_ctx = 0;
+  const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+  const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi = xd->left_mi;
 #if CONFIG_OBMC
 #if CONFIG_VP9_HIGHBITDEPTH
   DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * 64 * 64]);
@@ -7188,6 +7473,14 @@
 #endif  // CONFIG_OBMC
 
   vp10_zero(best_mbmode);
+  vp10_zero(pmi_uv);
+
+  if (cm->allow_screen_content_tools) {
+    if (above_mi)
+      palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    if (left_mi)
+      palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+  }
 
 #if CONFIG_EXT_INTRA
   memset(directional_mode_skip_mask, 0,
@@ -7370,8 +7663,6 @@
     midx = end_pos;
   }
 
-  mbmi->palette_mode_info.palette_size[0] = 0;
-  mbmi->palette_mode_info.palette_size[1] = 0;
   for (midx = 0; midx < MAX_MODES; ++midx) {
     int mode_index = mode_map[midx];
     int mode_excluded = 0;
@@ -7565,6 +7856,8 @@
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame[0] = ref_frame;
     mbmi->ref_frame[1] = second_ref_frame;
+    pmi->palette_size[0] = 0;
+    pmi->palette_size[1] = 0;
 #if CONFIG_EXT_INTRA
     mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
     mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
@@ -7597,7 +7890,6 @@
       TX_SIZE uv_tx;
       struct macroblockd_plane *const pd = &xd->plane[1];
       memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
-
 #if CONFIG_EXT_INTRA
       is_directional_mode = (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED);
       if (is_directional_mode) {
@@ -7681,6 +7973,8 @@
         choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
+        if (cm->allow_screen_content_tools)
+          pmi_uv[uv_tx] = *pmi;
 #if CONFIG_EXT_INTRA
         ext_intra_mode_info_uv[uv_tx] = mbmi->ext_intra_mode_info;
         uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
@@ -7691,6 +7985,12 @@
       distortion_uv = dist_uv[uv_tx];
       skippable = skippable && skip_uv[uv_tx];
       mbmi->uv_mode = mode_uv[uv_tx];
+      if (cm->allow_screen_content_tools) {
+        pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+        memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+               pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+               2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+      }
 #if CONFIG_EXT_INTRA
       mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
       mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
@@ -7702,6 +8002,10 @@
 #endif  // CONFIG_EXT_INTRA
 
       rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
+      if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED)
+        rate2 +=
+            vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                          [palette_ctx], 0);
 
       if (!xd->lossless[mbmi->segment_id]) {
         // super_block_yrd above includes the cost of the tx_size in the
@@ -8140,6 +8444,104 @@
       break;
   }
 
+  // Only try palette mode when the best mode so far is an intra mode.
+  if (cm->allow_screen_content_tools && !is_inter_mode(best_mbmode.mode)) {
+    PREDICTION_MODE mode_selected;
+    int rate2 = 0, rate_y = 0;
+    int64_t distortion2 = 0, distortion_y = 0, dummy_rd = best_rd, this_rd;
+    int skippable = 0, rate_overhead = 0;
+    TX_SIZE best_tx_size, uv_tx;
+    PALETTE_MODE_INFO palette_mode_info;
+    uint8_t *const best_palette_color_map =
+        x->palette_buffer->best_palette_color_map;
+    uint8_t *const color_map = xd->plane[0].color_index_map;
+
+    mbmi->mode = DC_PRED;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ref_frame[0] = INTRA_FRAME;
+    mbmi->ref_frame[1] = NONE;
+    memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+    palette_mode_info.palette_size[0] = 0;
+    rate_overhead =
+        rd_pick_palette_intra_sby(cpi, x, bsize, palette_ctx,
+                                  intra_mode_cost[DC_PRED],
+                                  &palette_mode_info, best_palette_color_map,
+                                  &best_tx_size, &mode_selected, &dummy_rd);
+    if (palette_mode_info.palette_size[0] == 0)
+      goto PALETTE_EXIT;
+
+    pmi->palette_size[0] =
+        palette_mode_info.palette_size[0];
+    if (palette_mode_info.palette_size[0] > 0) {
+      memcpy(pmi->palette_colors, palette_mode_info.palette_colors,
+             PALETTE_MAX_SIZE * sizeof(palette_mode_info.palette_colors[0]));
+      memcpy(color_map, best_palette_color_map,
+             rows * cols * sizeof(best_palette_color_map[0]));
+    }
+    super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                    NULL, bsize, best_rd);
+    if (rate_y == INT_MAX)
+      goto PALETTE_EXIT;
+    uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize,
+                                xd->plane[1].subsampling_x,
+                                xd->plane[1].subsampling_y);
+    if (rate_uv_intra[uv_tx] == INT_MAX) {
+      choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
+                           &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
+                           &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
+      pmi_uv[uv_tx] = *pmi;
+#if CONFIG_EXT_INTRA
+      ext_intra_mode_info_uv[uv_tx] = mbmi->ext_intra_mode_info;
+      uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif  // CONFIG_EXT_INTRA
+    }
+    mbmi->uv_mode = mode_uv[uv_tx];
+    pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+    if (pmi->palette_size[1] > 0)
+      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+             pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+#if CONFIG_EXT_INTRA
+    mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+        ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1];
+    if (ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1]) {
+      mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+          ext_intra_mode_info_uv[uv_tx].ext_intra_mode[1];
+    }
+#endif  // CONFIG_EXT_INTRA
+    skippable = skippable && skip_uv[uv_tx];
+    distortion2 = distortion_y + dist_uv[uv_tx];
+    rate2 = rate_y + rate_overhead + rate_uv_intra[uv_tx];
+    rate2 += ref_costs_single[INTRA_FRAME];
+
+    if (skippable) {
+      rate2 -= (rate_y + rate_uv_tokenonly[uv_tx]);
+      rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+    } else {
+      rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+    }
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    if (this_rd < best_rd) {
+      int max_plane = MAX_MB_PLANE;
+      best_mode_index = 3;
+      mbmi->mv[0].as_int = 0;
+      max_plane = 1;
+      rd_cost->rate = rate2;
+      rd_cost->dist = distortion2;
+      rd_cost->rdcost = this_rd;
+      best_rd = this_rd;
+      best_mbmode = *mbmi;
+      best_skip2 = 0;
+      best_mode_skippable = skippable;
+      if (!x->select_tx_size)
+        swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+      memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+             sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+    }
+  }
+  PALETTE_EXIT:
+
   // The inter modes' rate costs are not calculated precisely in some cases.
   // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
   // ZEROMV. Here, checks are added for those cases, and the mode decisions
@@ -8366,6 +8768,10 @@
 
   store_coding_context(x, ctx, best_mode_index, best_pred_diff,
                        best_filter_diff, best_mode_skippable);
+
+  if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) {
+    restore_uv_color_map(cpi, x);
+  }
 }
 
 void vp10_rd_pick_inter_mode_sb_seg_skip(VP10_COMP *cpi,
@@ -8951,16 +9357,6 @@
         if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
             mbmi->interp_filter != EIGHTTAP_REGULAR) {
           mbmi->interp_filter = EIGHTTAP_REGULAR;
-          tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
-                   &x->mbmi_ext->ref_mvs[ref_frame][0],
-                   second_ref, best_yrd, &rate, &rate_y,
-                   &distortion, &skippable, &total_sse,
-                   (int) this_rd_thresh, seg_mvs,
-#if CONFIG_EXT_INTER
-                   compound_seg_newmvs,
-#endif  // CONFIG_EXT_INTER
-                   bsi, 0,
-                   mi_row, mi_col);
         }
 #endif  // CONFIG_EXT_INTERP
         if (tmp_rd == INT64_MAX)
diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h
index f4d9b95..174ad4d 100644
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h
@@ -103,13 +103,16 @@
                                          int tmp_stride[MAX_MB_PLANE]);
 #endif  // CONFIG_OBMC
 
-#if CONFIG_AFFINE_MOTION
 static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(VP10_COMP *cpi,
                                                           const int ref) {
   // Use up-sampled reference frames.
   int ref_idx = 0;
   if (ref == LAST_FRAME)
+#if CONFIG_EXT_REFS
+    ref_idx = cpi->lst_fb_idxes[ref - LAST_FRAME];
+#else
     ref_idx = cpi->lst_fb_idx;
+#endif
   else if (ref == GOLDEN_FRAME)
     ref_idx = cpi->gld_fb_idx;
   else if (ref == ALTREF_FRAME)
@@ -117,7 +120,6 @@
 
   return &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[ref_idx]].buf;
 }
-#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index ec8acda..169ae2c 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -177,6 +177,7 @@
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->allow_partition_search_skip = 1;
+    sf->use_upsampled_references = 0;
 #if CONFIG_EXT_TX
     sf->tx_type_search = PRUNE_TWO;
 #endif
@@ -279,6 +280,7 @@
   sf->use_fast_coef_costing = 1;
   sf->allow_exhaustive_searches = 0;
   sf->exhaustive_searches_thresh = INT_MAX;
+  sf->use_upsampled_references = 0;
 
   // Use transform domain distortion computation
   // Note var-tx expt always uses pixel domain distortion.
@@ -495,6 +497,11 @@
   sf->disable_filter_search_var_thresh = 0;
   sf->adaptive_interp_filter_search = 0;
   sf->allow_partition_search_skip = 0;
+#if CONFIG_EXT_REFS
+  sf->use_upsampled_references = 0;
+#else
+  sf->use_upsampled_references = 1;
+#endif
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index fbb6988..02ee204 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -479,6 +479,9 @@
   // Fast approximation of vp10_model_rd_from_var_lapndz
   int simple_model_rd_from_var;
 
+  // Do sub-pixel search in up-sampled reference frames
+  int use_upsampled_references;
+
   // Whether to compute distortion in the image domain (slower but
   // more accurate), or in the transform domain (faster but less acurate).
   int use_transform_domain_distortion;
diff --git a/vp10/encoder/temporal_filter.c b/vp10/encoder/temporal_filter.c
index 3e1246a..b3cf899 100644
--- a/vp10/encoder/temporal_filter.c
+++ b/vp10/encoder/temporal_filter.c
@@ -320,11 +320,7 @@
                                          0, mv_sf->subpel_iters_per_step,
                                          cond_cost_list(cpi, cost_list),
                                          NULL, NULL,
-#if CONFIG_AFFINE_MOTION
                                          &distortion, &sse, NULL, 0, 0, 0);
-#else
-                                         &distortion, &sse, NULL, 0, 0);
-#endif
 
   // Restore input state
   x->plane[0].src = src;
diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index 5cae8e3..c71c985 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c
@@ -398,13 +398,18 @@
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  uint8_t *color_map = xd->plane[0].color_index_map;
+  uint8_t *color_map = xd->plane[plane != 0].color_index_map;
   PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
   int n = pmi->palette_size[plane != 0];
   int i, j, k;
   int color_new_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
-  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
-  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+      (xd->plane[plane != 0].subsampling_y);
+  const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+      (xd->plane[plane != 0].subsampling_x);
+  const vpx_prob (* const probs)[PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] =
+      plane == 0 ? vp10_default_palette_y_color_prob :
+          vp10_default_palette_uv_color_prob;
 
   for (i = 0; i < rows; ++i) {
     for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
@@ -416,9 +421,8 @@
           break;
         }
       assert(color_new_idx >= 0 && color_new_idx < n);
-
       (*t)->token = color_new_idx;
-      (*t)->context_tree = vp10_default_palette_y_color_prob[n - 2][color_ctx];
+      (*t)->context_tree = probs[n - 2][color_ctx];
       (*t)->skip_eob_node = 0;
       ++(*t);
     }
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index 169769a..ee1e305 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -267,7 +267,6 @@
   }
 }
 
-#if CONFIG_AFFINE_MOTION
 // Get pred block from up-sampled reference.
 void vpx_upsampled_pred_c(uint8_t *comp_pred,
                           int width, int height,
@@ -300,7 +299,6 @@
       ref += stride;
     }
 }
-#endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_variance64(const uint8_t *a8, int  a_stride,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 583d9fa..e5c002a 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1233,12 +1233,10 @@
 #
 # ...
 #
-if (vpx_config("CONFIG_AFFINE_MOTION") eq "yes") {
-  add_proto qw/void vpx_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride";
-    specialize qw/vpx_upsampled_pred sse2/;
-  add_proto qw/void vpx_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-    specialize qw/vpx_comp_avg_upsampled_pred sse2/;
-}
+add_proto qw/void vpx_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride";
+specialize qw/vpx_upsampled_pred sse2/;
+add_proto qw/void vpx_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+specialize qw/vpx_comp_avg_upsampled_pred sse2/;
 
 #
 # ...
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index 7943c84..63fc1e6 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -476,7 +476,6 @@
 #undef FN
 #endif  // CONFIG_USE_X86INC
 
-#if CONFIG_AFFINE_MOTION
 void vpx_upsampled_pred_sse2(uint8_t *comp_pred,
                              int width, int height,
                              const uint8_t *ref,  int ref_stride) {
@@ -703,4 +702,3 @@
       }
     }
 }
-#endif