Merge "Speed up tm_predictor_16x16"
diff --git a/.mailmap b/.mailmap
index 42f3617..4672e5c 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1,5 +1,4 @@
 Adrian Grange <agrange@google.com>
-Adrian Grange <agrange@google.com> <agrange@agrange-macbookpro.roam.corp.google.com>
 Aℓex Converse <aconverse@google.com>
 Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
@@ -8,13 +7,11 @@
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
 Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
-Hangyu Kuang <hkuang@google.com> <hkuang@hkuang-macbookpro.roam.corp.google.com>
 Hui Su <huisu@google.com>
 Jacky Chen <jackychen@google.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
 John Koleszar <jkoleszar@google.com>
 Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
@@ -33,4 +30,3 @@
 Tom Finegan <tomfinegan@google.com>
 Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
-Yaowu Xu <yaowu@google.com> <yaowu@YAOWU2-W.ad.corp.google.com>
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 6f7180d..ae5ba18 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -29,11 +29,14 @@
 LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
 LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
 ORIG_PWD="$(pwd)"
-TARGETS="arm64-darwin-gcc
-         armv7-darwin-gcc
-         armv7s-darwin-gcc
-         x86-iphonesimulator-gcc
-         x86_64-iphonesimulator-gcc"
+ARM_TARGETS="arm64-darwin-gcc
+             armv7-darwin-gcc
+             armv7s-darwin-gcc"
+SIM_TARGETS="x86-iphonesimulator-gcc
+             x86_64-iphonesimulator-gcc"
+OSX_TARGETS="x86-darwin15-gcc
+             x86_64-darwin15-gcc"
+TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
 
 # Configures for the target specified by $1, and invokes make with the dist
 # target using $DIST_DIR as the distribution output directory.
@@ -197,15 +200,27 @@
   fi
 }
 
+print_list() {
+  local indent="$1"
+  shift
+  local list="$@"
+  for entry in ${list}; do
+    echo "${indent}${entry}"
+  done
+}
+
 iosbuild_usage() {
 cat << EOF
   Usage: ${0##*/} [arguments]
     --help: Display this message and exit.
     --extra-configure-args <args>: Extra args to pass when configuring libvpx.
+    --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
+              and x86_64. Allows linking to framework when builds target MacOSX
+              instead of iOS.
     --preserve-build-output: Do not delete the build directory.
     --show-build-output: Show output from each library build.
     --targets <targets>: Override default target list. Defaults:
-         ${TARGETS}
+$(print_list "        " ${TARGETS})
     --test-link: Confirms all targets can be linked. Functionally identical to
                  passing --enable-examples via --extra-configure-args.
     --verbose: Output information about the environment and each stage of the
@@ -249,6 +264,9 @@
       TARGETS="$2"
       shift
       ;;
+    --macosx)
+      TARGETS="${ARM_TARGETS} ${OSX_TARGETS}"
+      ;;
     --verbose)
       VERBOSE=yes
       ;;
@@ -273,10 +291,12 @@
   MAKEFLAGS=${MAKEFLAGS}
   ORIG_PWD=${ORIG_PWD}
   PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
-  TARGETS="${TARGETS}"
+  TARGETS="$(print_list "" ${TARGETS})"
+  OSX_TARGETS="${OSX_TARGETS}"
+  SIM_TARGETS="${SIM_TARGETS}"
 EOF
 fi
 
 build_framework "${TARGETS}"
 echo "Successfully built '${FRAMEWORK_DIR}' for:"
-echo "         ${TARGETS}"
+print_list "" ${TARGETS}
diff --git a/test/test.mk b/test/test.mk
index 8d66244..2487bd2 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -92,10 +92,9 @@
 ## shared library builds don't make these functions accessible.
 ##
 ifeq ($(CONFIG_SHARED),)
-LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += lpf_8_test.cc
 
 ## VP8
-ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
+ifeq ($(CONFIG_VP8),yes)
 
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
@@ -105,10 +104,10 @@
 
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
 
 LIBVPX_TEST_SRCS-yes                   += idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
@@ -121,7 +120,7 @@
 endif # VP8
 
 ## VP9
-ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
+ifeq ($(CONFIG_VP9),yes)
 
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
@@ -134,25 +133,25 @@
 LIBVPX_TEST_SRCS-yes                   += vp9_encoder_parms_get_to_decoder.cc
 endif
 
-LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
+LIBVPX_TEST_SRCS-yes                   += convolve_test.cc
+LIBVPX_TEST_SRCS-yes                   += lpf_8_test.cc
+LIBVPX_TEST_SRCS-yes                   += vp9_intrapred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += vp9_intrapred_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
-
 endif
 
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
@@ -162,14 +161,19 @@
 
 endif # VP9
 
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
-
-TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
-TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
-
 ## VP10
+ifeq ($(CONFIG_VP10),yes)
+
+LIBVPX_TEST_SRCS-yes                    += vp10_inv_txfm_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm_test.cc
+
+endif # VP10
+
+## Multi-codec / unconditional whitebox tests.
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
+
+TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
+TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
 
 endif # CONFIG_SHARED
 
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index d44a64a..4064ea6 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -187,23 +187,19 @@
                 vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
                 vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)
 
-#if HAVE_SSE && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE, TestIntraPred4, vpx_dc_predictor_4x4_sse,
-                vpx_dc_left_predictor_4x4_sse, vpx_dc_top_predictor_4x4_sse,
-                vpx_dc_128_predictor_4x4_sse, vpx_v_predictor_4x4_sse, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-#endif  // HAVE_SSE && CONFIG_USE_X86INC
-
 #if HAVE_SSE2 && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE2, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse2)
+INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
+                vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
+                vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
+                vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_4x4_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
-                vpx_h_predictor_4x4_ssse3, vpx_d45_predictor_4x4_ssse3, NULL,
-                NULL, vpx_d153_predictor_4x4_ssse3,
-                vpx_d207_predictor_4x4_ssse3, vpx_d63_predictor_4x4_ssse3, NULL)
+                NULL, vpx_d45_predictor_4x4_ssse3, NULL, NULL,
+                vpx_d153_predictor_4x4_ssse3, vpx_d207_predictor_4x4_ssse3,
+                vpx_d63_predictor_4x4_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
 
 #if HAVE_DSPR2
@@ -240,23 +236,19 @@
                 vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c,
                 vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c)
 
-#if HAVE_SSE && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE, TestIntraPred8, vpx_dc_predictor_8x8_sse,
-                vpx_dc_left_predictor_8x8_sse, vpx_dc_top_predictor_8x8_sse,
-                vpx_dc_128_predictor_8x8_sse, vpx_v_predictor_8x8_sse, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-#endif  // HAVE_SSE && CONFIG_USE_X86INC
-
 #if HAVE_SSE2 && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE2, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
+INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
+                vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
+                vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
+                vpx_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_8x8_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
-                vpx_h_predictor_8x8_ssse3, vpx_d45_predictor_8x8_ssse3, NULL,
-                NULL, vpx_d153_predictor_8x8_ssse3,
-                vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
+                NULL, vpx_d45_predictor_8x8_ssse3, NULL, NULL,
+                vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
+                vpx_d63_predictor_8x8_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
 
 #if HAVE_DSPR2
@@ -298,13 +290,13 @@
                 vpx_dc_left_predictor_16x16_sse2,
                 vpx_dc_top_predictor_16x16_sse2,
                 vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_tm_predictor_16x16_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL,
-                vpx_h_predictor_16x16_ssse3, vpx_d45_predictor_16x16_ssse3,
+                NULL, vpx_d45_predictor_16x16_ssse3,
                 NULL, NULL, vpx_d153_predictor_16x16_ssse3,
                 vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3,
                 NULL)
@@ -350,23 +342,23 @@
                 vpx_dc_left_predictor_32x32_sse2,
                 vpx_dc_top_predictor_32x32_sse2,
                 vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_sse2)
+                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_32x32_sse2)
 #else
 INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
                 vpx_dc_left_predictor_32x32_sse2,
                 vpx_dc_top_predictor_32x32_sse2,
                 vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL)
 #endif  // ARCH_X86_64
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
-                vpx_h_predictor_32x32_ssse3, vpx_d45_predictor_32x32_ssse3,
-                NULL, NULL, vpx_d153_predictor_32x32_ssse3,
-                vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3,
-                NULL)
+                NULL, vpx_d45_predictor_32x32_ssse3, NULL, NULL,
+                vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
+                vpx_d63_predictor_32x32_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
 
 #if HAVE_NEON
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index 290bdc7..cbc667e 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -372,7 +372,10 @@
     ::testing::Values(
         make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
         make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
+        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon),
+        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_neon),
+        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_neon),
+        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_neon)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON, IntProRowTest, ::testing::Values(
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index be59de3..e7d3fa5 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -119,7 +119,7 @@
 %if ABI_IS_32BIT
     %if CONFIG_PIC=1
         %ifidn __OUTPUT_FORMAT__,elf32
-            %define GET_GOT_SAVE_ARG 1
+            %define GET_GOT_DEFINED 1
             %define WRT_PLT wrt ..plt
             %macro GET_GOT 1
                 extern _GLOBAL_OFFSET_TABLE_
@@ -138,7 +138,7 @@
                 %define RESTORE_GOT pop %1
             %endmacro
         %elifidn __OUTPUT_FORMAT__,macho32
-            %define GET_GOT_SAVE_ARG 1
+            %define GET_GOT_DEFINED 1
             %macro GET_GOT 1
                 push %1
                 call %%get_got
@@ -149,6 +149,8 @@
                 %undef RESTORE_GOT
                 %define RESTORE_GOT pop %1
             %endmacro
+        %else
+            %define GET_GOT_DEFINED 0
         %endif
     %endif
 
diff --git a/tools/gen_authors.sh b/tools/gen_authors.sh
index e1246f0..4cfd81e 100755
--- a/tools/gen_authors.sh
+++ b/tools/gen_authors.sh
@@ -6,7 +6,7 @@
 # This file is automatically generated from the git commit history
 # by tools/gen_authors.sh.
 
-$(git log --pretty=format:"%aN <%aE>" | sort | uniq)
+$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v corp.google)
 Google Inc.
 The Mozilla Foundation
 The Xiph.Org Foundation
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index b89d791..2f10378 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -63,20 +63,6 @@
 #define MAX_REF_FRAMES  4
 typedef int8_t MV_REFERENCE_FRAME;
 
-typedef struct {
-  // Number of base colors for Y (0) and UV (1)
-  uint8_t palette_size[2];
-  // Value of base colors for Y, U, and V
-#if CONFIG_VP9_HIGHBITDEPTH
-  uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
-#else
-  uint8_t palette_colors[3 * PALETTE_MAX_SIZE];
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-  // Only used by encoder to store the color index of the top left pixel.
-  // TODO(huisu): move this to encoder
-  uint8_t palette_first_color_idx[2];
-} PALETTE_MODE_INFO;
-
 // This structure now relates to 8x8 block regions.
 typedef struct {
   // Common for both INTER and INTRA blocks
@@ -92,7 +78,6 @@
 
   // Only for INTRA blocks
   PREDICTION_MODE uv_mode;
-  PALETTE_MODE_INFO palette_mode_info;
 
   // Only for INTER blocks
   INTERP_FILTER interp_filter;
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index 8098305..2bb292a 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -279,442 +279,6 @@
     { 66  } }
 };
 
-const vpx_tree_index vp10_palette_size_tree[TREE_SIZE(PALETTE_SIZES)] = {
-    -TWO_COLORS, 2,
-    -THREE_COLORS, 4,
-    -FOUR_COLORS, 6,
-    -FIVE_COLORS, 8,
-    -SIX_COLORS, 10,
-    -SEVEN_COLORS, -EIGHT_COLORS,
-};
-
-// TODO(huisu): tune these probs
-const vpx_prob
-vp10_default_palette_y_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1] = {
-    {  96,  89, 100,  64,  77, 130},
-    {  22,  15,  44,  16,  34,  82},
-    {  30,  19,  57,  18,  38,  86},
-    {  94,  36, 104,  23,  43,  92},
-    { 116,  76, 107,  46,  65, 105},
-    { 112,  82,  94,  40,  70, 112},
-    { 147, 124, 123,  58,  69, 103},
-    { 180, 113, 136,  49,  45, 114},
-    { 107,  70,  87,  49, 154, 156},
-    {  98, 105, 142,  63,  64, 152},
-};
-
-const vpx_prob
-vp10_default_palette_uv_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1] = {
-    { 160, 196, 228, 213, 175, 230},
-    {  87, 148, 208, 141, 166, 163},
-    {  72, 151, 204, 139, 155, 161},
-    {  78, 135, 171, 104, 120, 173},
-    {  59,  92, 131,  78,  92, 142},
-    {  75, 118, 149,  84,  90, 128},
-    {  89,  87,  92,  66,  66, 128},
-    {  67,  53,  54,  55,  66,  93},
-    { 120, 130,  83, 171,  75, 214},
-    {  72,  55,  66,  68,  79, 107},
-};
-
-const vpx_prob
-vp10_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS]
-                                                      = {
-    { 240,  180,  100, },
-    { 240,  180,  100, },
-    { 240,  180,  100, },
-    { 240,  180,  100, },
-    { 240,  180,  100, },
-    { 240,  180,  100, },
-    { 240,  180,  100, },
-    { 240,  180,  100, },
-    { 240,  180,  100, },
-    { 240,  180,  100, },
-};
-
-
-const vpx_prob default_uv_palette_mode_prob[2] = {
-    253, 229
-};
-
-const vpx_tree_index
-vp10_palette_color_tree[PALETTE_MAX_SIZE - 1][TREE_SIZE(PALETTE_COLORS)] = {
-    {  // 2 colors
-        -PALETTE_COLOR_ONE, -PALETTE_COLOR_TWO,
-    },
-    {  // 3 colors
-        -PALETTE_COLOR_ONE, 2,
-        -PALETTE_COLOR_TWO, -PALETTE_COLOR_THREE,
-    },
-    {  // 4 colors
-        -PALETTE_COLOR_ONE, 2,
-        -PALETTE_COLOR_TWO, 4,
-        -PALETTE_COLOR_THREE, -PALETTE_COLOR_FOUR,
-    },
-    {  // 5 colors
-        -PALETTE_COLOR_ONE, 2,
-        -PALETTE_COLOR_TWO, 4,
-        -PALETTE_COLOR_THREE, 6,
-        -PALETTE_COLOR_FOUR, -PALETTE_COLOR_FIVE,
-    },
-    {  // 6 colors
-        -PALETTE_COLOR_ONE, 2,
-        -PALETTE_COLOR_TWO, 4,
-        -PALETTE_COLOR_THREE, 6,
-        -PALETTE_COLOR_FOUR, 8,
-        -PALETTE_COLOR_FIVE, -PALETTE_COLOR_SIX,
-    },
-    {  // 7 colors
-        -PALETTE_COLOR_ONE, 2,
-        -PALETTE_COLOR_TWO, 4,
-        -PALETTE_COLOR_THREE, 6,
-        -PALETTE_COLOR_FOUR, 8,
-        -PALETTE_COLOR_FIVE, 10,
-        -PALETTE_COLOR_SIX, -PALETTE_COLOR_SEVEN,
-    },
-    {  // 8 colors
-        -PALETTE_COLOR_ONE, 2,
-        -PALETTE_COLOR_TWO, 4,
-        -PALETTE_COLOR_THREE, 6,
-        -PALETTE_COLOR_FOUR, 8,
-        -PALETTE_COLOR_FIVE, 10,
-        -PALETTE_COLOR_SIX, 12,
-        -PALETTE_COLOR_SEVEN, -PALETTE_COLOR_EIGHT,
-    },
-};
-
-const vpx_prob vp10_default_palette_y_color_prob
-[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
-    {  // 2 colors
-        { 230, 255, 128, 128, 128, 128, 128 },
-        { 214, 255, 128, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        { 240, 255, 128, 128, 128, 128, 128 },
-        {  73, 255, 128, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        { 130, 255, 128, 128, 128, 128, 128 },
-        { 227, 255, 128, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        { 188, 255, 128, 128, 128, 128, 128 },
-        {  75, 255, 128, 128, 128, 128, 128 },
-        { 250, 255, 128, 128, 128, 128, 128 },
-        { 223, 255, 128, 128, 128, 128, 128 },
-        { 252, 255, 128, 128, 128, 128, 128 },
-    }, {  // 3 colors
-        { 229, 137, 255, 128, 128, 128, 128 },
-        { 197, 120, 255, 128, 128, 128, 128 },
-        { 107, 195, 255, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        {  27, 151, 255, 128, 128, 128, 128 },
-        { 230, 130, 255, 128, 128, 128, 128 },
-        {  37, 230, 255, 128, 128, 128, 128 },
-        {  67, 221, 255, 128, 128, 128, 128 },
-        { 124, 230, 255, 128, 128, 128, 128 },
-        { 195, 109, 255, 128, 128, 128, 128 },
-        {  99, 122, 255, 128, 128, 128, 128 },
-        { 205, 208, 255, 128, 128, 128, 128 },
-        {  40, 235, 255, 128, 128, 128, 128 },
-        { 251, 132, 255, 128, 128, 128, 128 },
-        { 237, 186, 255, 128, 128, 128, 128 },
-        { 253, 112, 255, 128, 128, 128, 128 },
-    }, {  // 4 colors
-        { 195,  87, 128, 255, 128, 128, 128 },
-        { 143, 100, 123, 255, 128, 128, 128 },
-        {  94, 124, 119, 255, 128, 128, 128 },
-        {  77,  91, 130, 255, 128, 128, 128 },
-        {  39, 114, 178, 255, 128, 128, 128 },
-        { 222,  94, 125, 255, 128, 128, 128 },
-        {  44, 203, 132, 255, 128, 128, 128 },
-        {  68, 175, 122, 255, 128, 128, 128 },
-        { 110, 187, 124, 255, 128, 128, 128 },
-        { 152,  91, 128, 255, 128, 128, 128 },
-        {  70, 109, 181, 255, 128, 128, 128 },
-        { 133, 113, 164, 255, 128, 128, 128 },
-        {  47, 205, 133, 255, 128, 128, 128 },
-        { 247,  94, 136, 255, 128, 128, 128 },
-        { 205, 122, 146, 255, 128, 128, 128 },
-        { 251, 100, 141, 255, 128, 128, 128 },
-    }, {  // 5 colors
-        { 195,  65,  84, 125, 255, 128, 128 },
-        { 150,  76,  84, 121, 255, 128, 128 },
-        {  94, 110,  81, 117, 255, 128, 128 },
-        {  79,  85,  91, 139, 255, 128, 128 },
-        {  26, 102, 139, 127, 255, 128, 128 },
-        { 220,  73,  91, 119, 255, 128, 128 },
-        {  38, 203,  86, 127, 255, 128, 128 },
-        {  61, 186,  72, 124, 255, 128, 128 },
-        { 132, 199,  84, 128, 255, 128, 128 },
-        { 172,  52,  62, 120, 255, 128, 128 },
-        { 102,  89, 121, 122, 255, 128, 128 },
-        { 182,  48,  69, 186, 255, 128, 128 },
-        {  36, 206,  87, 126, 255, 128, 128 },
-        { 249,  55,  67, 122, 255, 128, 128 },
-        { 218,  88,  75, 122, 255, 128, 128 },
-        { 253,  64,  80, 119, 255, 128, 128 },
-    }, {  // 6 colors
-        { 182,  54,  64,  75, 118, 255, 128 },
-        { 126,  67,  70,  76, 116, 255, 128 },
-        {  79,  92,  67,  85, 120, 255, 128 },
-        {  63,  61,  81, 118, 132, 255, 128 },
-        {  21,  80, 105,  83, 119, 255, 128 },
-        { 215,  72,  74,  74, 111, 255, 128 },
-        {  50, 176,  63,  79, 120, 255, 128 },
-        {  72, 148,  66,  77, 120, 255, 128 },
-        { 105, 177,  57,  78, 130, 255, 128 },
-        { 150,  66,  66,  80, 127, 255, 128 },
-        {  81,  76, 109,  85, 116, 255, 128 },
-        { 113,  81,  62,  96, 148, 255, 128 },
-        {  54, 179,  69,  82, 121, 255, 128 },
-        { 244,  47,  48,  67, 118, 255, 128 },
-        { 198,  83,  53,  65, 121, 255, 128 },
-        { 250,  42,  51,  69, 110, 255, 128 },
-    }, {  // 7 colors
-        { 182,  45,  54,  62,  74, 113, 255 },
-        { 124,  63,  57,  62,  77, 114, 255 },
-        {  77,  80,  56,  66,  76, 117, 255 },
-        {  63,  57,  69,  98,  85, 131, 255 },
-        {  19,  81,  98,  63,  80, 116, 255 },
-        { 215,  56,  60,  63,  68, 105, 255 },
-        {  50, 174,  50,  60,  79, 118, 255 },
-        {  68, 151,  50,  58,  73, 117, 255 },
-        { 104, 182,  53,  57,  79, 127, 255 },
-        { 156,  50,  51,  63,  77, 111, 255 },
-        {  88,  67,  97,  59,  82, 120, 255 },
-        { 114,  81,  46,  65, 103, 132, 255 },
-        {  55, 166,  57,  66,  82, 120, 255 },
-        { 245,  34,  38,  43,  63, 114, 255 },
-        { 203,  68,  45,  47,  60, 118, 255 },
-        { 250,  35,  37,  47,  66, 110, 255 },
-    }, {  // 8 colors
-        { 180,  43,  46,  50,  56,  69, 109 },
-        { 116,  53,  51,  49,  57,  73, 115 },
-        {  79,  70,  49,  50,  59,  74, 117 },
-        {  60,  54,  57,  70,  62,  83, 129 },
-        {  20,  73,  85,  52,  66,  81, 119 },
-        { 213,  56,  52,  49,  53,  62, 104 },
-        {  48, 161,  41,  45,  56,  77, 116 },
-        {  68, 139,  40,  47,  54,  71, 116 },
-        { 123, 166,  42,  43,  52,  76, 130 },
-        { 153,  44,  44,  47,  54,  79, 129 },
-        {  87,  64,  83,  49,  60,  75, 127 },
-        { 131,  68,  43,  48,  73,  96, 130 },
-        {  55, 152,  45,  51,  64,  77, 113 },
-        { 243,  30,  28,  33,  41,  65, 114 },
-        { 202,  56,  35,  36,  42,  63, 123 },
-        { 249,  31,  29,  32,  45,  68, 111 },
-    }
-};
-
-const vpx_prob vp10_default_palette_uv_color_prob
-[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
-    {  // 2 colors
-        { 228, 255, 128, 128, 128, 128, 128 },
-        { 195, 255, 128, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        { 228, 255, 128, 128, 128, 128, 128 },
-        {  71, 255, 128, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        { 129, 255, 128, 128, 128, 128, 128 },
-        { 206, 255, 128, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        { 136, 255, 128, 128, 128, 128, 128 },
-        {  98, 255, 128, 128, 128, 128, 128 },
-        { 236, 255, 128, 128, 128, 128, 128 },
-        { 222, 255, 128, 128, 128, 128, 128 },
-        { 249, 255, 128, 128, 128, 128, 128 },
-    }, {  // 3 colors
-        { 198, 136, 255, 128, 128, 128, 128 },
-        { 178, 105, 255, 128, 128, 128, 128 },
-        { 100, 206, 255, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128 },
-        {  12, 136, 255, 128, 128, 128, 128 },
-        { 219, 134, 255, 128, 128, 128, 128 },
-        {  50, 198, 255, 128, 128, 128, 128 },
-        {  61, 231, 255, 128, 128, 128, 128 },
-        { 110, 209, 255, 128, 128, 128, 128 },
-        { 173, 106, 255, 128, 128, 128, 128 },
-        { 145, 166, 255, 128, 128, 128, 128 },
-        { 156, 175, 255, 128, 128, 128, 128 },
-        {  69, 183, 255, 128, 128, 128, 128 },
-        { 241, 163, 255, 128, 128, 128, 128 },
-        { 224, 160, 255, 128, 128, 128, 128 },
-        { 246, 154, 255, 128, 128, 128, 128 },
-    }, {  // 4 colors
-        { 173,  88, 143, 255, 128, 128, 128 },
-        { 146,  81, 127, 255, 128, 128, 128 },
-        {  84, 134, 102, 255, 128, 128, 128 },
-        {  69, 138, 140, 255, 128, 128, 128 },
-        {  31, 103, 200, 255, 128, 128, 128 },
-        { 217, 101, 139, 255, 128, 128, 128 },
-        {  51, 174, 121, 255, 128, 128, 128 },
-        {  64, 177, 109, 255, 128, 128, 128 },
-        {  96, 179, 145, 255, 128, 128, 128 },
-        { 164,  77, 114, 255, 128, 128, 128 },
-        {  87,  94, 156, 255, 128, 128, 128 },
-        { 105,  57, 173, 255, 128, 128, 128 },
-        {  63, 158, 137, 255, 128, 128, 128 },
-        { 236, 102, 156, 255, 128, 128, 128 },
-        { 197, 115, 153, 255, 128, 128, 128 },
-        { 245, 106, 154, 255, 128, 128, 128 },
-    }, {  // 5 colors
-        { 179,  64,  97, 129, 255, 128, 128 },
-        { 137,  56,  88, 125, 255, 128, 128 },
-        {  82, 107,  61, 118, 255, 128, 128 },
-        {  59, 113,  86, 115, 255, 128, 128 },
-        {  23,  88, 118, 130, 255, 128, 128 },
-        { 213,  66,  90, 125, 255, 128, 128 },
-        {  37, 181, 103, 121, 255, 128, 128 },
-        {  47, 188,  61, 131, 255, 128, 128 },
-        { 104, 185, 103, 144, 255, 128, 128 },
-        { 163,  39,  76, 112, 255, 128, 128 },
-        {  94,  74, 131, 126, 255, 128, 128 },
-        { 142,  42, 103, 163, 255, 128, 128 },
-        {  53, 162,  99, 149, 255, 128, 128 },
-        { 239,  54,  84, 108, 255, 128, 128 },
-        { 203,  84, 110, 147, 255, 128, 128 },
-        { 248,  70, 105, 151, 255, 128, 128 },
-    }, {  // 6 colors
-        { 189,  50,  67,  90, 130, 255, 128 },
-        { 114,  50,  55,  90, 123, 255, 128 },
-        {  66,  76,  54,  82, 128, 255, 128 },
-        {  43,  69,  69,  80, 129, 255, 128 },
-        {  22,  59,  87,  88, 141, 255, 128 },
-        { 203,  49,  68,  87, 122, 255, 128 },
-        {  43, 157,  74, 104, 146, 255, 128 },
-        {  54, 138,  51,  95, 138, 255, 128 },
-        {  82, 171,  58, 102, 146, 255, 128 },
-        { 129,  38,  59,  64, 168, 255, 128 },
-        {  56,  67, 119,  92, 112, 255, 128 },
-        {  96,  62,  53, 132,  82, 255, 128 },
-        {  60, 147,  77, 108, 145, 255, 128 },
-        { 238,  76,  73,  93, 148, 255, 128 },
-        { 189,  86,  73, 103, 157, 255, 128 },
-        { 246,  62,  75,  83, 167, 255, 128 },
-    }, {  // 7 colors
-        { 179,  42,  51,  73,  99, 134, 255 },
-        { 119,  52,  52,  61,  64, 114, 255 },
-        {  53,  77,  35,  65,  71, 131, 255 },
-        {  38,  70,  51,  68,  89, 144, 255 },
-        {  23,  65, 128,  73,  97, 131, 255 },
-        { 210,  47,  52,  63,  81, 143, 255 },
-        {  42, 159,  57,  68,  98, 143, 255 },
-        {  49, 153,  45,  82,  93, 143, 255 },
-        {  81, 169,  52,  72, 113, 151, 255 },
-        { 136,  46,  35,  56,  75,  96, 255 },
-        {  57,  84, 109,  47, 107, 131, 255 },
-        { 128,  78,  57,  36, 128,  85, 255 },
-        {  54, 149,  68,  77,  94, 153, 255 },
-        { 243,  58,  50,  71,  81, 167, 255 },
-        { 189,  92,  64,  70, 121, 173, 255 },
-        { 248,  35,  38,  51,  82, 201, 255 },
-    }, {  // 8 colors
-        { 201,  40,  36,  42,  64,  92, 123 },
-        { 116,  43,  33,  43,  73, 102, 128 },
-        {  46,  77,  37,  69,  62,  78, 150 },
-        {  40,  65,  52,  50,  76,  89, 133 },
-        {  28,  48,  91,  17,  64,  77, 133 },
-        { 218,  43,  43,  37,  56,  72, 163 },
-        {  41, 155,  44,  83,  82, 129, 180 },
-        {  44, 141,  29,  55,  64,  89, 147 },
-        {  92, 166,  48,  45,  59, 126, 179 },
-        { 169,  35,  49,  41,  36,  99, 139 },
-        {  55,  77,  77,  56,  60,  75, 156 },
-        { 155,  81,  51,  64,  57, 182, 255 },
-        {  60, 134,  49,  49,  93, 128, 174 },
-        { 244,  98,  51,  46,  22,  73, 238 },
-        { 189,  70,  40,  87,  93,  79, 201 },
-        { 248,  54,  49,  40,  29,  42, 227 },
-    }
-};
-
-static const int palette_color_context_lookup[PALETTE_COLOR_CONTEXTS] = {
-    // (3, 0, 0, 0), (3, 2, 0, 0), (3, 3, 2, 0), (3, 3, 2, 2),
-    3993,  4235,  4378,  4380,
-    // (4, 3, 3, 0), (5, 0, 0, 0), (5, 3, 0, 0), (5, 3, 2, 0),
-    5720,  6655,  7018,  7040,
-    // (5, 5, 0, 0), (6, 2, 0, 0), (6, 2, 2, 0), (6, 4, 0, 0),
-    7260,  8228,  8250,  8470,
-    // (7, 3, 0, 0), (8, 0, 0, 0), (8, 2, 0, 0), (10, 0, 0, 0)
-    9680, 10648, 10890, 13310
-};
-
-int vp10_get_palette_color_context(const uint8_t *color_map, int cols,
-                                   int r, int c, int n, int *color_order) {
-  int i, j, max, max_idx, temp;
-  int scores[PALETTE_MAX_SIZE + 10];
-  int weights[4] = {3, 2, 3, 2};
-  int color_ctx = 0;
-  int color_neighbors[4];
-
-  assert(n <= PALETTE_MAX_SIZE);
-
-  if (c - 1 >= 0)
-    color_neighbors[0] = color_map[r * cols + c - 1];
-  else
-    color_neighbors[0] = -1;
-  if (c - 1 >= 0 && r - 1 >= 0)
-    color_neighbors[1] = color_map[(r - 1) * cols + c - 1];
-  else
-    color_neighbors[1] = -1;
-  if (r - 1 >= 0)
-    color_neighbors[2] = color_map[(r - 1) * cols + c];
-  else
-    color_neighbors[2] = -1;
-  if (r - 1 >= 0 && c + 1 <= cols - 1)
-    color_neighbors[3] = color_map[(r - 1) * cols + c + 1];
-  else
-    color_neighbors[3] = -1;
-
-  for (i = 0; i < PALETTE_MAX_SIZE; ++i)
-    color_order[i] = i;
-  memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
-  for (i = 0; i < 4; ++i) {
-    if (color_neighbors[i] >= 0)
-      scores[color_neighbors[i]] += weights[i];
-  }
-
-  for (i = 0; i < 4; ++i) {
-    max = scores[i];
-    max_idx = i;
-    j = i + 1;
-    while (j < n) {
-      if (scores[j] > max) {
-        max = scores[j];
-        max_idx = j;
-      }
-      ++j;
-    }
-
-    if (max_idx != i) {
-      temp = scores[i];
-      scores[i] = scores[max_idx];
-      scores[max_idx] = temp;
-
-      temp = color_order[i];
-      color_order[i] = color_order[max_idx];
-      color_order[max_idx] = temp;
-    }
-  }
-
-  for (i = 0; i < 4; ++i)
-    color_ctx = color_ctx * 11 + scores[i];
-
-  for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i)
-    if (color_ctx == palette_color_context_lookup[i]) {
-      color_ctx = i;
-      break;
-    }
-
-  if (color_ctx >= PALETTE_COLOR_CONTEXTS)
-    color_ctx = 0;
-
-  return color_ctx;
-}
-
 void vp10_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
                                       unsigned int (*ct_32x32p)[2]) {
   ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h
index c9b667b..42fd920 100644
--- a/vp10/common/entropymode.h
+++ b/vp10/common/entropymode.h
@@ -27,12 +27,6 @@
 
 #define INTER_OFFSET(mode) ((mode) - NEARESTMV)
 
-#define PALETTE_COLOR_CONTEXTS 16
-#define PALETTE_MAX_SIZE 8
-#define PALETTE_BLOCK_SIZES (BLOCK_64X64 - BLOCK_8X8 + 1)
-#define PALETTE_Y_MODE_CONTEXTS 3
-
-
 struct VP10Common;
 
 struct tx_probs {
@@ -105,25 +99,12 @@
 extern const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS]
                                             [PARTITION_TYPES - 1];
 #endif
-extern const vpx_prob
-vp10_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS];
-extern const vpx_prob
-vp10_default_palette_y_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1];
-extern const vpx_prob
-vp10_default_palette_uv_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1];
-extern const vpx_prob vp10_default_palette_y_color_prob
-[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1];
-extern const vpx_prob vp10_default_palette_uv_color_prob
-[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1];
 
 extern const vpx_tree_index vp10_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
 extern const vpx_tree_index vp10_inter_mode_tree[TREE_SIZE(INTER_MODES)];
 extern const vpx_tree_index vp10_partition_tree[TREE_SIZE(PARTITION_TYPES)];
 extern const vpx_tree_index vp10_switchable_interp_tree
                                 [TREE_SIZE(SWITCHABLE_FILTERS)];
-extern const vpx_tree_index vp10_palette_size_tree[TREE_SIZE(PALETTE_SIZES)];
-extern const vpx_tree_index
-vp10_palette_color_tree[PALETTE_MAX_SIZE - 1][TREE_SIZE(PALETTE_COLORS)];
 
 
 void vp10_setup_past_independence(struct VP10Common *cm);
@@ -147,9 +128,6 @@
   return i;
 }
 
-int vp10_get_palette_color_context(const uint8_t *color_map, int cols,
-                                   int r, int c, int n, int *color_order);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index 5f67e30..a226a2d 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -109,29 +109,6 @@
   PLANE_TYPES
 } PLANE_TYPE;
 
-typedef enum {
-  TWO_COLORS,
-  THREE_COLORS,
-  FOUR_COLORS,
-  FIVE_COLORS,
-  SIX_COLORS,
-  SEVEN_COLORS,
-  EIGHT_COLORS,
-  PALETTE_SIZES
-} PALETTE_SIZE;
-
-typedef enum {
-  PALETTE_COLOR_ONE,
-  PALETTE_COLOR_TWO,
-  PALETTE_COLOR_THREE,
-  PALETTE_COLOR_FOUR,
-  PALETTE_COLOR_FIVE,
-  PALETTE_COLOR_SIX,
-  PALETTE_COLOR_SEVEN,
-  PALETTE_COLOR_EIGHT,
-  PALETTE_COLORS
-} PALETTE_COLOR;
-
 #define DC_PRED    0       // Average of above and left pixels
 #define V_PRED     1       // Vertical
 #define H_PRED     2       // Horizontal
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index 6814133..ffef733 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -185,8 +185,6 @@
 
   int allow_high_precision_mv;
 
-  int allow_screen_content_tools;
-
   // Flag signaling which frame contexts should be reset to default values.
   RESET_FRAME_CONTEXT_MODE reset_frame_context;
 
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index 14af7eb..e9e3949 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -262,7 +262,7 @@
 }
 
 #if CONFIG_MISC_FIXES
-static inline void memset16(uint16_t *dst, int val, int n) {
+static INLINE void memset16(uint16_t *dst, int val, int n) {
   while (n--)
     *dst++ = val;
 }
@@ -753,40 +753,6 @@
   const int have_right = (aoff + txw) < bw;
 #endif  // CONFIG_MISC_FIXES
 
-  if (xd->mi[0]->mbmi.palette_mode_info.palette_size[plane != 0] > 0) {
-    const int bs = 4 * (1 << tx_size);
-    const int stride = 4 * (1 << bwl_in);
-    int r, c;
-    uint8_t *map = NULL;
-#if CONFIG_VP9_HIGHBITDEPTH
-    uint16_t *palette = xd->mi[0]->mbmi.palette_mode_info.palette_colors +
-        plane * PALETTE_MAX_SIZE;
-#else
-    uint8_t *palette = xd->mi[0]->mbmi.palette_mode_info.palette_colors +
-        plane * PALETTE_MAX_SIZE;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-    map = xd->plane[plane != 0].color_index_map;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-      for (r = 0; r < bs; ++r)
-        for (c = 0; c < bs; ++c)
-          dst16[r * dst_stride + c] =
-              palette[map[(r + y) * stride + c + x]];
-    } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      for (r = 0; r < bs; ++r)
-        for (c = 0; c < bs; ++c)
-          dst[r * dst_stride + c] = palette[map[(r + y) * stride + c + x]];
-#if CONFIG_VP9_HIGHBITDEPTH
-    }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-    return;
-  }
-
 #if CONFIG_MISC_FIXES
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 5c95e16..70d012b 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -851,9 +851,6 @@
       const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
           0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
-      if (plane <= 1 && mbmi->palette_mode_info.palette_size[plane])
-          vp10_decode_palette_tokens(xd, plane, r);
-
       for (row = 0; row < max_blocks_high; row += step)
         for (col = 0; col < max_blocks_wide; col += step)
           predict_and_reconstruct_intra_block(xd, r, mbmi, plane,
@@ -1155,16 +1152,12 @@
   cm->uv_dc_delta_q = read_delta_q(rb);
   cm->uv_ac_delta_q = read_delta_q(rb);
   cm->dequant_bit_depth = cm->bit_depth;
-  for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
-#if CONFIG_MISC_FIXES
-    const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
-#endif
-    xd->lossless[i] = cm->y_dc_delta_q == 0 &&
-#if CONFIG_MISC_FIXES
-                      qindex == 0 &&
-#else
-                      cm->base_qindex == 0 &&
-#endif
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+                       vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+                       cm->base_qindex;
+    xd->lossless[i] = qindex == 0 &&
+                      cm->y_dc_delta_q == 0 &&
                       cm->uv_dc_delta_q == 0 &&
                       cm->uv_ac_delta_q == 0;
   }
@@ -1954,8 +1947,6 @@
       memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
       pbi->need_resync = 0;
     }
-    if (frame_is_intra_only(cm))
-      cm->allow_screen_content_tools = vpx_rb_read_bit(rb);
   } else {
     cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb);
 
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index 38ea073..b516333 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -246,38 +246,6 @@
   }
 }
 
-static void read_palette_mode_info(VP10_COMMON *const cm,
-                                   MACROBLOCKD *const xd,
-                                   vpx_reader *r) {
-  MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const MODE_INFO *above_mi = xd->above_mi;
-  const MODE_INFO *left_mi  = xd->left_mi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  int i, palette_ctx = 0;
-
-  if (above_mi)
-    palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  if (left_mi)
-    palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  if (vpx_read(r, vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
-                                                   [palette_ctx])) {
-    int n;
-    PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-
-    pmi->palette_size[0] =
-        vpx_read_tree(r, vp10_palette_size_tree,
-                      vp10_default_palette_y_size_prob[bsize - BLOCK_8X8]) + 2;
-    n = pmi->palette_size[0];
-
-    for (i = 0; i < n; ++i)
-      pmi->palette_colors[i] = vpx_read_literal(r, cm->bit_depth);
-
-    xd->plane[0].color_index_map[0] = read_uniform(r, n);
-    assert(xd->plane[0].color_index_map[0] < n);
-  }
-}
-
 static void read_intra_frame_mode_info(VP10_COMMON *const cm,
                                        MACROBLOCKD *const xd,
                                        int mi_row, int mi_col, vpx_reader *r) {
@@ -326,12 +294,6 @@
   }
 
   mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
-
-  mbmi->palette_mode_info.palette_size[0] = 0;
-  mbmi->palette_mode_info.palette_size[1] = 0;
-  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools &&
-      mbmi->mode == DC_PRED)
-    read_palette_mode_info(cm, xd, r);
 }
 
 static int read_mv_component(vpx_reader *r,
@@ -495,9 +457,6 @@
   }
 
   mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
-
-  mbmi->palette_mode_info.palette_size[0] = 0;
-  mbmi->palette_mode_info.palette_size[1] = 0;
 }
 
 static INLINE int is_mv_valid(const MV *mv) {
diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index cda9888..d39e3dc 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c
@@ -257,33 +257,6 @@
   }
 }
 
-void vp10_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
-                                vpx_reader *r) {
-  MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
-  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
-  int color_idx, color_ctx, color_order[PALETTE_MAX_SIZE];
-  int n = mbmi->palette_mode_info.palette_size[plane != 0];
-  int i, j;
-  uint8_t *color_map = xd->plane[plane].color_index_map;
-  const vpx_prob (* prob)[PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] =
-      plane ? vp10_default_palette_uv_color_prob :
-          vp10_default_palette_y_color_prob;
-
-  for (i = 0; i < rows; ++i) {
-    for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-      color_ctx = vp10_get_palette_color_context(color_map, cols, i, j, n,
-                                                 color_order);
-      color_idx = vpx_read_tree(r, vp10_palette_color_tree[n - 2],
-                                prob[n - 2][color_ctx]);
-      assert(color_idx >= 0 && color_idx < n);
-      color_map[i * cols + j] = color_order[color_idx];
-    }
-  }
-}
-
 int vp10_decode_block_tokens(MACROBLOCKD *xd,
                             int plane, const scan_order *sc,
                             int x, int y,
diff --git a/vp10/decoder/detokenize.h b/vp10/decoder/detokenize.h
index d2677f6..c3fd90a 100644
--- a/vp10/decoder/detokenize.h
+++ b/vp10/decoder/detokenize.h
@@ -20,8 +20,6 @@
 extern "C" {
 #endif
 
-void vp10_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
-                                vpx_reader *r);
 int vp10_decode_block_tokens(MACROBLOCKD *xd,
                             int plane, const scan_order *sc,
                             int x, int y,
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index d9b4be4..5f6d9d3 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -44,20 +44,6 @@
   {{0, 1}, {2, 2}, {6, 3}, {7, 3}};
 static const struct vp10_token inter_mode_encodings[INTER_MODES] =
   {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
-static const struct vp10_token palette_size_encodings[] = {
-    {0, 1}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {63, 6},
-};
-static const struct vp10_token
-palette_color_encodings[PALETTE_MAX_SIZE - 1][PALETTE_MAX_SIZE] = {
-    {{0, 1}, {1, 1}},  // 2 colors
-    {{0, 1}, {2, 2}, {3, 2}},  // 3 colors
-    {{0, 1}, {2, 2}, {6, 3}, {7, 3}},  // 4 colors
-    {{0, 1}, {2, 2}, {6, 3}, {14, 4}, {15, 4}},  // 5 colors
-    {{0, 1}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {31, 5}},  // 6 colors
-    {{0, 1}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {63, 6}},  // 7 colors
-    {{0, 1}, {2, 2}, {6, 3}, {14, 4},
-        {30, 5}, {62, 6}, {126, 7}, {127, 7}},  // 8 colors
-};
 
 static INLINE void write_uniform(vpx_writer *w, int n, int v) {
   int l = get_unsigned_bits(n);
@@ -147,22 +133,6 @@
                      counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
-static void pack_palette_tokens(vpx_writer *w, TOKENEXTRA **tp,
-                                BLOCK_SIZE bsize, int n) {
-  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
-  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
-  int i;
-  TOKENEXTRA *p = *tp;
-
-  for (i = 0; i < rows * cols -1; ++i) {
-    vp10_write_token(w, vp10_palette_color_tree[n - 2], p->context_tree,
-                     &palette_color_encodings[n - 2][p->token]);
-    ++p;
-  }
-
-  *tp = p;
-}
-
 static void pack_mb_tokens(vpx_writer *w,
                            TOKENEXTRA **tp, const TOKENEXTRA *const stop,
                            vpx_bit_depth_t bit_depth, const TX_SIZE tx) {
@@ -402,36 +372,6 @@
   }
 }
 
-static void write_palette_mode_info(const VP10_COMMON *cm,
-                                    const MACROBLOCKD *xd,
-                                    const MODE_INFO *const mi,
-                                    vpx_writer *w) {
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const MODE_INFO *const above_mi = xd->above_mi;
-  const MODE_INFO *const left_mi = xd->left_mi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-  int palette_ctx = 0;
-  int n, i;
-
-  n = pmi->palette_size[0];
-  if (above_mi)
-    palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  if (left_mi)
-    palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  vpx_write(w, n > 0,
-            vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx]);
-  if (n > 0) {
-    vp10_write_token(w, vp10_palette_size_tree,
-                     vp10_default_palette_y_size_prob[bsize - BLOCK_8X8],
-                     &palette_size_encodings[n - 2]);
-    for (i = 0; i < n; ++i)
-      vpx_write_literal(w, pmi->palette_colors[i],
-                        cm->bit_depth);
-    write_uniform(w, n, pmi->palette_first_color_idx[0]);
-  }
-}
-
 static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
                               MODE_INFO **mi_8x8, vpx_writer *w) {
   const struct segmentation *const seg = &cm->seg;
@@ -472,10 +412,6 @@
   }
 
   write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mbmi->mode]);
-
-  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools &&
-      mbmi->mode == DC_PRED)
-    write_palette_mode_info(cm, xd, mi, w);
 }
 
 static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile,
@@ -502,13 +438,6 @@
     pack_inter_mode_mvs(cpi, m, w);
   }
 
-  if (m->mbmi.palette_mode_info.palette_size[0] > 0) {
-    assert(*tok < tok_end);
-    pack_palette_tokens(w, tok, m->mbmi.sb_type,
-                        m->mbmi.palette_mode_info.palette_size[0]);
-    assert(*tok < tok_end);
-  }
-
   if (!m->mbmi.skip) {
     assert(*tok < tok_end);
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
@@ -1259,8 +1188,6 @@
     write_sync_code(wb);
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
-    if (frame_is_intra_only(cm))
-      vpx_wb_write_bit(wb, cm->allow_screen_content_tools);
   } else {
     if (!cm->show_frame)
       vpx_wb_write_bit(wb, cm->intra_only);
@@ -1546,7 +1473,7 @@
     assert(n_log2_tiles > 0);
     vpx_wb_write_literal(&saved_wb, mag, 2);
     if (mag < 3)
-      data_sz = remux_tiles(data, data_sz, 1 << n_log2_tiles, mag);
+      data_sz = remux_tiles(data, (int)data_sz, 1 << n_log2_tiles, mag);
   } else {
     assert(n_log2_tiles == 0);
   }
diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index df3830c..ab0252b 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -52,13 +52,6 @@
   uint8_t mode_context[MAX_REF_FRAMES];
 } MB_MODE_INFO_EXT;
 
-typedef struct {
-  uint8_t best_palette_color_map[4096];
-  double kmeans_data_buf[4096];
-  uint8_t kmeans_indices_buf[4096];
-  uint8_t kmeans_pre_indices_buf[4096];
-} PALETTE_BUFFER;
-
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
   struct macroblock_plane plane[MAX_MB_PLANE];
@@ -77,6 +70,8 @@
   int rddiv;
   int rdmult;
   int mb_energy;
+  int * m_search_count_ptr;
+  int * ex_search_count_ptr;
 
   // These are set to their default values at the beginning, and then adjusted
   // further in the encoding process.
@@ -99,8 +94,6 @@
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
-  PALETTE_BUFFER *palette_buffer;
-
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
   int mv_col_min;
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index c61babe..132a141 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -22,7 +22,10 @@
 
 static INLINE void range_check(const tran_low_t *input, const int size,
                                const int bit) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if 0  // CONFIG_COEFFICIENT_RANGE_CHECKING
+// TODO(angiebird): the range_check is not used because the bit range
+// in fdct# is not correct. Since we are going to merge in a new version
+// of fdct# from nextgenv2, we won't fix the incorrect bit range now.
   int i;
   for (i = 0; i < size; ++i) {
     assert(abs(input[i]) < (1 << bit));
@@ -39,7 +42,7 @@
   tran_low_t step[4];
 
   // stage 0
-  range_check(input, 4, 11);
+  range_check(input, 4, 14);
 
   // stage 1
   output[0] = input[0] + input[3];
@@ -47,7 +50,7 @@
   output[2] = input[1] - input[2];
   output[3] = input[0] - input[3];
 
-  range_check(output, 4, 12);
+  range_check(output, 4, 15);
 
   // stage 2
   temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
@@ -59,7 +62,7 @@
   temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
   step[3] = (tran_low_t)fdct_round_shift(temp);
 
-  range_check(step, 4, 13);
+  range_check(step, 4, 16);
 
   // stage 3
   output[0] = step[0];
@@ -67,7 +70,7 @@
   output[2] = step[1];
   output[3] = step[3];
 
-  range_check(output, 4, 13);
+  range_check(output, 4, 16);
 }
 
 static void fdct8(const tran_low_t *input, tran_low_t *output) {
@@ -75,7 +78,7 @@
   tran_low_t step[8];
 
   // stage 0
-  range_check(input, 8, 12);
+  range_check(input, 8, 13);
 
   // stage 1
   output[0] = input[0] + input[7];
@@ -87,7 +90,7 @@
   output[6] = input[1] - input[6];
   output[7] = input[0] - input[7];
 
-  range_check(output, 8, 13);
+  range_check(output, 8, 14);
 
   // stage 2
   step[0] = output[0] + output[3];
@@ -101,7 +104,7 @@
   step[6] = (tran_low_t)fdct_round_shift(temp);
   step[7] = output[7];
 
-  range_check(step, 8, 14);
+  range_check(step, 8, 15);
 
   // stage 3
   temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
@@ -117,7 +120,7 @@
   output[6] = step[7] - step[6];
   output[7] = step[7] + step[6];
 
-  range_check(output, 8, 14);
+  range_check(output, 8, 16);
 
   // stage 4
   step[0] = output[0];
@@ -133,7 +136,7 @@
   temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
   step[7] = (tran_low_t)fdct_round_shift(temp);
 
-  range_check(step, 8, 14);
+  range_check(step, 8, 16);
 
   // stage 5
   output[0] = step[0];
@@ -145,7 +148,7 @@
   output[6] = step[3];
   output[7] = step[7];
 
-  range_check(output, 8, 14);
+  range_check(output, 8, 16);
 }
 
 static void fdct16(const tran_low_t *input, tran_low_t *output) {
@@ -322,7 +325,7 @@
   range_check(output, 16, 16);
 }
 
-/* #TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
+/* TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
 static void fdct32(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[32];
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index f303b01..44ca276 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -1140,15 +1140,6 @@
     p[i].eobs = ctx->eobs_pbuf[i][0];
   }
 
-  if (cm->current_video_frame == 0 && cm->allow_screen_content_tools) {
-    for (i = 0; i < 2; ++i) {
-      if (ctx->color_index_map[i] == 0) {
-        CHECK_MEM_ERROR(cm, ctx->color_index_map[i],
-                        vpx_memalign(16, (ctx->num_4x4_blk << 4) *
-                                     sizeof(*ctx->color_index_map[i])));
-      }
-    }
-  }
   for (i = 0; i < 2; ++i)
     pd[i].color_index_map = ctx->color_index_map[i];
 
@@ -1164,7 +1155,7 @@
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     x->source_variance =
         vp10_high_get_sby_perpixel_variance(cpi, &x->plane[0].src,
-                                           bsize, xd->bd);
+                                            bsize, xd->bd);
   } else {
     x->source_variance =
       vp10_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
@@ -2588,7 +2579,7 @@
 }
 
 static TX_MODE select_tx_mode(const VP10_COMP *cpi, MACROBLOCKD *const xd) {
-  if (!cpi->common.seg.enabled && xd->lossless[0])
+  if (xd->lossless[0])
     return ONLY_4X4;
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
     return ALLOW_32X32;
@@ -2651,6 +2642,10 @@
   TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
   int mi_row;
 
+  // Set up pointers to per thread motion search counters.
+  td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
+  td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
     encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
@@ -2704,17 +2699,15 @@
   vp10_zero(rdc->coef_counts);
   vp10_zero(rdc->comp_pred_diff);
   vp10_zero(rdc->filter_diff);
+  rdc->m_search_count = 0;   // Count of motion search hits.
+  rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
 
-  for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
-#if CONFIG_MISC_FIXES
-    const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
-#endif
-    xd->lossless[i] = cm->y_dc_delta_q == 0 &&
-#if CONFIG_MISC_FIXES
-                      qindex == 0 &&
-#else
-                      cm->base_qindex == 0 &&
-#endif
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+                       vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+                       cm->base_qindex;
+    xd->lossless[i] = qindex == 0 &&
+                      cm->y_dc_delta_q == 0 &&
                       cm->uv_dc_delta_q == 0 &&
                       cm->uv_ac_delta_q == 0;
   }
@@ -2984,16 +2977,6 @@
     if (output_enabled)
       sum_intra_stats(td->counts, mi, xd->above_mi, xd->left_mi,
                       frame_is_intra_only(cm));
-
-    if (bsize >= BLOCK_8X8 && output_enabled) {
-      if (mbmi->palette_mode_info.palette_size[0] > 0) {
-        mbmi->palette_mode_info.palette_first_color_idx[0] =
-            xd->plane[0].color_index_map[0];
-        // TODO(huisu): this increases the use of token buffer. Needs stretch
-        // test to verify.
-        vp10_tokenize_palette_sb(td, bsize, 0, t);
-      }
-    }
     vp10_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
   } else {
     int ref;
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 5b646a2..6bba848 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -390,9 +390,6 @@
 
   vp10_free_pc_tree(&cpi->td);
 
-  if (cpi->common.allow_screen_content_tools)
-    vpx_free(cpi->td.mb.palette_buffer);
-
   if (cpi->source_diff_var != NULL) {
     vpx_free(cpi->source_diff_var);
     cpi->source_diff_var = NULL;
@@ -1435,15 +1432,6 @@
                                              : REFRESH_FRAME_CONTEXT_BACKWARD;
   cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
 
-  cm->allow_screen_content_tools = (cpi->oxcf.content == VP9E_CONTENT_SCREEN);
-  if (cm->allow_screen_content_tools) {
-    MACROBLOCK *x = &cpi->td.mb;
-    if (x->palette_buffer == 0) {
-      CHECK_MEM_ERROR(cm, x->palette_buffer,
-                      vpx_memalign(16, sizeof(*x->palette_buffer)));
-    }
-  }
-
   vp10_reset_segment_features(cm);
   vp10_set_high_precision_mv(cpi, 0);
 
@@ -1949,8 +1937,6 @@
 
     // Deallocate allocated thread data.
     if (t < cpi->num_workers - 1) {
-      if (cpi->common.allow_screen_content_tools)
-        vpx_free(thread_data->td->mb.palette_buffer);
       vpx_free(thread_data->td->counts);
       vp10_free_pc_tree(thread_data->td);
       vpx_free(thread_data->td);
@@ -2857,7 +2843,7 @@
   recon_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
+    fprintf(f, "%10u %dx%d  %10d %10d %d %d %10d %10d %10d %10d"
        "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
        "%10"PRId64" %10"PRId64" %10d "
        "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
@@ -2866,6 +2852,8 @@
         "%10lf %8u %10"PRId64" %10d %10d %10d\n",
         cpi->common.current_video_frame,
         cm->width, cm->height,
+        cpi->td.rd_counts.m_search_count,
+        cpi->td.rd_counts.ex_search_count,
         cpi->rc.source_alt_ref_pending,
         cpi->rc.source_alt_ref_active,
         cpi->rc.this_frame_target,
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 7028803..2a44e47 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -250,6 +250,8 @@
   vp10_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
   int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int m_search_count;
+  int ex_search_count;
 } RD_COUNTS;
 
 typedef struct ThreadData {
@@ -460,12 +462,6 @@
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
   int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
-  int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
-  int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
-  int palette_y_color_cost[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
-                                                 [PALETTE_COLORS];
-  int palette_uv_color_cost[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
-                                                  [PALETTE_COLORS];
 
   int multi_arf_allowed;
   int multi_arf_enabled;
diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c
index e20d532..ad47ccf 100644
--- a/vp10/encoder/ethread.c
+++ b/vp10/encoder/ethread.c
@@ -30,6 +30,11 @@
             for (n = 0; n < ENTROPY_TOKENS; n++)
               td->rd_counts.coef_counts[i][j][k][l][m][n] +=
                   td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+
+
+  // Counts of all motion searches and exhuastive mesh searches.
+  td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
+  td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
 }
 
 static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
@@ -128,13 +133,6 @@
       memcpy(thread_data->td->counts, &cpi->common.counts,
              sizeof(cpi->common.counts));
     }
-
-    // Allocate buffers used by palette coding mode.
-    if (cpi->common.allow_screen_content_tools && i < num_workers - 1) {
-        MACROBLOCK *x = &thread_data->td->mb;
-        CHECK_MEM_ERROR(cm, x->palette_buffer,
-                        vpx_memalign(16, sizeof(*x->palette_buffer)));
-    }
   }
 
   // Encode a frame
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index d6ab00f..04e1daf 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -1523,69 +1523,83 @@
 
 #undef CHECK_BETTER
 
-int vp10_full_range_search_c(const MACROBLOCK *x,
-                            const search_site_config *cfg,
-                            MV *ref_mv, MV *best_mv,
-                            int search_param, int sad_per_bit, int *num00,
-                            const vp9_variance_fn_ptr_t *fn_ptr,
-                            const MV *center_mv) {
+// Exhuastive motion search around a given centre position with a given
+// step size.
+static int exhuastive_mesh_search(const MACROBLOCK *x,
+                                  MV *ref_mv, MV *best_mv,
+                                  int range, int step, int sad_per_bit,
+                                  const vp9_variance_fn_ptr_t *fn_ptr,
+                                  const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int range = 64;
-  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  MV fcenter_mv = {center_mv->row, center_mv->col};
   unsigned int best_sad = INT_MAX;
   int r, c, i;
   int start_col, end_col, start_row, end_row;
+  int col_step = (step > 1) ? step : 4;
 
-  // The cfg and search_param parameters are not used in this search variant
-  (void)cfg;
-  (void)search_param;
+  assert(step >= 1);
 
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  *best_mv = *ref_mv;
-  *num00 = 11;
+  clamp_mv(&fcenter_mv, x->mv_col_min, x->mv_col_max,
+           x->mv_row_min, x->mv_row_max);
+  *best_mv = fcenter_mv;
   best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, ref_mv), in_what->stride) +
-                 mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  start_row = VPXMAX(-range, x->mv_row_min - ref_mv->row);
-  start_col = VPXMAX(-range, x->mv_col_min - ref_mv->col);
-  end_row = VPXMIN(range, x->mv_row_max - ref_mv->row);
-  end_col = VPXMIN(range, x->mv_col_max - ref_mv->col);
+             get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+             mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+  start_row = VPXMAX(-range, x->mv_row_min - fcenter_mv.row);
+  start_col = VPXMAX(-range, x->mv_col_min - fcenter_mv.col);
+  end_row = VPXMIN(range, x->mv_row_max - fcenter_mv.row);
+  end_col = VPXMIN(range, x->mv_col_max - fcenter_mv.col);
 
-  for (r = start_row; r <= end_row; ++r) {
-    for (c = start_col; c <= end_col; c += 4) {
-      if (c + 3 <= end_col) {
-        unsigned int sads[4];
-        const uint8_t *addrs[4];
-        for (i = 0; i < 4; ++i) {
-          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-          addrs[i] = get_buf_from_mv(in_what, &mv);
-        }
-
-        fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
-
-        for (i = 0; i < 4; ++i) {
-          if (sads[i] < best_sad) {
-            const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-            const unsigned int sad = sads[i] +
-                mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += col_step) {
+      // Step > 1 means we are not checking every location in this pass.
+      if (step > 1) {
+        const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c};
+        unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                           get_buf_from_mv(in_what, &mv), in_what->stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            *best_mv = mv;
           }
         }
       } else {
-        for (i = 0; i < end_col - c; ++i) {
-          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-          unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-              get_buf_from_mv(in_what, &mv), in_what->stride);
-          if (sad < best_sad) {
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            addrs[i] = get_buf_from_mv(in_what, &mv);
+          }
+          fn_ptr->sdx4df(what->buf, what->stride, addrs,
+                         in_what->stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+              const unsigned int sad = sads[i] +
+                  mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
+            }
+          }
+        } else {
+          for (i = 0; i < end_col - c; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                get_buf_from_mv(in_what, &mv), in_what->stride);
             if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
+              sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
             }
           }
         }
@@ -2014,6 +2028,70 @@
   return bestsme;
 }
 
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+// Runs an limited range exhaustive mesh search using a pattern set
+// according to the encode speed profile.
+static int full_pixel_exhaustive(VP10_COMP *cpi, MACROBLOCK *x,
+                                 MV *centre_mv_full, int sadpb,  int *cost_list,
+                                 const vp9_variance_fn_ptr_t *fn_ptr,
+                                 const MV *ref_mv, MV *dst_mv) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MV temp_mv = {centre_mv_full->row, centre_mv_full->col};
+  MV f_ref_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int bestsme;
+  int i;
+  int interval = sf->mesh_patterns[0].interval;
+  int range = sf->mesh_patterns[0].range;
+  int baseline_interval_divisor;
+
+  // Keep track of number of exhaustive calls (this frame in this thread).
+  ++(*x->ex_search_count_ptr);
+
+  // Trap illegal values for interval and range for this function.
+  if ((range < MIN_RANGE) || (range > MAX_RANGE) ||
+      (interval < MIN_INTERVAL) || (interval > range))
+    return INT_MAX;
+
+  baseline_interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+  range = VPXMIN(range, MAX_RANGE);
+  interval = VPXMAX(interval, range / baseline_interval_divisor);
+
+  // initial search
+  bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range,
+                                  interval, sadpb, fn_ptr, &temp_mv);
+
+  if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv,
+                                       sf->mesh_patterns[i].range,
+                                       sf->mesh_patterns[i].interval,
+                                       sadpb, fn_ptr, &temp_mv);
+
+      if (sf->mesh_patterns[i].interval == 1)
+        break;
+    }
+  }
+
+  if (bestsme < INT_MAX)
+    bestsme = vp10_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+  *dst_mv = temp_mv;
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
+  return bestsme;
+}
+
 int vp10_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
@@ -2327,6 +2405,18 @@
   return best_sad;
 }
 
+#define MIN_EX_SEARCH_LIMIT 128
+static int is_exhaustive_allowed(VP10_COMP *cpi, MACROBLOCK *x) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT,
+      (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+  return sf->allow_exhaustive_searches &&
+      (sf->exhaustive_searches_thresh < INT_MAX) &&
+      (*x->ex_search_count_ptr <= max_ex) &&
+      !cpi->rc.is_src_frame_alt_ref;
+}
+
 int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
@@ -2345,6 +2435,9 @@
     cost_list[4] = INT_MAX;
   }
 
+  // Keep track of number of searches (this frame in this thread).
+  ++(*x->m_search_count_ptr);
+
   switch (method) {
     case FAST_DIAMOND:
       var = vp10_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
@@ -2370,6 +2463,29 @@
       var = vp10_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
                                    MAX_MVSEARCH_STEPS - 1 - step_param,
                                    1, cost_list, fn_ptr, ref_mv, tmp_mv);
+
+      // Should we allow a follow on exhaustive search?
+      if (is_exhaustive_allowed(cpi, x)) {
+        int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
+        exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] +
+                                b_height_log2_lookup[bsize]);
+
+        // Threshold variance for an exhaustive full search.
+        if (var > exhuastive_thr) {
+            int var_ex;
+          MV tmp_mv_ex;
+          var_ex = full_pixel_exhaustive(cpi, x, tmp_mv,
+                                         error_per_bit, cost_list, fn_ptr,
+                                         ref_mv, &tmp_mv_ex);
+
+          if (var_ex < var) {
+            var = var_ex;
+            *tmp_mv = tmp_mv_ex;
+          }
+        }
+      }
+      break;
+
       break;
     default:
       assert(0 && "Invalid search method.");
diff --git a/vp10/encoder/palette.c b/vp10/encoder/palette.c
deleted file mode 100644
index 522e185..0000000
--- a/vp10/encoder/palette.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include "vp10/encoder/palette.h"
-
-static double calc_dist(const double *p1, const double *p2, int dim) {
-  double dist = 0;
-  int i = 0;
-
-  for (i = 0; i < dim; ++i) {
-    dist = dist + (p1[i] - round(p2[i])) * (p1[i] - round(p2[i]));
-  }
-  return dist;
-}
-
-void vp10_calc_indices(const double *data, const double *centroids,
-                       uint8_t *indices, int n, int k, int dim) {
-  int i, j;
-  double min_dist, this_dist;
-
-  for (i = 0; i < n; ++i) {
-    min_dist = calc_dist(data + i * dim, centroids, dim);
-    indices[i] = 0;
-    for (j = 1; j < k; ++j) {
-      this_dist = calc_dist(data + i * dim, centroids + j * dim, dim);
-      if (this_dist < min_dist) {
-        min_dist = this_dist;
-        indices[i] = j;
-      }
-    }
-  }
-}
-
-// Generate a random number in the range [0, 32768).
-static unsigned int lcg_rand16(unsigned int *state) {
-  *state = *state * 1103515245 + 12345;
-  return *state / 65536 % 32768;
-}
-
-static void calc_centroids(const double *data, double *centroids,
-                           const uint8_t *indices, int n, int k, int dim) {
-  int i, j, index;
-  int count[PALETTE_MAX_SIZE];
-  unsigned int rand_state = (unsigned int)data[0];
-
-  assert(n <= 32768);
-
-  memset(count, 0, sizeof(count[0]) * k);
-  memset(centroids, 0, sizeof(centroids[0]) * k * dim);
-
-  for (i = 0; i < n; ++i) {
-    index = indices[i];
-    assert(index < k);
-    ++count[index];
-    for (j = 0; j < dim; ++j) {
-      centroids[index * dim + j] += data[i * dim + j];
-    }
-  }
-
-  for (i = 0; i < k; ++i) {
-    if (count[i] == 0) {
-      memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
-                 sizeof(centroids[0]) * dim);
-    } else {
-      const double norm = 1.0 / count[i];
-      for (j = 0; j < dim; ++j)
-        centroids[i * dim + j] *= norm;
-    }
-  }
-}
-
-static double calc_total_dist(const double *data, const double *centroids,
-                              const uint8_t *indices, int n, int k, int dim) {
-  double dist = 0;
-  int i;
-  (void) k;
-
-  for (i = 0; i < n; ++i)
-    dist += calc_dist(data + i * dim, centroids + indices[i] * dim, dim);
-
-  return dist;
-}
-
-int vp10_k_means(const double *data, double *centroids, uint8_t *indices,
-                 uint8_t *pre_indices, int n, int k, int dim, int max_itr) {
-  int i = 0;
-  double pre_dist, this_dist;
-  double pre_centroids[PALETTE_MAX_SIZE];
-
-  vp10_calc_indices(data, centroids, indices, n, k, dim);
-  pre_dist = calc_total_dist(data, centroids, indices, n, k, dim);
-  memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
-  memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
-  while (i < max_itr) {
-    calc_centroids(data, centroids, indices, n, k, dim);
-    vp10_calc_indices(data, centroids, indices, n, k, dim);
-    this_dist = calc_total_dist(data, centroids, indices, n, k, dim);
-
-    if (this_dist > pre_dist) {
-      memcpy(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim);
-      memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
-      break;
-    }
-    if (!memcmp(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim))
-      break;
-
-    memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
-    memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
-    pre_dist = this_dist;
-    ++i;
-  }
-
-  return i;
-}
-
-void vp10_insertion_sort(double *data, int n) {
-  int i, j, k;
-  double val;
-
-  if (n <= 1)
-    return;
-
-  for (i = 1; i < n; ++i) {
-    val = data[i];
-    j = 0;
-    while (val > data[j] && j < i)
-      ++j;
-
-    if (j == i)
-      continue;
-
-    for (k = i; k > j; --k)
-      data[k] = data[k - 1];
-    data[j] = val;
-  }
-}
-
-int vp10_count_colors(const uint8_t *src, int stride, int rows, int cols) {
-  int n = 0, r, c, i, val_count[256];
-  uint8_t val;
-  memset(val_count, 0, sizeof(val_count));
-
-  for (r = 0; r < rows; ++r) {
-    for (c = 0; c < cols; ++c) {
-      val = src[r * stride + c];
-      ++val_count[val];
-    }
-  }
-
-  for (i = 0; i < 256; ++i) {
-    if (val_count[i]) {
-      ++n;
-    }
-  }
-
-  return n;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-int vp10_count_colors_highbd(const uint8_t *src8, int stride, int rows,
-                             int cols, int bit_depth) {
-  int n = 0, r, c, i;
-  uint16_t val;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  int val_count[1 << 12];
-
-  assert(bit_depth <= 12);
-  memset(val_count, 0, (1 << 12) * sizeof(val_count[0]));
-  for (r = 0; r < rows; ++r) {
-    for (c = 0; c < cols; ++c) {
-      val = src[r * stride + c];
-      ++val_count[val];
-    }
-  }
-
-  for (i = 0; i < (1 << bit_depth); ++i) {
-    if (val_count[i]) {
-      ++n;
-    }
-  }
-
-  return n;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-
diff --git a/vp10/encoder/palette.h b/vp10/encoder/palette.h
deleted file mode 100644
index 124cf74..0000000
--- a/vp10/encoder/palette.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP10_ENCODER_PALETTE_H_
-#define VP10_ENCODER_PALETTE_H_
-
-#include "vp10/common/blockd.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void vp10_insertion_sort(double *data, int n);
-void vp10_calc_indices(const double *data, const double *centroids,
-                       uint8_t *indices, int n, int k, int dim);
-int vp10_k_means(const double *data, double *centroids, uint8_t *indices,
-                 uint8_t *pre_indices, int n, int k, int dim, int max_itr);
-int vp10_count_colors(const uint8_t *src, int stride, int rows, int cols);
-#if CONFIG_VP9_HIGHBITDEPTH
-int vp10_count_colors_highbd(const uint8_t *src8, int stride, int rows,
-                             int cols, int bit_depth);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif /* VP10_ENCODER_PALETTE_H_ */
diff --git a/vp10/encoder/ratectrl.c b/vp10/encoder/ratectrl.c
index d4c25c0..3ff2476 100644
--- a/vp10/encoder/ratectrl.c
+++ b/vp10/encoder/ratectrl.c
@@ -1210,8 +1210,12 @@
     rc->frames_since_golden = 0;
 
     // If we are not using alt ref in the up and coming group clear the arf
-    // active flag.
-    if (!rc->source_alt_ref_pending) {
+    // active flag. In multi arf group case, if the index is not 0 then
+    // we are overlaying a mid group arf so should not reset the flag.
+    if (cpi->oxcf.pass == 2) {
+      if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+        rc->source_alt_ref_active = 0;
+    } else if (!rc->source_alt_ref_pending) {
       rc->source_alt_ref_active = 0;
     }
 
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index 4ed1ae2..5623a72 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -83,25 +83,6 @@
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
     vp10_cost_tokens(cpi->switchable_interp_costs[i],
                     fc->switchable_interp_prob[i], vp10_switchable_interp_tree);
-
-  for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) {
-    vp10_cost_tokens(cpi->palette_y_size_cost[i],
-                     vp10_default_palette_y_size_prob[i],
-                     vp10_palette_size_tree);
-    vp10_cost_tokens(cpi->palette_uv_size_cost[i],
-                     vp10_default_palette_uv_size_prob[i],
-                     vp10_palette_size_tree);
-  }
-
-  for (i = 0; i < PALETTE_MAX_SIZE - 1; ++i)
-    for (j = 0; j < PALETTE_COLOR_CONTEXTS; ++j) {
-      vp10_cost_tokens(cpi->palette_y_color_cost[i][j],
-                       vp10_default_palette_y_color_prob[i][j],
-                       vp10_palette_color_tree[i]);
-      vp10_cost_tokens(cpi->palette_uv_color_cost[i][j],
-                       vp10_default_palette_uv_color_prob[i][j],
-                       vp10_palette_color_tree[i]);
-    }
 }
 
 static void fill_token_costs(vp10_coeff_cost *c,
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index d87d164..bba2171 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -36,7 +36,6 @@
 #include "vp10/encoder/encodemv.h"
 #include "vp10/encoder/encoder.h"
 #include "vp10/encoder/mcomp.h"
-#include "vp10/encoder/palette.h"
 #include "vp10/encoder/quantize.h"
 #include "vp10/encoder/ratectrl.h"
 #include "vp10/encoder/rd.h"
@@ -760,163 +759,6 @@
   return 0;
 }
 
-void rd_pick_palette_intra_sby(VP10_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                               int palette_ctx, int dc_mode_cost,
-                               PALETTE_MODE_INFO *palette_mode_info,
-                               uint8_t *best_palette_color_map,
-                               TX_SIZE *best_tx, PREDICTION_MODE *mode_selected,
-                               int64_t *best_rd) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mi[0];
-  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
-  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
-  int this_rate, this_rate_tokenonly, s;
-  int64_t this_distortion, this_rd;
-  int colors, n;
-  int src_stride = x->plane[0].src.stride;
-  uint8_t *src = x->plane[0].src.buf;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (cpi->common.use_highbitdepth)
-    colors = vp10_count_colors_highbd(src, src_stride, rows, cols,
-                                      cpi->common.bit_depth);
-  else
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-    colors = vp10_count_colors(src, src_stride, rows, cols);
-  palette_mode_info->palette_size[0] = 0;
-
-  if (colors > 1 && colors <= 64 && cpi->common.allow_screen_content_tools) {
-    int r, c, i, j, k;
-    int max_itr = 50;
-    int color_ctx, color_idx = 0;
-    int color_order[PALETTE_MAX_SIZE];
-    double *data = x->palette_buffer->kmeans_data_buf;
-    uint8_t *indices = x->palette_buffer->kmeans_indices_buf;
-    uint8_t *pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
-    double centroids[PALETTE_MAX_SIZE];
-    uint8_t *color_map;
-    double lb, ub, val;
-    PALETTE_MODE_INFO *pmi = &mic->mbmi.palette_mode_info;
-#if CONFIG_VP9_HIGHBITDEPTH
-    uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-    if (cpi->common.use_highbitdepth)
-      lb = ub = src16[0];
-    else
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      lb = ub = src[0];
-
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (cpi->common.use_highbitdepth) {
-      for (r = 0; r < rows; ++r) {
-        for (c = 0; c < cols; ++c) {
-          val = src16[r * src_stride + c];
-          data[r * cols + c] = val;
-          if (val < lb)
-            lb = val;
-          else if (val > ub)
-            ub = val;
-        }
-      }
-    } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      for (r = 0; r < rows; ++r) {
-        for (c = 0; c < cols; ++c) {
-          val = src[r * src_stride + c];
-          data[r * cols + c] = val;
-          if (val < lb)
-            lb = val;
-          else if (val > ub)
-            ub = val;
-        }
-      }
-#if CONFIG_VP9_HIGHBITDEPTH
-    }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-    mic->mbmi.mode = DC_PRED;
-
-    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
-        n >= 2; --n) {
-      for (i = 0; i < n; ++i)
-        centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
-      vp10_k_means(data, centroids, indices, pre_indices, rows * cols,
-                   n, 1, max_itr);
-      vp10_insertion_sort(centroids, n);
-      for (i = 0; i < n; ++i)
-        centroids[i] = round(centroids[i]);
-      // remove duplicates
-      i = 1;
-      k = n;
-      while (i < k) {
-        if (centroids[i] == centroids[i - 1]) {
-          j = i;
-          while (j < k - 1) {
-            centroids[j] = centroids[j + 1];
-            ++j;
-          }
-          --k;
-        } else {
-          ++i;
-        }
-      }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (cpi->common.use_highbitdepth)
-        for (i = 0; i < k; ++i)
-          mic->mbmi.palette_mode_info.palette_colors[i] =
-              clip_pixel_highbd(round(centroids[i]), cpi->common.bit_depth);
-      else
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        for (i = 0; i < k; ++i)
-          pmi->palette_colors[i] = clip_pixel((int)round(centroids[i]));
-      pmi->palette_size[0] = k;
-
-      vp10_calc_indices(data, centroids, indices, rows * cols, k, 1);
-      for (r = 0; r < rows; ++r)
-        for (c = 0; c < cols; ++c)
-          xd->plane[0].color_index_map[r * cols + c] = indices[r * cols + c];
-
-      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-                      &s, NULL, bsize, *best_rd);
-      if (this_rate_tokenonly == INT_MAX)
-        continue;
-
-      this_rate = this_rate_tokenonly + dc_mode_cost +
-          cpi->common.bit_depth * k * vp10_cost_bit(128, 0) +
-          cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - 2];
-      this_rate +=
-          vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
-                                                         [palette_ctx], 1);
-      color_map = xd->plane[0].color_index_map;
-      this_rate +=  write_uniform_cost(k, xd->plane[0].color_index_map[0]);
-      for (i = 0; i < rows; ++i) {
-        for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-          color_ctx = vp10_get_palette_color_context(color_map, cols, i, j,
-                                                     k, color_order);
-          for (r = 0; r < k; ++r)
-            if (color_map[i * cols + j] == color_order[r]) {
-              color_idx = r;
-              break;
-            }
-          assert(color_idx < k);
-          this_rate +=
-              cpi->palette_y_color_cost[k - 2][color_ctx][color_idx];
-        }
-      }
-      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-      if (this_rd < *best_rd) {
-        *best_rd = this_rd;
-        *palette_mode_info = mic->mbmi.palette_mode_info;
-        memcpy(best_palette_color_map, xd->plane[0].color_index_map,
-               rows * cols * sizeof(xd->plane[0].color_index_map[0]));
-        *mode_selected = DC_PRED;
-        *best_tx = mic->mbmi.tx_size;
-      }
-    }
-  }
-}
-
 static int64_t rd_pick_intra4x4block(VP10_COMP *cpi, MACROBLOCK *x,
                                      int row, int col,
                                      PREDICTION_MODE *best_mode,
@@ -947,7 +789,6 @@
   memcpy(ta, a, sizeof(ta));
   memcpy(tl, l, sizeof(tl));
   xd->mi[0]->mbmi.tx_size = TX_4X4;
-  xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -1225,12 +1066,6 @@
   int64_t this_distortion, this_rd;
   TX_SIZE best_tx = TX_4X4;
   int *bmode_costs;
-  PALETTE_MODE_INFO palette_mode_info;
-  uint8_t *best_palette_color_map = cpi->common.allow_screen_content_tools ?
-      x->palette_buffer->best_palette_color_map : NULL;
-  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
-  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
-  int palette_ctx = 0;
   const MODE_INFO *above_mi = xd->above_mi;
   const MODE_INFO *left_mi = xd->left_mi;
   const PREDICTION_MODE A = vp10_above_block_mode(mic, above_mi, 0);
@@ -1238,12 +1073,6 @@
   bmode_costs = cpi->y_mode_costs[A][L];
 
   memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
-  palette_mode_info.palette_size[0] = 0;
-  mic->mbmi.palette_mode_info.palette_size[0] = 0;
-  if (above_mi)
-    palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  if (left_mi)
-    palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
 
   /* Y Search for intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
@@ -1256,10 +1085,6 @@
       continue;
 
     this_rate = this_rate_tokenonly + bmode_costs[mode];
-    if (cpi->common.allow_screen_content_tools && mode == DC_PRED)
-      this_rate +=
-          vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
-                                                         [palette_ctx], 0);
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
@@ -1273,22 +1098,8 @@
     }
   }
 
-  if (cpi->common.allow_screen_content_tools)
-    rd_pick_palette_intra_sby(cpi, x, bsize, palette_ctx, bmode_costs[DC_PRED],
-                              &palette_mode_info, best_palette_color_map,
-                              &best_tx, &mode_selected, &best_rd);
-
   mic->mbmi.mode = mode_selected;
   mic->mbmi.tx_size = best_tx;
-  mic->mbmi.palette_mode_info.palette_size[0] =
-      palette_mode_info.palette_size[0];
-  if (palette_mode_info.palette_size[0] > 0) {
-    memcpy(mic->mbmi.palette_mode_info.palette_colors,
-           palette_mode_info.palette_colors,
-           PALETTE_MAX_SIZE * sizeof(palette_mode_info.palette_colors[0]));
-    memcpy(xd->plane[0].color_index_map, best_palette_color_map,
-           rows * cols * sizeof(best_palette_color_map[0]));
-  }
 
   return best_rd;
 }
@@ -1359,7 +1170,6 @@
   int64_t this_distortion, this_sse;
 
   memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
-  xd->mi[0]->mbmi.palette_mode_info.palette_size[1] = 0;
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
@@ -1973,7 +1783,7 @@
             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
           MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
           int step_param = 0;
-          int thissme, bestsme = INT_MAX;
+          int bestsme = INT_MAX;
           int sadpb = x->sadperbit4;
           MV mvp_full;
           int max_mv;
@@ -2028,27 +1838,6 @@
               &bsi->ref_mv[0]->as_mv, new_mv,
               INT_MAX, 1);
 
-          // Should we do a full search (best quality only)
-          if (cpi->oxcf.mode == BEST) {
-            int_mv *const best_mv = &mi->bmi[i].as_mv[0];
-            /* Check if mvp_full is within the range. */
-            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                     x->mv_row_min, x->mv_row_max);
-            thissme = cpi->full_search_sad(x, &mvp_full,
-                                           sadpb, 16, &cpi->fn_ptr[bsize],
-                                           &bsi->ref_mv[0]->as_mv,
-                                           &best_mv->as_mv);
-            cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
-            if (thissme < bestsme) {
-              bestsme = thissme;
-              *new_mv = best_mv->as_mv;
-            } else {
-              // The full search result is actually worse so re-instate the
-              // previous best vector
-              best_mv->as_mv = *new_mv;
-            }
-          }
-
           if (bestsme < INT_MAX) {
             int distortion;
             cpi->find_fractional_mv_step(
@@ -3292,8 +3081,6 @@
     midx = end_pos;
   }
 
-  mbmi->palette_mode_info.palette_size[0] = 0;
-  mbmi->palette_mode_info.palette_size[1] = 0;
   for (midx = 0; midx < MAX_MODES; ++midx) {
     int mode_index = mode_map[midx];
     int mode_excluded = 0;
@@ -3773,8 +3560,6 @@
 
   assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
 
-  mbmi->palette_mode_info.palette_size[0] = 0;
-  mbmi->palette_mode_info.palette_size[1] = 0;
   mbmi->mode = ZEROMV;
   mbmi->uv_mode = DC_PRED;
   mbmi->ref_frame[0] = LAST_FRAME;
@@ -3922,9 +3707,6 @@
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  mbmi->palette_mode_info.palette_size[0] = 0;
-  mbmi->palette_mode_info.palette_size[1] = 0;
-
   for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index d40383f..ce0aebe 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -16,6 +16,23 @@
 
 #include "vpx_dsp/vpx_dsp_common.h"
 
+// Mesh search patters for various speed settings
+static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] =
+    {{64, 4}, {28, 2}, {15, 1}, {7, 1}};
+
+#define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
+static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1]
+                                              [MAX_MESH_STEP] =
+    {{{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8},  {14, 2}, {7, 1},  {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+    };
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] =
+    {50, 25, 15, 5, 1, 1};
+
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
 static int frame_is_boosted(const VP10_COMP *cpi) {
@@ -251,6 +268,8 @@
   sf->static_segmentation = 0;
   sf->adaptive_rd_thresh = 1;
   sf->use_fast_coef_costing = 1;
+  sf->allow_exhaustive_searches = 0;
+  sf->exhaustive_searches_thresh = INT_MAX;
 
   if (speed >= 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
@@ -498,8 +517,36 @@
     set_good_speed_feature(cpi, cm, sf, oxcf->speed);
 
   cpi->full_search_sad = vp10_full_search_sad;
-  cpi->diamond_search_sad = oxcf->mode == BEST ? vp10_full_range_search
-                                               : vp10_diamond_search_sad;
+  cpi->diamond_search_sad = vp10_diamond_search_sad;
+
+  sf->allow_exhaustive_searches = 1;
+  if (oxcf->mode == BEST) {
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 20);
+    else
+      sf->exhaustive_searches_thresh = (1 << 21);
+    sf->max_exaustive_pct = 100;
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
+      sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
+    }
+  } else {
+    int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 22);
+    else
+      sf->exhaustive_searches_thresh = (1 << 23);
+    sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+    if (speed > 0)
+      sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range =
+          good_quality_mesh_patterns[speed][i].range;
+      sf->mesh_patterns[i].interval =
+          good_quality_mesh_patterns[speed][i].interval;
+    }
+  }
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index 3969a2f..3b91999 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -195,6 +195,13 @@
   int fullpel_search_step_param;
 } MV_SPEED_FEATURES;
 
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+  int range;
+  int interval;
+} MESH_PATTERN;
+
 typedef struct SPEED_FEATURES {
   MV_SPEED_FEATURES mv;
 
@@ -290,6 +297,18 @@
   // point for this motion search and limits the search range around it.
   int adaptive_motion_search;
 
+  // Flag for allowing some use of exhaustive searches;
+  int allow_exhaustive_searches;
+
+  // Threshold for allowing exhaistive motion search.
+  int exhaustive_searches_thresh;
+
+  // Maximum number of exhaustive searches for a frame.
+  int max_exaustive_pct;
+
+  // Pattern to be used for any exhaustive mesh searches.
+  MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
   int schedule_mode_search;
 
   // Allows sub 8x8 modes to use the prediction filter that was determined
diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index e568c0b..1b94190 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c
@@ -487,39 +487,6 @@
   return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
-void vp10_tokenize_palette_sb(struct ThreadData *const td,
-                              BLOCK_SIZE bsize, int plane,
-                              TOKENEXTRA **t) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  uint8_t *color_map = xd->plane[0].color_index_map;
-  PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-  int n = pmi->palette_size[plane != 0];
-  int i, j, k;
-  int color_new_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
-  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
-  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
-
-  for (i = 0; i < rows; ++i) {
-    for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-      color_ctx = vp10_get_palette_color_context(color_map, cols, i, j, n,
-                                                 color_order);
-      for (k = 0; k < n; ++k)
-        if (color_map[i * cols + j] == color_order[k]) {
-          color_new_idx = k;
-          break;
-        }
-      assert(color_new_idx >= 0 && color_new_idx < n);
-
-      (*t)->token = color_new_idx;
-      (*t)->context_tree = vp10_default_palette_y_color_prob[n - 2][color_ctx];
-      (*t)->skip_eob_node = 0;
-      ++(*t);
-    }
-  }
-}
-
 static void tokenize_b(int plane, int block, int blk_row, int blk_col,
                        BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
diff --git a/vp10/encoder/tokenize.h b/vp10/encoder/tokenize.h
index 1614add..5bad415 100644
--- a/vp10/encoder/tokenize.h
+++ b/vp10/encoder/tokenize.h
@@ -51,9 +51,6 @@
 struct VP10_COMP;
 struct ThreadData;
 
-void vp10_tokenize_palette_sb(struct ThreadData *const td,
-                              BLOCK_SIZE bsize, int plane,
-                              TOKENEXTRA **t);
 void vp10_tokenize_sb(struct VP10_COMP *cpi, struct ThreadData *td,
                      TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
 
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index 7b77f31..ead993a 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -52,8 +52,6 @@
 VP10_CX_SRCS-yes += encoder/treewriter.h
 VP10_CX_SRCS-yes += encoder/mcomp.c
 VP10_CX_SRCS-yes += encoder/encoder.c
-VP10_CX_SRCS-yes += encoder/palette.h
-VP10_CX_SRCS-yes += encoder/palette.c
 VP10_CX_SRCS-yes += encoder/picklpf.c
 VP10_CX_SRCS-yes += encoder/picklpf.h
 VP10_CX_SRCS-yes += encoder/quantize.c
diff --git a/vp8/common/vp8_loopfilter.c b/vp8/common/vp8_loopfilter.c
index 8b55dff..756ad48 100644
--- a/vp8/common/vp8_loopfilter.c
+++ b/vp8/common/vp8_loopfilter.c
@@ -141,8 +141,8 @@
             else  /* Delta Value */
             {
                 lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
-                lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
             }
+            lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
         }
 
         if (!mbd->mode_ref_lf_delta_enabled)
diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c
index f0d7603..4bc87eb 100644
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@@ -73,10 +73,9 @@
 
         /* Delta Value */
         else
-        {
             QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
-            QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */
-        }
+
+        QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */
     }
     else
         QIndex = pc->base_qindex;
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index d12cd76..1b42014 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -174,6 +174,9 @@
   else if (eob <= 34)
     // non-zero coeff only in upper-left 8x8
     vpx_idct32x32_34_add(input, dest, stride);
+  else if (eob <= 135)
+    // non-zero coeff only in upper-left 16x16
+    vpx_idct32x32_135_add(input, dest, stride);
   else
     vpx_idct32x32_1024_add(input, dest, stride);
 }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 8fe6503..d166bbf 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -198,7 +198,7 @@
 specialize qw/vp9_avg_8x8 sse2 neon msa/;
 
 add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_avg_4x4 sse2 msa/;
+specialize qw/vp9_avg_4x4 sse2 neon msa/;
 
 add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
 specialize qw/vp9_minmax_8x8 sse2/;
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index 5996bd4..78467ce 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -24,6 +24,18 @@
   return vget_lane_u32(c, 0);
 }
 
+unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
+  uint16x8_t v_sum;
+  uint32x2_t v_s0 = vdup_n_u32(0);
+  uint32x2_t v_s1 = vdup_n_u32(0);
+  v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
+  v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
+  v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
+  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
+}
+
 unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
   uint8x8_t v_s0 = vld1_u8(s);
   const uint8x8_t v_s1 = vld1_u8(s + p);
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index fc76c11..93aa40a 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -194,7 +194,8 @@
                                                          int mi_col,
                                                          PICK_MODE_CONTEXT *ctx,
                                                          int motion_magnitude,
-                                                         int is_skin) {
+                                                         int is_skin,
+                                                         int *zeromv_filter) {
   int mv_col, mv_row;
   int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
   MV_REFERENCE_FRAME frame;
@@ -237,6 +238,7 @@
     mbmi->mv[0].as_int = 0;
     ctx->best_sse_inter_mode = ZEROMV;
     ctx->best_sse_mv.as_int = 0;
+    *zeromv_filter = 1;
   }
 
   if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
@@ -320,6 +322,7 @@
                           VP9_DENOISER_DECISION *denoiser_decision) {
   int mv_col, mv_row;
   int motion_magnitude = 0;
+  int zeromv_filter = 0;
   VP9_DENOISER_DECISION decision = COPY_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
   YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -360,7 +363,8 @@
                                            denoiser->increase_denoising,
                                            mi_row, mi_col, ctx,
                                            motion_magnitude,
-                                           is_skin);
+                                           is_skin,
+                                           &zeromv_filter);
 
   if (decision == FILTER_BLOCK) {
     decision = vp9_denoiser_filter(src.buf, src.stride,
@@ -382,6 +386,8 @@
                       num_4x4_blocks_high_lookup[bs] << 2);
   }
   *denoiser_decision = decision;
+  if (decision == FILTER_BLOCK && zeromv_filter == 1)
+    *denoiser_decision = FILTER_ZEROMV_BLOCK;
 }
 
 static void copy_frame(YV12_BUFFER_CONFIG * const dest,
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index c8c9352..d07056b 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -23,7 +23,8 @@
 
 typedef enum vp9_denoiser_decision {
   COPY_BLOCK,
-  FILTER_BLOCK
+  FILTER_BLOCK,
+  FILTER_ZEROMV_BLOCK
 } VP9_DENOISER_DECISION;
 
 typedef enum vp9_denoiser_level {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 7e56989..c643b18 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -401,7 +401,6 @@
   variance_node vt;
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
   const int block_height = num_8x8_blocks_high_lookup[bsize];
-  const int low_res = (cm->width <= 352 && cm->height <= 288);
 
   assert(block_height == block_width);
   tree_to_node(data, bsize, &vt);
@@ -414,7 +413,7 @@
   // No check for vert/horiz split as too few samples for variance.
   if (bsize == bsize_min) {
     // Variance already computed to set the force_split.
-    if (low_res || cm->frame_type == KEY_FRAME)
+    if (cm->frame_type == KEY_FRAME)
       get_variance(&vt.part_variances->none);
     if (mi_col + block_width / 2 < cm->mi_cols &&
         mi_row + block_height / 2 < cm->mi_rows &&
@@ -425,7 +424,7 @@
     return 0;
   } else if (bsize > bsize_min) {
     // Variance already computed to set the force_split.
-    if (low_res || cm->frame_type == KEY_FRAME)
+    if (cm->frame_type == KEY_FRAME)
       get_variance(&vt.part_variances->none);
     // For key frame: take split for bsize above 32X32 or very high variance.
     if (cm->frame_type == KEY_FRAME &&
@@ -489,13 +488,14 @@
     thresholds[2] = threshold_base >> 2;
     thresholds[3] = threshold_base << 2;
   } else {
-    // Increase base variance threshold if estimated noise level is high.
+    // Increase base variance threshold based on  estimated noise level.
     if (cpi->noise_estimate.enabled) {
-      if (cpi->noise_estimate.level == kHigh)
+      NOISE_LEVEL noise_level = vp9_noise_estimate_extract_level(
+          &cpi->noise_estimate);
+      if (noise_level == kHigh)
         threshold_base = 3 * threshold_base;
-      else
-        if (cpi->noise_estimate.level == kMedium)
-          threshold_base = threshold_base << 1;
+      else if (noise_level == kMedium)
+        threshold_base = threshold_base << 1;
     }
     if (cm->width <= 352 && cm->height <= 288) {
       thresholds[0] = threshold_base >> 3;
@@ -843,7 +843,8 @@
           force_split[split_index] = 1;
           force_split[i + 1] = 1;
           force_split[0] = 1;
-        } else if (vt.split[i].split[j].part_variances.none.variance >
+        } else if (cpi->oxcf.speed < 8 &&
+                   vt.split[i].split[j].part_variances.none.variance >
                    thresholds[1] &&
                    !cyclic_refresh_segment_id_boosted(segment_id)) {
           // We have some nominal amount of 16x16 variance (based on average),
@@ -896,6 +897,14 @@
         for (m = 0; m < 4; m++)
           fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
         fill_variance_tree(vtemp, BLOCK_16X16);
+        // If variance of this 16x16 block is above the threshold, force block
+        // to split. This also forces a split on the upper levels.
+        get_variance(&vtemp->part_variances.none);
+        if (vtemp->part_variances.none.variance > thresholds[2]) {
+          force_split[5 + i2 + j] = 1;
+          force_split[i + 1] = 1;
+          force_split[0] = 1;
+        }
       }
     }
     fill_variance_tree(&vt.split[i], BLOCK_32X32);
@@ -2422,8 +2431,15 @@
 
   if (cpi->sf.use_square_partition_only &&
       bsize > cpi->sf.use_square_only_threshold) {
+    if (cpi->use_svc) {
+      if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless)
+        partition_horz_allowed &= force_horz_split;
+      if (!vp9_active_v_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless)
+        partition_vert_allowed &= force_vert_split;
+    } else {
       partition_horz_allowed &= force_horz_split;
       partition_vert_allowed &= force_vert_split;
+    }
   }
 
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index a57cf87..72fa828 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3682,12 +3682,16 @@
   if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
       cm->mi_rows * MI_SIZE != unscaled->y_height) {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (use_normative_scaler)
+    if (use_normative_scaler &&
+        unscaled->y_width <= (scaled->y_width << 1) &&
+        unscaled->y_height <= (scaled->y_height << 1))
       scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth);
     else
       scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
 #else
-    if (use_normative_scaler)
+    if (use_normative_scaler &&
+        unscaled->y_width <= (scaled->y_width << 1) &&
+        unscaled->y_height <= (scaled->y_height << 1))
       scale_and_extend_frame(unscaled, scaled);
     else
       scale_and_extend_frame_nonnormative(unscaled, scaled);
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index b41ffd0..4befbb0 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -25,7 +25,7 @@
                              int width,
                              int height) {
   ne->enabled = 0;
-  ne->level = kLow;
+  ne->level = kLowLow;
   ne->value = 0;
   ne->count = 0;
   ne->thresh = 90;
@@ -82,6 +82,21 @@
   }
 }
 
+NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
+  int noise_level = kLowLow;
+  if (ne->value > (ne->thresh << 1)) {
+    noise_level = kHigh;
+  } else {
+    if (ne->value > ne->thresh)
+      noise_level = kMedium;
+    else if (ne->value > (ne->thresh >> 1))
+      noise_level = kLow;
+    else
+      noise_level = kLowLow;
+  }
+  return noise_level;
+}
+
 void vp9_update_noise_estimate(VP9_COMP *const cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
@@ -220,22 +235,16 @@
         // Reset counter and check noise level condition.
         ne->num_frames_estimate = 30;
         ne->count = 0;
-        if (ne->value > (ne->thresh << 1))
-          ne->level = kHigh;
-        else
-          if (ne->value > ne->thresh)
-            ne->level = kMedium;
-          else if (ne->value > (ne->thresh >> 1))
-            ne->level = kLow;
-          else
-            ne->level = kLowLow;
+        ne->level = vp9_noise_estimate_extract_level(ne);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity > 0)
+          vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
+#endif
       }
     }
   }
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0) {
+  if (cpi->oxcf.noise_sensitivity > 0)
     copy_frame(&cpi->denoiser.last_source, cpi->Source);
-    vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
-  }
 #endif
 }
diff --git a/vp9/encoder/vp9_noise_estimate.h b/vp9/encoder/vp9_noise_estimate.h
index 0d22ef0..826d125 100644
--- a/vp9/encoder/vp9_noise_estimate.h
+++ b/vp9/encoder/vp9_noise_estimate.h
@@ -47,6 +47,8 @@
                              int width,
                              int height);
 
+NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne);
+
 void vp9_update_noise_estimate(struct VP9_COMP *const cpi);
 
 #ifdef __cplusplus
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 02e9674..90650db 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1696,15 +1696,14 @@
     VP9_DENOISER_DECISION decision = COPY_BLOCK;
     vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
                          VPXMAX(BLOCK_8X8, bsize), ctx, &decision);
-    // If INTRA mode was selected, re-evaluate ZEROMV on denoised result.
-    // Only do this under noise conditions, and if rdcost of ZEROMV on
-    // original source is not significantly higher than rdcost of INTRA MODE.
-    if (best_ref_frame == INTRA_FRAME &&
-        decision == FILTER_BLOCK &&
+    // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on denoised
+    // result. Only do this under noise conditions, and if rdcost of ZEROMV on
+    // original source is not significantly higher than rdcost of best mode.
+    if (((best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
+        (best_ref_frame == GOLDEN_FRAME && decision == FILTER_ZEROMV_BLOCK)) &&
         cpi->noise_estimate.enabled &&
         cpi->noise_estimate.level > kLow &&
-        zero_last_cost_orig < (best_rdc.rdcost << 2) &&
-        !reuse_inter_pred) {
+        zero_last_cost_orig < (best_rdc.rdcost << 3)) {
       // Check if we should pick ZEROMV on denoised signal.
       int rate = 0;
       int64_t dist = 0;
@@ -1713,6 +1712,7 @@
       mbmi->ref_frame[1] = NONE;
       mbmi->mv[0].as_int = 0;
       mbmi->interp_filter = EIGHTTAP;
+      xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
       model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
       this_rdc.rate = rate + ref_frame_cost[LAST_FRAME] +
@@ -1721,13 +1721,21 @@
       this_rdc.dist = dist;
       this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
       // Switch to ZEROMV if the rdcost for ZEROMV on denoised source
-      // is lower than INTRA (on original source).
+      // is lower than best_ref mode (on original source).
       if (this_rdc.rdcost > best_rdc.rdcost) {
         this_rdc = best_rdc;
         mbmi->mode = best_mode;
         mbmi->ref_frame[0] = best_ref_frame;
-        mbmi->mv[0].as_int = INVALID_MV;
         mbmi->interp_filter = best_pred_filter;
+        if (best_ref_frame == INTRA_FRAME)
+          mbmi->mv[0].as_int = INVALID_MV;
+        else if (best_ref_frame == GOLDEN_FRAME) {
+          mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
+          if (reuse_inter_pred) {
+            xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0];
+            vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+          }
+        }
         mbmi->tx_size = best_tx_size;
         x->skip_txfm[0] = best_mode_skip_txfm;
       } else {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index fdff363..8ab51cd 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1075,7 +1075,7 @@
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cq_level;
       } else {
-       active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
 
         // Modify best quality for second level arfs. For mode VPX_Q this
         // becomes the baseline frame q.
@@ -1257,8 +1257,12 @@
     rc->frames_since_golden = 0;
 
     // If we are not using alt ref in the up and coming group clear the arf
-    // active flag.
-    if (!rc->source_alt_ref_pending) {
+    // active flag. In multi arf group case, if the index is not 0 then
+    // we are overlaying a mid group arf so should not reset the flag.
+    if (cpi->oxcf.pass == 2) {
+      if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+        rc->source_alt_ref_active = 0;
+    } else if (!rc->source_alt_ref_pending) {
       rc->source_alt_ref_active = 0;
     }
 
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 20516a0..318d810 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -380,16 +380,6 @@
     sf->adaptive_rd_thresh = 2;
     // This feature is only enabled when partition search is disabled.
     sf->reuse_inter_pred_sby = 1;
-    // TODO(marpan): When denoising, we may re-evaluate the mode selection and
-    // this seems to cause problems when reuse_inter_pred_sby is enabled.
-    // Disabling reuse_inter_pred_sby for now (under denoising conditions), and
-    // will look into re-enabling it.
-#if CONFIG_VP9_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity > 0 &&
-        cpi->noise_estimate.enabled &&
-        cpi->noise_estimate.level > kLow)
-      sf->reuse_inter_pred_sby = 0;
-#endif
     sf->partition_search_breakout_rate_thr = 200;
     sf->coeff_prob_appx_step = 4;
     sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED;
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 13da155..b0617c1 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -279,7 +279,7 @@
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
   if (cpi->svc.number_temporal_layers > 1 ||
-      cpi->svc.number_spatial_layers > 1) {
+      (cpi->svc.number_spatial_layers > 1 && !is_two_pass_svc(cpi))) {
     cpi->rc.frames_since_key = old_frame_since_key;
     cpi->rc.frames_to_key = old_frame_to_key;
   }
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 16f9c85..015dbc0 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -135,15 +135,38 @@
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier  *= modifier;
-      modifier  *= 3;
+      // non-local mean approach
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
+      modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
+
       modifier  += rounding;
       modifier >>= strength;
 
@@ -182,15 +205,34 @@
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier *= modifier;
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
       modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
       modifier += rounding;
       modifier >>= strength;
 
@@ -383,55 +425,58 @@
           if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
             int adj_strength = strength + 2 * (mbd->bd - 8);
             // Apply the filter (YUV)
-            vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
-                                             f->y_stride,
-                                             predictor, 16, 16, adj_strength,
-                                             filter_weight,
-                                             accumulator, count);
-            vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 256,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength,
-                                             filter_weight, accumulator + 256,
-                                             count + 256);
-            vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 512,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength, filter_weight,
-                                             accumulator + 512, count + 512);
+            vp9_highbd_temporal_filter_apply_c(f->y_buffer + mb_y_offset,
+                                               f->y_stride,
+                                               predictor, 16, 16, adj_strength,
+                                               filter_weight,
+                                               accumulator, count);
+            vp9_highbd_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 256,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength,
+                                               filter_weight, accumulator + 256,
+                                               count + 256);
+            vp9_highbd_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 512,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength, filter_weight,
+                                               accumulator + 512, count + 512);
           } else {
             // Apply the filter (YUV)
-            vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+            vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+                                        predictor, 16, 16,
+                                        strength, filter_weight,
+                                        accumulator, count);
+            vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 256,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 256,
+                                        count + 256);
+            vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 512,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 512,
+                                        count + 512);
+          }
+#else
+          // Apply the filter (YUV)
+          // TODO(jingning): Need SIMD optimization for this.
+          vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
                                       predictor, 16, 16,
                                       strength, filter_weight,
                                       accumulator, count);
-            vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+          vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
                                       predictor + 256,
                                       mb_uv_width, mb_uv_height, strength,
                                       filter_weight, accumulator + 256,
                                       count + 256);
-            vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+          vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
                                       predictor + 512,
                                       mb_uv_width, mb_uv_height, strength,
                                       filter_weight, accumulator + 512,
                                       count + 512);
-          }
-#else
-          // Apply the filter (YUV)
-          vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
-                                    predictor, 16, 16,
-                                    strength, filter_weight,
-                                    accumulator, count);
-          vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 256,
-                                    mb_uv_width, mb_uv_height, strength,
-                                    filter_weight, accumulator + 256,
-                                    count + 256);
-          vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 512,
-                                    mb_uv_width, mb_uv_height, strength,
-                                    filter_weight, accumulator + 512,
-                                    count + 512);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         }
       }
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 68d1d8d..5c3fe93 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -423,13 +423,15 @@
     svc_ctx->temporal_layers = 2;
   }
 
+  for (sl = 0; sl < VPX_SS_MAX_LAYERS; ++sl) {
+    si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM[sl];
+    si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl];
+  }
   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
       i = sl * svc_ctx->temporal_layers + tl;
       si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
       si->svc_params.min_quantizers[i] = 0;
-      si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM[sl];
-      si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl];
     }
   }
 
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index 5f3cfdd..a0f59bf 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -1194,6 +1194,33 @@
   }
 }
 
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  tran_low_t out[32 * 32] = {0};
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+
+  // Rows
+  // only upper-left 16x16 has non-zero coeff
+  for (i = 0; i < 16; ++i) {
+    idct32_c(input, outptr);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_c(temp_in, temp_out);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
   tran_low_t out[32 * 32] = {0};
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 4de85a4..4d36e27 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -76,7 +76,7 @@
 specialize qw/vpx_d63f_predictor_4x4/;
 
 add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc";
+specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_he_predictor_4x4/;
@@ -91,7 +91,7 @@
 specialize qw/vpx_d153_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse_x86inc";
+specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_ve_predictor_4x4/;
@@ -100,16 +100,16 @@
 specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -130,7 +130,7 @@
 specialize qw/vpx_d63e_predictor_8x8/;
 
 add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_8x8 neon dspr2 msa/, "$ssse3_x86inc";
+specialize qw/vpx_h_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d117_predictor_8x8/;
@@ -142,22 +142,22 @@
 specialize qw/vpx_d153_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_8x8 neon msa/, "$sse_x86inc";
+specialize qw/vpx_v_predictor_8x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa/, "$sse_x86inc";
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_8x8 neon msa/, "$sse_x86inc";
+specialize qw/vpx_dc_top_predictor_8x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_8x8 neon msa/, "$sse_x86inc";
+specialize qw/vpx_dc_left_predictor_8x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_8x8 neon msa/, "$sse_x86inc";
+specialize qw/vpx_dc_128_predictor_8x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_16x16/, "$ssse3_x86inc";
@@ -178,7 +178,7 @@
 specialize qw/vpx_d63e_predictor_16x16/;
 
 add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_16x16 neon dspr2 msa/, "$ssse3_x86inc";
+specialize qw/vpx_h_predictor_16x16 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d117_predictor_16x16/;
@@ -226,7 +226,7 @@
 specialize qw/vpx_d63e_predictor_32x32/;
 
 add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_32x32 neon msa/, "$ssse3_x86inc";
+specialize qw/vpx_h_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d117_predictor_32x32/;
@@ -754,6 +754,9 @@
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1024_add/;
 
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add/;
+
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_34_add/;
 
@@ -802,6 +805,10 @@
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1024_add sse2/;
 
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add sse2/;
+    $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
+
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_34_add sse2/;
 
@@ -853,6 +860,9 @@
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1024_add/;
 
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add/;
+
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_34_add/;
 
@@ -890,7 +900,15 @@
     specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
 
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/;
+    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    # Need to add 135 eob idct32x32 implementations.
+    $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
+    $vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon;
+    $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
+    $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
 
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc";
diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 93df92a..22d52a2 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -123,8 +123,10 @@
       %define sec_str sec_stridemp
 
       ; Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
-      add esp, 4                ; restore esp
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -140,8 +142,10 @@
       %define block_height heightd
 
       ; Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
-      add esp, 4                ; restore esp
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index a8c1bc6..6f924a7 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -23,17 +23,18 @@
 
 SECTION .text
 
-INIT_MMX sse
-cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
-  pxor                  m1, m1
+  movd                  m2, [leftq]
   movd                  m0, [aboveq]
-  punpckldq             m0, [leftq]
+  pxor                  m1, m1
+  punpckldq             m0, m2
   psadbw                m0, m1
   paddw                 m0, [GLOBAL(pw_4)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   packuswb              m0, m0
   movd      [dstq        ], m0
   movd      [dstq+strideq], m0
@@ -44,8 +45,9 @@
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -53,7 +55,7 @@
   psadbw                m0, m1
   paddw                 m0, [GLOBAL(pw2_4)]
   psraw                 m0, 2
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   packuswb              m0, m0
   movd      [dstq        ], m0
   movd      [dstq+strideq], m0
@@ -64,8 +66,8 @@
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -73,7 +75,7 @@
   psadbw                m0, m1
   paddw                 m0, [GLOBAL(pw2_4)]
   psraw                 m0, 2
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   packuswb              m0, m0
   movd      [dstq        ], m0
   movd      [dstq+strideq], m0
@@ -84,7 +86,7 @@
   RESTORE_GOT
   RET
 
-INIT_MMX sse
+INIT_XMM sse2
 cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
@@ -98,8 +100,8 @@
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw_8)]
   psraw                 m0, 4
-  pshufw                m0, m0, 0x0
-  packuswb              m0, m0
+  punpcklbw             m0, m0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq  ], m0
   movq    [dstq+strideq*2], m0
@@ -113,8 +115,8 @@
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -124,8 +126,8 @@
   psadbw                m0, m1
   paddw                 m0, [GLOBAL(pw2_8)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
-  packuswb              m0, m0
+  punpcklbw             m0, m0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq  ], m0
   movq    [dstq+strideq*2], m0
@@ -139,8 +141,9 @@
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -150,8 +153,8 @@
   psadbw                m0, m1
   paddw                 m0, [GLOBAL(pw2_8)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
-  packuswb              m0, m0
+  punpcklbw             m0, m0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq  ], m0
   movq    [dstq+strideq*2], m0
@@ -165,8 +168,8 @@
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   DEFINE_ARGS dst, stride, stride3
@@ -179,8 +182,8 @@
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   DEFINE_ARGS dst, stride, stride3
@@ -236,14 +239,11 @@
   GET_GOT     goffsetq
 
   pxor                  m1, m1
-  pxor                  m2, m2
   mova                  m0, [aboveq]
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 4
   psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
   movhlps               m2, m0
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw2_16)]
@@ -268,14 +268,11 @@
   GET_GOT     goffsetq
 
   pxor                  m1, m1
-  pxor                  m2, m2
   mova                  m0, [leftq]
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 4
   psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
   movhlps               m2, m0
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw2_16)]
@@ -452,7 +449,7 @@
   RESTORE_GOT
   RET
 
-INIT_MMX sse
+INIT_XMM sse2
 cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
   movd                  m0, [aboveq]
   movd      [dstq        ], m0
@@ -462,7 +459,7 @@
   movd      [dstq+strideq], m0
   RET
 
-INIT_MMX sse
+INIT_XMM sse2
 cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
   movq                  m0, [aboveq]
   DEFINE_ARGS dst, stride, stride3
@@ -516,6 +513,97 @@
   REP_RET
 
 INIT_XMM sse2
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
+  movifnidn          leftq, leftmp
+  movd                  m0, [leftq]
+  punpcklbw             m0, m0
+  punpcklbw             m0, m0
+  pshufd                m1, m0, 0x1
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m1
+  pshufd                m2, m0, 0x2
+  lea                 dstq, [dstq+strideq*2]
+  pshufd                m3, m0, 0x3
+  movd      [dstq        ], m2
+  movd      [dstq+strideq], m3
+  RET
+
+INIT_XMM sse2
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
+  movifnidn          leftq, leftmp
+  mov                lineq, -2
+  DEFINE_ARGS  dst, stride, line, left, stride3
+  lea             stride3q, [strideq*3]
+  movq                  m0, [leftq    ]
+  punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8
+.loop:
+  pshuflw               m1, m0, 0x0         ; l1 l1 l1 l1 l1 l1 l1 l1
+  pshuflw               m2, m0, 0x55        ; l2 l2 l2 l2 l2 l2 l2 l2
+  movq      [dstq        ], m1
+  movq      [dstq+strideq], m2
+  pshuflw               m1, m0, 0xaa
+  pshuflw               m2, m0, 0xff
+  movq    [dstq+strideq*2], m1
+  movq    [dstq+stride3q ], m2
+  pshufd                m0, m0, 0xe         ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+  inc                lineq
+  lea                 dstq, [dstq+strideq*4]
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
+  movifnidn          leftq, leftmp
+  mov                lineq, -4
+  DEFINE_ARGS dst, stride, line, left, stride3
+  lea             stride3q, [strideq*3]
+.loop:
+  movd                  m0, [leftq]
+  punpcklbw             m0, m0
+  punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
+  pshufd            m1, m0, 0x0             ; l1 repeated 16 times
+  pshufd            m2, m0, 0x55            ; l2 repeated 16 times
+  mova    [dstq          ], m1
+  mova    [dstq+strideq  ], m2
+  pshufd            m1, m0, 0xaa
+  pshufd            m2, m0, 0xff
+  mova    [dstq+strideq*2], m1
+  mova    [dstq+stride3q ], m2
+  inc                lineq
+  lea                leftq, [leftq+4       ]
+  lea                 dstq, [dstq+strideq*4]
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
+  movifnidn              leftq, leftmp
+  mov                    lineq, -8
+  DEFINE_ARGS dst, stride, line, left, stride3
+  lea                 stride3q, [strideq*3]
+.loop:
+  movd                      m0, [leftq]
+  punpcklbw                 m0, m0
+  punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
+  pshufd                m1, m0, 0x0             ; l1 repeated 16 times
+  pshufd                m2, m0, 0x55            ; l2 repeated 16 times
+  mova     [dstq             ], m1
+  mova     [dstq+16          ], m1
+  mova     [dstq+strideq     ], m2
+  mova     [dstq+strideq+16  ], m2
+  pshufd                m1, m0, 0xaa
+  pshufd                m2, m0, 0xff
+  mova     [dstq+strideq*2   ], m1
+  mova     [dstq+strideq*2+16], m1
+  mova     [dstq+stride3q    ], m2
+  mova     [dstq+stride3q+16 ], m2
+  inc                    lineq
+  lea                    leftq, [leftq+4       ]
+  lea                     dstq, [dstq+strideq*4]
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
 cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
   pxor                  m1, m1
   movq                  m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
diff --git a/vpx_dsp/x86/intrapred_ssse3.asm b/vpx_dsp/x86/intrapred_ssse3.asm
index 88df9b2..d061278 100644
--- a/vpx_dsp/x86/intrapred_ssse3.asm
+++ b/vpx_dsp/x86/intrapred_ssse3.asm
@@ -34,80 +34,6 @@
 SECTION .text
 
 INIT_MMX ssse3
-cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  add                leftq, 4
-  mov                lineq, -2
-  pxor                  m0, m0
-.loop:
-  movd                  m1, [leftq+lineq*2  ]
-  movd                  m2, [leftq+lineq*2+1]
-  pshufb                m1, m0
-  pshufb                m2, m0
-  movd      [dstq        ], m1
-  movd      [dstq+strideq], m2
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_MMX ssse3
-cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  add                leftq, 8
-  mov                lineq, -4
-  pxor                  m0, m0
-.loop:
-  movd                  m1, [leftq+lineq*2  ]
-  movd                  m2, [leftq+lineq*2+1]
-  pshufb                m1, m0
-  pshufb                m2, m0
-  movq      [dstq        ], m1
-  movq      [dstq+strideq], m2
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_XMM ssse3
-cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  add                leftq, 16
-  mov                lineq, -8
-  pxor                  m0, m0
-.loop:
-  movd                  m1, [leftq+lineq*2  ]
-  movd                  m2, [leftq+lineq*2+1]
-  pshufb                m1, m0
-  pshufb                m2, m0
-  mova      [dstq        ], m1
-  mova      [dstq+strideq], m2
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_XMM ssse3
-cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  add                leftq, 32
-  mov                lineq, -16
-  pxor                  m0, m0
-.loop:
-  movd                  m1, [leftq+lineq*2  ]
-  movd                  m2, [leftq+lineq*2+1]
-  pshufb                m1, m0
-  pshufb                m2, m0
-  mova   [dstq           ], m1
-  mova   [dstq        +16], m1
-  mova   [dstq+strideq   ], m2
-  mova   [dstq+strideq+16], m2
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_MMX ssse3
 cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
   GET_GOT     goffsetq
 
diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
index 2675eab..80a330b 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -18,9 +18,13 @@
 
 pw_11585x2: times 8 dw 23170
 
-pw_m2404x2: times 8 dw -2404*2
-pw_m4756x2: times 8 dw -4756*2
-pw_m5520x2: times 8 dw -5520*2
+pw_m2404x2:  times 8 dw  -2404*2
+pw_m4756x2:  times 8 dw  -4756*2
+pw_m5520x2:  times 8 dw  -5520*2
+pw_m8423x2:  times 8 dw  -8423*2
+pw_m9102x2:  times 8 dw  -9102*2
+pw_m10394x2: times 8 dw -10394*2
+pw_m11003x2: times 8 dw -11003*2
 
 pw_16364x2: times 8 dw 16364*2
 pw_16305x2: times 8 dw 16305*2
@@ -29,6 +33,18 @@
 pw_15893x2: times 8 dw 15893*2
 pw_15679x2: times 8 dw 15679*2
 pw_15426x2: times 8 dw 15426*2
+pw_15137x2: times 8 dw 15137*2
+pw_14811x2: times 8 dw 14811*2
+pw_14449x2: times 8 dw 14449*2
+pw_14053x2: times 8 dw 14053*2
+pw_13623x2: times 8 dw 13623*2
+pw_13160x2: times 8 dw 13160*2
+pw_12665x2: times 8 dw 12665*2
+pw_12140x2: times 8 dw 12140*2
+pw__9760x2: times 8 dw  9760*2
+pw__7723x2: times 8 dw  7723*2
+pw__7005x2: times 8 dw  7005*2
+pw__6270x2: times 8 dw  6270*2
 pw__3981x2: times 8 dw  3981*2
 pw__3196x2: times 8 dw  3196*2
 pw__1606x2: times 8 dw  1606*2
@@ -57,6 +73,14 @@
 TRANSFORM_COEFFS    15679,  4756
 TRANSFORM_COEFFS    11585, 11585
 
+; constants for 32x32_1024
+TRANSFORM_COEFFS    12140, 11003
+TRANSFORM_COEFFS     7005, 14811
+TRANSFORM_COEFFS    14053,  8423
+TRANSFORM_COEFFS     9760, 13160
+TRANSFORM_COEFFS    12665, 10394
+TRANSFORM_COEFFS     7723, 14449
+
 %macro PAIR_PP_COEFFS 2
 dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
 %endmacro
@@ -368,23 +392,24 @@
 %define idx30 16 * 6
 %define idx31 16 * 7
 
+; FROM idct32x32_add_neon.asm
+;
+; Instead of doing the transforms stage by stage, it is done by loading
+; some input values and doing as many stages as possible to minimize the
+; storing/loading of intermediate results. To fit within registers, the
+; final coefficients are cut into four blocks:
+; BLOCK A: 16-19,28-31
+; BLOCK B: 20-23,24-27
+; BLOCK C: 8-11,12-15
+; BLOCK D: 0-3,4-7
+; Blocks A and C are straight calculation through the various stages. In
+; block B, further calculations are performed using the results from
+; block A. In block D, further calculations are performed using the results
+; from block C and then the final calculations are done using results from
+; block A and B which have been combined at the end of block B.
+;
+
 %macro IDCT32X32_34 4
-  ; FROM idct32x32_add_neon.asm
-  ;
-  ; Instead of doing the transforms stage by stage, it is done by loading
-  ; some input values and doing as many stages as possible to minimize the
-  ; storing/loading of intermediate results. To fit within registers, the
-  ; final coefficients are cut into four blocks:
-  ; BLOCK A: 16-19,28-31
-  ; BLOCK B: 20-23,24-27
-  ; BLOCK C: 8-11,12-15
-  ; BLOCK D: 0-3,4-7
-  ; Blocks A and C are straight calculation through the various stages. In
-  ; block B, further calculations are performed using the results from
-  ; block A. In block D, further calculations are performed using the results
-  ; from block C and then the final calculations are done using results from
-  ; block A and B which have been combined at the end of block B.
-  ;
   ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   mova                m11, m1
   pmulhrsw             m1, [pw___804x2] ; stp1_16
@@ -475,7 +500,7 @@
   ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 %if 0 ; overflow occurs in SUM_SUB when using test streams
   mova                m10, [pw_11585x2]
-  SUM_SUB               6,    5,  9
+  SUM_SUB               6,  5, 9
   pmulhrsw             m6, m10  ; stp1_27
   pmulhrsw             m5, m10  ; stp1_20
   SUM_SUB              13, 14,  9
@@ -539,10 +564,10 @@
   ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 %if 0 ; overflow occurs in SUM_SUB when using test streams
   mova                m10, [pw_11585x2]
-  SUM_SUB               5,    4,  9
+  SUM_SUB               5,  4, 9
   pmulhrsw             m5, m10  ; stp1_13
   pmulhrsw             m4, m10  ; stp1_10
-  SUM_SUB               6,    7,  9
+  SUM_SUB               6,  7, 9
   pmulhrsw             m6, m10  ; stp1_12
   pmulhrsw             m7, m10  ; stp1_11
 %else
@@ -783,4 +808,887 @@
   RECON_AND_STORE pass_two_start
 
   RET
+
+%macro IDCT32X32_135 4
+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m1, [rsp + transposed_in + 16 *  1]
+  mova                m11, m1
+  pmulhrsw             m1, [pw___804x2] ; stp1_16
+  pmulhrsw            m11, [pw_16364x2] ; stp2_31
+
+  mova                 m7, [rsp + transposed_in + 16 *  7]
+  mova                m12, m7
+  pmulhrsw             m7, [pw_15426x2] ; stp1_28
+  pmulhrsw            m12, [pw_m5520x2] ; stp2_19
+
+  mova                 m3, [rsp + transposed_in + 16 *  9]
+  mova                 m4, m3
+  pmulhrsw             m3, [pw__7005x2] ; stp1_18
+  pmulhrsw             m4, [pw_14811x2] ; stp2_29
+
+  mova                 m0, [rsp + transposed_in + 16 * 15]
+  mova                 m2, m0
+  pmulhrsw             m0, [pw_12140x2]  ; stp1_30
+  pmulhrsw             m2, [pw_m11003x2] ; stp2_17
+
+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
+  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
+  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
+  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
+
+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
+  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
+
+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
+  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
+  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
+  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
+
+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
+  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova [stp + %3 + idx16], m1
+  mova [stp + %3 + idx17], m0
+  mova [stp + %3 + idx18], m4
+  mova [stp + %3 + idx19], m7
+  mova [stp + %4 + idx28], m12
+  mova [stp + %4 + idx29], m3
+  mova [stp + %4 + idx30], m2
+  mova [stp + %4 + idx31], m11
+
+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m2, [rsp + transposed_in + 16 *  3]
+  mova                 m3, m2
+  pmulhrsw             m3, [pw_m2404x2] ; stp1_23
+  pmulhrsw             m2, [pw_16207x2] ; stp2_24
+
+  mova                 m5, [rsp + transposed_in + 16 *  5]
+  mova                 m6, m5
+  pmulhrsw             m5, [pw__3981x2] ; stp1_20
+  pmulhrsw             m6, [pw_15893x2] ; stp2_27
+
+  mova                m14, [rsp + transposed_in + 16 * 11]
+  mova                m13, m14
+  pmulhrsw            m13, [pw_m8423x2] ; stp1_21
+  pmulhrsw            m14, [pw_14053x2] ; stp2_26
+
+  mova                 m0, [rsp + transposed_in + 16 * 13]
+  mova                 m1, m0
+  pmulhrsw             m0, [pw__9760x2] ; stp1_22
+  pmulhrsw             m1, [pw_13160x2] ; stp2_25
+
+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
+  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
+  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
+  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
+
+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
+  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
+
+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
+  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
+  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
+  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
+
+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
+  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
+
+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %3 + idx16]
+  mova                 m7, [stp + %3 + idx17]
+  mova                m11, [stp + %3 + idx18]
+  mova                m12, [stp + %3 + idx19]
+  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
+  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
+  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
+  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
+  mova [stp + %3 + idx16], m4
+  mova [stp + %3 + idx17], m7
+  mova [stp + %3 + idx18], m11
+  mova [stp + %3 + idx19], m12
+
+  mova                 m4, [stp + %4 + idx28]
+  mova                 m7, [stp + %4 + idx29]
+  mova                m11, [stp + %4 + idx30]
+  mova                m12, [stp + %4 + idx31]
+  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
+  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
+  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
+  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
+  mova [stp + %4 + idx28], m4
+  mova [stp + %4 + idx29], m7
+  mova [stp + %4 + idx30], m11
+  mova [stp + %4 + idx31], m12
+
+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               6,  5,  9
+  pmulhrsw             m6, m10  ; stp1_27
+  pmulhrsw             m5, m10  ; stp1_20
+  SUM_SUB              13, 14,  9
+  pmulhrsw            m13, m10  ; stp1_26
+  pmulhrsw            m14, m10  ; stp1_21
+  SUM_SUB               1,  0,  9
+  pmulhrsw             m1, m10  ; stp1_25
+  pmulhrsw             m0, m10  ; stp1_22
+  SUM_SUB               2,  3,  9
+  pmulhrsw             m2, m10  ; stp1_25
+  pmulhrsw             m3, m10  ; stp1_22
+%else
+  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
+  SWAP  6, 5
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
+  SWAP 13, 14
+  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
+  SWAP  1, 0
+  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
+  SWAP  2, 3
+%endif
+  mova [stp + %3 + idx20], m5
+  mova [stp + %3 + idx21], m14
+  mova [stp + %3 + idx22], m0
+  mova [stp + %3 + idx23], m3
+  mova [stp + %4 + idx24], m2
+  mova [stp + %4 + idx25], m1
+  mova [stp + %4 + idx26], m13
+  mova [stp + %4 + idx27], m6
+
+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  2]
+  mova                 m1, m0
+  pmulhrsw             m0, [pw__1606x2] ; stp1_8
+  pmulhrsw             m1, [pw_16305x2] ; stp2_15
+
+  mova                 m6, [rsp + transposed_in + 16 *  6]
+  mova                 m7, m6
+  pmulhrsw             m7, [pw_m4756x2] ; stp2_11
+  pmulhrsw             m6, [pw_15679x2] ; stp1_12
+
+  mova                 m4, [rsp + transposed_in + 16 * 10]
+  mova                 m5, m4
+  pmulhrsw             m4, [pw__7723x2] ; stp1_10
+  pmulhrsw             m5, [pw_14449x2] ; stp2_13
+
+  mova                 m2, [rsp + transposed_in + 16 * 14]
+  mova                 m3, m2
+  pmulhrsw             m3, [pw_m10394x2] ; stp1_9
+  pmulhrsw             m2, [pw_12665x2] ; stp2_14
+
+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
+  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
+  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
+  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
+
+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
+  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
+
+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
+  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
+  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
+  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
+
+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               5,    4,  9
+  pmulhrsw             m5, m10  ; stp1_13
+  pmulhrsw             m4, m10  ; stp1_10
+  SUM_SUB               6,    7,  9
+  pmulhrsw             m6, m10  ; stp1_12
+  pmulhrsw             m7, m10  ; stp1_11
+%else
+  BUTTERFLY_4X       5,     4,  11585,  11585,  m8,  9,  10 ; stp1_10, stp1_13
+  SWAP  5, 4
+  BUTTERFLY_4X       6,     7,  11585,  11585,  m8,  9,  10 ; stp1_11, stp1_12
+  SWAP  6, 7
+%endif
+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova [stp + %2 +  idx8], m0
+  mova [stp + %2 +  idx9], m2
+  mova [stp + %2 + idx10], m4
+  mova [stp + %2 + idx11], m7
+  mova [stp + %2 + idx12], m6
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m3
+  mova [stp + %2 + idx15], m1
+
+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, [rsp + transposed_in + 16 *  4]
+  mova                m12, m11
+  pmulhrsw            m11, [pw__3196x2] ; stp1_4
+  pmulhrsw            m12, [pw_16069x2] ; stp1_7
+
+  mova                m13, [rsp + transposed_in + 16 * 12]
+  mova                m14, m13
+  pmulhrsw            m13, [pw_13623x2] ; stp1_6
+  pmulhrsw            m14, [pw_m9102x2] ; stp1_5
+
+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  0]
+  mova                 m2, [rsp + transposed_in + 16 *  8]
+  pmulhrsw             m0, [pw_11585x2]  ; stp1_1
+  mova                 m3, m2
+  pmulhrsw             m2, [pw__6270x2]  ; stp1_2
+  pmulhrsw             m3, [pw_15137x2]  ; stp1_3
+
+  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
+  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
+
+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB              13,   14,  9
+  pmulhrsw            m13, m10  ; stp1_6
+  pmulhrsw            m14, m10  ; stp1_5
+%else
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
+  SWAP 13, 14
+%endif
+  mova                 m1, m0    ; stp1_0 = stp1_1
+  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
+  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
+
+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
+  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
+  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
+  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
+
+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %2 + idx12]
+  mova                 m5, [stp + %2 + idx13]
+  mova                 m6, [stp + %2 + idx14]
+  mova                 m7, [stp + %2 + idx15]
+  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
+  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
+  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
+  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
+
+  ; 0-3, 28-31 final stage
+  mova                m10, [stp + %4 + idx31]
+  mova                m15, [stp + %4 + idx30]
+  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
+  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
+  mova [stp + %1 +  idx0], m0
+  mova [stp + %1 +  idx1], m1
+  mova [stp + %4 + idx31], m10
+  mova [stp + %4 + idx30], m15
+  mova                 m0, [stp + %4 + idx29]
+  mova                 m1, [stp + %4 + idx28]
+  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
+  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
+  mova [stp + %1 +  idx2], m2
+  mova [stp + %1 +  idx3], m3
+  mova [stp + %4 + idx29], m0
+  mova [stp + %4 + idx28], m1
+
+  ; 12-15, 16-19 final stage
+  mova                 m0, [stp + %3 + idx16]
+  mova                 m1, [stp + %3 + idx17]
+  mova                 m2, [stp + %3 + idx18]
+  mova                 m3, [stp + %3 + idx19]
+  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
+  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
+  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
+  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
+  mova [stp + %2 + idx12], m4
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m6
+  mova [stp + %2 + idx15], m7
+  mova [stp + %3 + idx16], m0
+  mova [stp + %3 + idx17], m1
+  mova [stp + %3 + idx18], m2
+  mova [stp + %3 + idx19], m3
+
+  mova                 m4, [stp + %2 +  idx8]
+  mova                 m5, [stp + %2 +  idx9]
+  mova                 m6, [stp + %2 + idx10]
+  mova                 m7, [stp + %2 + idx11]
+  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
+  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
+  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
+  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
+
+  ; 4-7, 24-27 final stage
+  mova                 m3, [stp + %4 + idx24]
+  mova                 m2, [stp + %4 + idx25]
+  mova                 m1, [stp + %4 + idx26]
+  mova                 m0, [stp + %4 + idx27]
+  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
+  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
+  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
+  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
+  mova [stp + %4 + idx24], m3
+  mova [stp + %4 + idx25], m2
+  mova [stp + %4 + idx26], m1
+  mova [stp + %4 + idx27], m0
+  mova [stp + %1 +  idx4], m11
+  mova [stp + %1 +  idx5], m14
+  mova [stp + %1 +  idx6], m13
+  mova [stp + %1 +  idx7], m12
+
+  ; 8-11, 20-23 final stage
+  mova                 m0, [stp + %3 + idx20]
+  mova                 m1, [stp + %3 + idx21]
+  mova                 m2, [stp + %3 + idx22]
+  mova                 m3, [stp + %3 + idx23]
+  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
+  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
+  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
+  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
+  mova [stp + %2 +  idx8], m4
+  mova [stp + %2 +  idx9], m5
+  mova [stp + %2 + idx10], m6
+  mova [stp + %2 + idx11], m7
+  mova [stp + %3 + idx20], m0
+  mova [stp + %3 + idx21], m1
+  mova [stp + %3 + idx22], m2
+  mova [stp + %3 + idx23], m3
+%endmacro
+
+INIT_XMM ssse3
+cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
+  mova            m8, [pd_8192]
+  mov             r6, 2
+  lea            stp, [rsp + pass_one_start]
+
+idct32x32_135:
+  mov             r3, inputq
+  lea             r4, [rsp + transposed_in]
+  mov             r7, 2
+
+idct32x32_135_transpose:
+  mova            m0, [r3 +       0]
+  mova            m1, [r3 + 16 *  4]
+  mova            m2, [r3 + 16 *  8]
+  mova            m3, [r3 + 16 * 12]
+  mova            m4, [r3 + 16 * 16]
+  mova            m5, [r3 + 16 * 20]
+  mova            m6, [r3 + 16 * 24]
+  mova            m7, [r3 + 16 * 28]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+  add             r3, 16
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_135_transpose
+
+  IDCT32X32_135 16*0, 16*32, 16*64, 16*96
+  lea            stp, [stp + 16 * 8]
+  lea         inputq, [inputq + 16 * 32]
+  dec             r6
+  jnz idct32x32_135
+
+  mov             r6, 4
+  lea            stp, [rsp + pass_one_start]
+  lea             r9, [rsp + pass_one_start]
+
+idct32x32_135_2:
+  lea             r4, [rsp + transposed_in]
+  mov             r3, r9
+  mov             r7, 2
+
+idct32x32_135_transpose_2:
+  mova            m0, [r3 +      0]
+  mova            m1, [r3 + 16 * 1]
+  mova            m2, [r3 + 16 * 2]
+  mova            m3, [r3 + 16 * 3]
+  mova            m4, [r3 + 16 * 4]
+  mova            m5, [r3 + 16 * 5]
+  mova            m6, [r3 + 16 * 6]
+  mova            m7, [r3 + 16 * 7]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+  add             r3, 16 * 8
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_135_transpose_2
+
+  IDCT32X32_135 16*0, 16*8, 16*16, 16*24
+
+  lea            stp, [stp + 16 * 32]
+  add             r9, 16 * 32
+  dec             r6
+  jnz idct32x32_135_2
+
+  RECON_AND_STORE pass_two_start
+
+  RET
+
+%macro IDCT32X32_1024 4
+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m1, [rsp + transposed_in + 16 *  1]
+  mova                m11, [rsp + transposed_in + 16 * 31]
+  BUTTERFLY_4X          1,    11,    804, 16364,  m8,  9,  10 ; stp1_16, stp1_31
+
+  mova                 m0, [rsp + transposed_in + 16 * 15]
+  mova                 m2, [rsp + transposed_in + 16 * 17]
+  BUTTERFLY_4X          2,     0,  12140, 11003,  m8,  9,  10 ; stp1_17, stp1_30
+
+  mova                 m7, [rsp + transposed_in + 16 *  7]
+  mova                m12, [rsp + transposed_in + 16 * 25]
+  BUTTERFLY_4X         12,     7,  15426,  5520,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova                 m3, [rsp + transposed_in + 16 *  9]
+  mova                 m4, [rsp + transposed_in + 16 * 23]
+  BUTTERFLY_4X          3,     4,   7005, 14811,  m8,  9,  10 ; stp1_18, stp1_29
+
+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
+  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
+  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
+  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
+
+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
+  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
+
+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
+  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
+  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
+  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
+
+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
+  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova [stp + %3 + idx16], m1
+  mova [stp + %3 + idx17], m0
+  mova [stp + %3 + idx18], m4
+  mova [stp + %3 + idx19], m7
+  mova [stp + %4 + idx28], m12
+  mova [stp + %4 + idx29], m3
+  mova [stp + %4 + idx30], m2
+  mova [stp + %4 + idx31], m11
+
+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m5, [rsp + transposed_in + 16 *  5]
+  mova                 m6, [rsp + transposed_in + 16 * 27]
+  BUTTERFLY_4X          5,     6,   3981, 15893,  m8,  9,  10 ; stp1_20, stp1_27
+
+  mova                m13, [rsp + transposed_in + 16 * 21]
+  mova                m14, [rsp + transposed_in + 16 * 11]
+  BUTTERFLY_4X         13,    14,  14053,  8423,  m8,  9,  10 ; stp1_21, stp1_26
+
+  mova                 m0, [rsp + transposed_in + 16 * 13]
+  mova                 m1, [rsp + transposed_in + 16 * 19]
+  BUTTERFLY_4X          0,     1,   9760, 13160,  m8,  9,  10 ; stp1_22, stp1_25
+
+  mova                 m2, [rsp + transposed_in + 16 *  3]
+  mova                 m3, [rsp + transposed_in + 16 * 29]
+  BUTTERFLY_4X          3,     2,  16207,  2404,  m8,  9,  10 ; stp1_23, stp1_24
+
+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
+  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
+  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
+  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
+
+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
+  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
+
+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
+  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
+  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
+  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
+
+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
+  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
+
+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %3 + idx16]
+  mova                 m7, [stp + %3 + idx17]
+  mova                m11, [stp + %3 + idx18]
+  mova                m12, [stp + %3 + idx19]
+  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
+  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
+  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
+  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
+  mova [stp + %3 + idx16], m4
+  mova [stp + %3 + idx17], m7
+  mova [stp + %3 + idx18], m11
+  mova [stp + %3 + idx19], m12
+
+  mova                 m4, [stp + %4 + idx28]
+  mova                 m7, [stp + %4 + idx29]
+  mova                m11, [stp + %4 + idx30]
+  mova                m12, [stp + %4 + idx31]
+  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
+  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
+  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
+  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
+  mova [stp + %4 + idx28], m4
+  mova [stp + %4 + idx29], m7
+  mova [stp + %4 + idx30], m11
+  mova [stp + %4 + idx31], m12
+
+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               6,  5,  9
+  pmulhrsw             m6, m10  ; stp1_27
+  pmulhrsw             m5, m10  ; stp1_20
+  SUM_SUB              13, 14,  9
+  pmulhrsw            m13, m10  ; stp1_26
+  pmulhrsw            m14, m10  ; stp1_21
+  SUM_SUB               1,  0,  9
+  pmulhrsw             m1, m10  ; stp1_25
+  pmulhrsw             m0, m10  ; stp1_22
+  SUM_SUB               2,  3,  9
+  pmulhrsw             m2, m10  ; stp1_25
+  pmulhrsw             m3, m10  ; stp1_22
+%else
+  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
+  SWAP  6, 5
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
+  SWAP 13, 14
+  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
+  SWAP  1, 0
+  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
+  SWAP  2, 3
+%endif
+  mova [stp + %3 + idx20], m5
+  mova [stp + %3 + idx21], m14
+  mova [stp + %3 + idx22], m0
+  mova [stp + %3 + idx23], m3
+  mova [stp + %4 + idx24], m2
+  mova [stp + %4 + idx25], m1
+  mova [stp + %4 + idx26], m13
+  mova [stp + %4 + idx27], m6
+
+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  2]
+  mova                 m1, [rsp + transposed_in + 16 * 30]
+  BUTTERFLY_4X          0,     1,   1606, 16305,  m8,  9,  10 ; stp1_8, stp1_15
+
+  mova                 m2, [rsp + transposed_in + 16 * 14]
+  mova                 m3, [rsp + transposed_in + 16 * 18]
+  BUTTERFLY_4X          3,     2,  12665, 10394,  m8,  9,  10 ; stp1_9, stp1_14
+
+  mova                 m4, [rsp + transposed_in + 16 * 10]
+  mova                 m5, [rsp + transposed_in + 16 * 22]
+  BUTTERFLY_4X          4,     5,   7723, 14449,  m8,  9,  10 ; stp1_10, stp1_13
+
+  mova                 m6, [rsp + transposed_in + 16 *  6]
+  mova                 m7, [rsp + transposed_in + 16 * 26]
+  BUTTERFLY_4X          7,     6,  15679,  4756,  m8,  9,  10 ; stp1_11, stp1_12
+
+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
+  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
+  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
+  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
+
+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
+  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
+
+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
+  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
+  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
+  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
+
+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               5,    4,  9
+  pmulhrsw             m5, m10  ; stp1_13
+  pmulhrsw             m4, m10  ; stp1_10
+  SUM_SUB               6,    7,  9
+  pmulhrsw             m6, m10  ; stp1_12
+  pmulhrsw             m7, m10  ; stp1_11
+%else
+  BUTTERFLY_4X       5,     4,  11585,  11585,  m8,  9,  10 ; stp1_10, stp1_13
+  SWAP  5, 4
+  BUTTERFLY_4X       6,     7,  11585,  11585,  m8,  9,  10 ; stp1_11, stp1_12
+  SWAP  6, 7
+%endif
+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova [stp + %2 +  idx8], m0
+  mova [stp + %2 +  idx9], m2
+  mova [stp + %2 + idx10], m4
+  mova [stp + %2 + idx11], m7
+  mova [stp + %2 + idx12], m6
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m3
+  mova [stp + %2 + idx15], m1
+
+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, [rsp + transposed_in + 16 *  4]
+  mova                m12, [rsp + transposed_in + 16 * 28]
+  BUTTERFLY_4X         11,    12,   3196, 16069,  m8,  9,  10 ; stp1_4, stp1_7
+
+  mova                m13, [rsp + transposed_in + 16 * 12]
+  mova                m14, [rsp + transposed_in + 16 * 20]
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_5, stp1_6
+
+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  0]
+  mova                 m1, [rsp + transposed_in + 16 * 16]
+
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               0,    1,  9
+  pmulhrsw             m0, m10  ; stp1_1
+  pmulhrsw             m1, m10  ; stp1_0
+%else
+  BUTTERFLY_4X          0,     1,  11585, 11585,  m8,  9,  10 ; stp1_1, stp1_0
+  SWAP  0, 1
+%endif
+  mova                 m2, [rsp + transposed_in + 16 *  8]
+  mova                 m3, [rsp + transposed_in + 16 * 24]
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_2, stp1_3
+
+  mova                m10, [pw_11585x2]
+  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
+  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
+
+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  SUM_SUB              13,   14,  9
+  pmulhrsw            m13, m10  ; stp1_6
+  pmulhrsw            m14, m10  ; stp1_5
+%else
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
+  SWAP 13, 14
+%endif
+  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
+  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
+
+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
+  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
+  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
+  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
+
+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %2 + idx12]
+  mova                 m5, [stp + %2 + idx13]
+  mova                 m6, [stp + %2 + idx14]
+  mova                 m7, [stp + %2 + idx15]
+  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
+  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
+  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
+  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
+
+  ; 0-3, 28-31 final stage
+  mova                m10, [stp + %4 + idx31]
+  mova                m15, [stp + %4 + idx30]
+  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
+  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
+  mova [stp + %1 +  idx0], m0
+  mova [stp + %1 +  idx1], m1
+  mova [stp + %4 + idx31], m10
+  mova [stp + %4 + idx30], m15
+  mova                 m0, [stp + %4 + idx29]
+  mova                 m1, [stp + %4 + idx28]
+  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
+  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
+  mova [stp + %1 +  idx2], m2
+  mova [stp + %1 +  idx3], m3
+  mova [stp + %4 + idx29], m0
+  mova [stp + %4 + idx28], m1
+
+  ; 12-15, 16-19 final stage
+  mova                 m0, [stp + %3 + idx16]
+  mova                 m1, [stp + %3 + idx17]
+  mova                 m2, [stp + %3 + idx18]
+  mova                 m3, [stp + %3 + idx19]
+  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
+  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
+  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
+  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
+  mova [stp + %2 + idx12], m4
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m6
+  mova [stp + %2 + idx15], m7
+  mova [stp + %3 + idx16], m0
+  mova [stp + %3 + idx17], m1
+  mova [stp + %3 + idx18], m2
+  mova [stp + %3 + idx19], m3
+
+  mova                 m4, [stp + %2 +  idx8]
+  mova                 m5, [stp + %2 +  idx9]
+  mova                 m6, [stp + %2 + idx10]
+  mova                 m7, [stp + %2 + idx11]
+  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
+  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
+  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
+  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
+
+  ; 4-7, 24-27 final stage
+  mova                 m3, [stp + %4 + idx24]
+  mova                 m2, [stp + %4 + idx25]
+  mova                 m1, [stp + %4 + idx26]
+  mova                 m0, [stp + %4 + idx27]
+  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
+  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
+  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
+  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
+  mova [stp + %4 + idx24], m3
+  mova [stp + %4 + idx25], m2
+  mova [stp + %4 + idx26], m1
+  mova [stp + %4 + idx27], m0
+  mova [stp + %1 +  idx4], m11
+  mova [stp + %1 +  idx5], m14
+  mova [stp + %1 +  idx6], m13
+  mova [stp + %1 +  idx7], m12
+
+  ; 8-11, 20-23 final stage
+  mova                 m0, [stp + %3 + idx20]
+  mova                 m1, [stp + %3 + idx21]
+  mova                 m2, [stp + %3 + idx22]
+  mova                 m3, [stp + %3 + idx23]
+  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
+  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
+  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
+  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
+  mova [stp + %2 +  idx8], m4
+  mova [stp + %2 +  idx9], m5
+  mova [stp + %2 + idx10], m6
+  mova [stp + %2 + idx11], m7
+  mova [stp + %3 + idx20], m0
+  mova [stp + %3 + idx21], m1
+  mova [stp + %3 + idx22], m2
+  mova [stp + %3 + idx23], m3
+%endmacro
+
+INIT_XMM ssse3
+cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
+  mova            m8, [pd_8192]
+  mov             r6, 4
+  lea            stp, [rsp + pass_one_start]
+
+idct32x32_1024:
+  mov             r3, inputq
+  lea             r4, [rsp + transposed_in]
+  mov             r7, 4
+
+idct32x32_1024_transpose:
+  mova            m0, [r3 +       0]
+  mova            m1, [r3 + 16 *  4]
+  mova            m2, [r3 + 16 *  8]
+  mova            m3, [r3 + 16 * 12]
+  mova            m4, [r3 + 16 * 16]
+  mova            m5, [r3 + 16 * 20]
+  mova            m6, [r3 + 16 * 24]
+  mova            m7, [r3 + 16 * 28]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+  add             r3, 16
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_1024_transpose
+
+  IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
+
+  lea            stp, [stp + 16 * 8]
+  lea         inputq, [inputq + 16 * 32]
+  dec             r6
+  jnz idct32x32_1024
+
+  mov             r6, 4
+  lea            stp, [rsp + pass_one_start]
+  lea             r9, [rsp + pass_one_start]
+
+idct32x32_1024_2:
+  lea             r4, [rsp + transposed_in]
+  mov             r3, r9
+  mov             r7, 4
+
+idct32x32_1024_transpose_2:
+  mova            m0, [r3 +      0]
+  mova            m1, [r3 + 16 * 1]
+  mova            m2, [r3 + 16 * 2]
+  mova            m3, [r3 + 16 * 3]
+  mova            m4, [r3 + 16 * 4]
+  mova            m5, [r3 + 16 * 5]
+  mova            m6, [r3 + 16 * 6]
+  mova            m7, [r3 + 16 * 7]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+  add             r3, 16 * 8
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_1024_transpose_2
+
+  IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
+
+  lea            stp, [stp + 16 * 32]
+  add             r9, 16 * 32
+  dec             r6
+  jnz idct32x32_1024_2
+
+  RECON_AND_STORE pass_two_start
+
+  RET
 %endif
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index 05dcff7..c655e4b 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -139,8 +139,10 @@
       %define sec_str sec_stridemp
 
       ;Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
-      add esp, 4                ; restore esp
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -156,8 +158,10 @@
       %define block_height heightd
 
       ;Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
-      add esp, 4                ; restore esp
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index c94b76a..708fa10 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -189,7 +189,6 @@
 %if ABI_IS_32BIT
   %if CONFIG_PIC=1
   %ifidn __OUTPUT_FORMAT__,elf32
-    %define GET_GOT_SAVE_ARG 1
     %define WRT_PLT wrt ..plt
     %macro GET_GOT 1
       extern _GLOBAL_OFFSET_TABLE_
@@ -208,7 +207,6 @@
       %define RESTORE_GOT pop %1
     %endmacro
   %elifidn __OUTPUT_FORMAT__,macho32
-    %define GET_GOT_SAVE_ARG 1
     %macro GET_GOT 1
       push %1
       call %%get_got