Merge "Alternate reference frame" into nextgenv2
diff --git a/.mailmap b/.mailmap
index 0bfda12..42f3617 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1,14 +1,21 @@
 Adrian Grange <agrange@google.com>
-Alex Converse <aconverse@google.com> <alex.converse@gmail.com>
+Adrian Grange <agrange@google.com> <agrange@agrange-macbookpro.roam.corp.google.com>
+Aℓex Converse <aconverse@google.com>
+Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
 Deb Mukherjee <debargha@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
 Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
+Hangyu Kuang <hkuang@google.com> <hkuang@hkuang-macbookpro.roam.corp.google.com>
+Hui Su <huisu@google.com>
+Jacky Chen <jackychen@google.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
 John Koleszar <jkoleszar@google.com>
 Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
 Marco Paniconi <marpan@google.com>
@@ -17,10 +24,13 @@
 Paul Wilkins <paulwilkins@google.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
+Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
 Sami Pietilä <samipietila@google.com>
 Tamar Levy <tamar.levy@intel.com>
 Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
 Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
+Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
+Yaowu Xu <yaowu@google.com> <yaowu@YAOWU2-W.ad.corp.google.com>
diff --git a/AUTHORS b/AUTHORS
index 2f63d7c..f89b677 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -5,9 +5,9 @@
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
 Adam Xu <adam@xuyaowu.com>
 Adrian Grange <agrange@google.com>
+Aℓex Converse <aconverse@google.com>
 Ahmad Sharif <asharif@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
-Alex Converse <aconverse@google.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
@@ -16,8 +16,10 @@
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
 Andrew Russell <anrussell@google.com>
+Angie Chiang <angiebird@google.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
+Brion Vibber <bvibber@wikimedia.org>
 changjun.yang <changjun.yang@intel.com>
 Charles 'Buck' Krasic <ckrasic@google.com>
 chm <chm@rock-chips.com>
@@ -27,6 +29,7 @@
 Dim Temp <dimtemp0@gmail.com>
 Dmitry Kovalev <dkovalev@google.com>
 Dragan Mrdjan <dmrdjan@mips.com>
+Ed Baker <edward.baker@intel.com>
 Ehsan Akhgari <ehsan.akhgari@gmail.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com>
 Fabio Pedretti <fabio.ped@libero.it>
@@ -34,6 +37,8 @@
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+Geza Lore <gezalore@gmail.com>
+Ghislain MARY <ghislainmary2@gmail.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
 Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
 Guillaume Martres <gmartres@google.com>
@@ -44,7 +49,7 @@
 Hui Su <huisu@google.com>
 Ivan Maltz <ivanmaltz@google.com>
 Jacek Caban <cjacek@gmail.com>
-JackyChen <jackychen@google.com>
+Jacky Chen <jackychen@google.com>
 James Berry <jamesberry@google.com>
 James Yu <james.yu@linaro.org>
 James Zern <jzern@google.com>
@@ -60,9 +65,11 @@
 Joey Parrish <joeyparrish@google.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
+Johnny Klonaris <google@jawknee.com>
 John Stark <jhnstrk@gmail.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
 Joshua Litt <joshualitt@google.com>
+Julia Robson <juliamrobson@gmail.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 KO Myung-Hun <komh@chollian.net>
@@ -82,6 +89,7 @@
 Mikhal Shemer <mikhal@google.com>
 Minghai Shang <minghai@google.com>
 Morton Jonuschat <yabawock@gmail.com>
+Nico Weber <thakis@chromium.org>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
 Patrik Westin <patrik.westin@gmail.com>
@@ -96,7 +104,7 @@
 Rafaël Carré <funman@videolan.org>
 Ralph Giles <giles@xiph.org>
 Rob Bradford <rob@linux.intel.com>
-Ronald S. Bultje <rbultje@google.com>
+Ronald S. Bultje <rsbultje@gmail.com>
 Rui Ueyama <ruiu@google.com>
 Sami Pietilä <samipietila@google.com>
 Scott Graham <scottmg@chromium.org>
@@ -104,6 +112,7 @@
 Sean McGovern <gseanmcg@gmail.com>
 Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
+Shunyao Li <shunyaoli@google.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
 Taekhyun Kim <takim@nvidia.com>
diff --git a/CHANGELOG b/CHANGELOG
index b0d3064..7746cc6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,7 +1,19 @@
-xxxx-yy-zz v1.4.0 "Changes for next release"
-  vpxenc is changed to use VP9 by default.
-  Encoder controls added for 1 pass SVC.
-  Decoder control to toggle on/off loopfilter.
+2015-11-09 v1.5.0 "Javan Whistling Duck"
+  This release improves upon the VP9 encoder and speeds up the encoding and
+  decoding processes.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.4.0. It drops deprecated VP8
+    controls and adds a variety of VP9 controls for testing.
+
+    The vpxenc utility now prefers VP9 by default.
+
+  - Enhancements:
+    Faster VP9 encoding and decoding
+    Smaller library size by combining functions used by VP8 and VP9
+
+  - Bug Fixes:
+    A variety of fuzzing issues
 
 2015-04-03 v1.4.0 "Indian Runner Duck"
   This release includes significant improvements to the VP9 codec.
diff --git a/configure b/configure
index eca4a2b..315c427 100755
--- a/configure
+++ b/configure
@@ -273,6 +273,7 @@
     ext_inter
     ext_interp
     ext_refs
+    supertx
 "
 CONFIG_LIST="
     dependency_tracking
diff --git a/libs.mk b/libs.mk
index c65df6b..e6fb068 100644
--- a/libs.mk
+++ b/libs.mk
@@ -260,7 +260,7 @@
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 
-SO_VERSION_MAJOR := 2
+SO_VERSION_MAJOR := 3
 SO_VERSION_MINOR := 0
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
@@ -429,12 +429,10 @@
           if [ -n "$${sha1sum}" ]; then\
             set -e;\
             echo "Checking test data:";\
-            if [ -n "$(LIBVPX_TEST_DATA)" ]; then\
-                for f in $(call enabled,LIBVPX_TEST_DATA); do\
-                    grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
-                        (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
-                done; \
-            fi; \
+            for f in $(call enabled,LIBVPX_TEST_DATA); do\
+                grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
+                    (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
+            done; \
         else\
             echo "Skipping test data integrity check, sha1sum not found.";\
         fi
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 98b6f87..bc91fe2 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -286,11 +286,11 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-class ResizeInternalRealtimeTest : public ::libvpx_test::EncoderTest,
+class ResizeRealtimeTest : public ::libvpx_test::EncoderTest,
   public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
-  ResizeInternalRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ResizeInternalRealtimeTest() {}
+  ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~ResizeRealtimeTest() {}
 
   virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                   libvpx_test::Encoder *encoder) {
@@ -318,8 +318,6 @@
   }
 
   void DefaultConfig() {
-    cfg_.g_w = 352;
-    cfg_.g_h = 288;
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 600;
     cfg_.rc_buf_sz = 1000;
@@ -346,13 +344,34 @@
   bool change_bitrate_;
 };
 
+TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
+  ResizingVideoSource video;
+  DefaultConfig();
+  change_bitrate_ = false;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const unsigned int frame = static_cast<unsigned>(info->pts);
+    const unsigned int expected_w = ScaleForFrameNumber(frame, kInitialWidth);
+    const unsigned int expected_h = ScaleForFrameNumber(frame, kInitialHeight);
+
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+  }
+}
+
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
 // Run at low bitrate, with resize_allowed = 1, and verify that we get
 // one resize down event.
-TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDown) {
+TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 299);
   DefaultConfig();
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
   change_bitrate_ = false;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
@@ -378,15 +397,17 @@
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
 // Start at low target bitrate, raise the bitrate in the middle of the clip,
 // scaling-up should occur after bitrate changed.
-TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
+TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 299);
+                                       30, 1, 0, 359);
   DefaultConfig();
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
   change_bitrate_ = true;
   // Disable dropped frames.
   cfg_.rc_dropframe_thresh = 0;
   // Starting bitrate low.
-  cfg_.rc_target_bitrate = 100;
+  cfg_.rc_target_bitrate = 80;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   unsigned int last_w = cfg_.g_w;
@@ -411,7 +432,7 @@
   }
 
   // Verify that we get 2 resize events in this test.
-  ASSERT_EQ(2, resize_count) << "Resizing should occur twice.";
+  ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
 }
 
 vpx_img_fmt_t CspForFrameNumber(int frame) {
@@ -524,7 +545,7 @@
                           ::testing::Values(::libvpx_test::kRealTime));
 VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,
                           ::testing::Values(::libvpx_test::kOnePassBest));
-VP9_INSTANTIATE_TEST_CASE(ResizeInternalRealtimeTest,
+VP9_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
                           ::testing::Values(::libvpx_test::kRealTime),
                           ::testing::Range(5, 9));
 VP9_INSTANTIATE_TEST_CASE(ResizeCspTest,
diff --git a/test/test.mk b/test/test.mk
index 0ac9a8a..face2ad 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -173,6 +173,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_txfm_test.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm1d_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm1d_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm2d_test.cc
 
 endif # CONFIG_SHARED
 
diff --git a/test/vp10_fwd_txfm2d_test.cc b/test/vp10_fwd_txfm2d_test.cc
new file mode 100644
index 0000000..e6416cc
--- /dev/null
+++ b/test/vp10_fwd_txfm2d_test.cc
@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "test/vp10_txfm_test.h"
+#include "vp10/common/vp10_fwd_txfm2d.h"
+#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+const int txfm_size_num = 4;
+const int txfm_size_ls[4] = {4, 8, 16, 32};
+const TXFM_2D_CFG fwd_txfm_cfg_ls[4][4] = {
+    {fwd_txfm_2d_cfg_dct_dct_4, fwd_txfm_2d_cfg_dct_adst_4,
+     fwd_txfm_2d_cfg_adst_adst_4, fwd_txfm_2d_cfg_adst_dct_4},
+    {fwd_txfm_2d_cfg_dct_dct_8, fwd_txfm_2d_cfg_dct_adst_8,
+     fwd_txfm_2d_cfg_adst_adst_8, fwd_txfm_2d_cfg_adst_dct_8},
+    {fwd_txfm_2d_cfg_dct_dct_16, fwd_txfm_2d_cfg_dct_adst_16,
+     fwd_txfm_2d_cfg_adst_adst_16, fwd_txfm_2d_cfg_adst_dct_16},
+    {fwd_txfm_2d_cfg_dct_dct_32, fwd_txfm_2d_cfg_dct_adst_32,
+     fwd_txfm_2d_cfg_adst_adst_32, fwd_txfm_2d_cfg_adst_dct_32}};
+
+const Fwd_Txfm2d_Func fwd_txfm_func_ls[4] = {
+    vp10_fwd_txfm2d_4x4, vp10_fwd_txfm2d_8x8, vp10_fwd_txfm2d_16x16,
+    vp10_fwd_txfm2d_32x32};
+
+const int txfm_type_num = 4;
+const TYPE_TXFM type_ls_0[4] = {TYPE_DCT, TYPE_DCT, TYPE_ADST, TYPE_ADST};
+const TYPE_TXFM type_ls_1[4] = {TYPE_DCT, TYPE_ADST, TYPE_ADST, TYPE_DCT};
+
+TEST(vp10_fwd_txfm2d, accuracy) {
+  for (int txfm_size_idx = 0; txfm_size_idx < txfm_size_num; ++txfm_size_idx) {
+    int txfm_size = txfm_size_ls[txfm_size_idx];
+    int sqr_txfm_size = txfm_size * txfm_size;
+    int16_t* input = new int16_t[sqr_txfm_size];
+    int32_t* output = new int32_t[sqr_txfm_size];
+    double* ref_input = new double[sqr_txfm_size];
+    double* ref_output = new double[sqr_txfm_size];
+
+    for (int txfm_type_idx = 0; txfm_type_idx < txfm_type_num;
+         ++txfm_type_idx) {
+      TXFM_2D_CFG fwd_txfm_cfg = fwd_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
+      Fwd_Txfm2d_Func fwd_txfm_func = fwd_txfm_func_ls[txfm_size_idx];
+      TYPE_TXFM type0 = type_ls_0[txfm_type_idx];
+      TYPE_TXFM type1 = type_ls_1[txfm_type_idx];
+      int amplify_bit =
+          fwd_txfm_cfg.shift[0] + fwd_txfm_cfg.shift[1] + fwd_txfm_cfg.shift[2];
+      double amplify_factor =
+          amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
+
+      ACMRandom rnd(ACMRandom::DeterministicSeed());
+      int count = 5000;
+      double avg_abs_error = 0;
+      for (int ci = 0; ci < count; ci++) {
+        for (int ni = 0; ni < sqr_txfm_size; ++ni) {
+          input[ni] = rnd.Rand16() % base;
+          ref_input[ni] = static_cast<double>(input[ni]);
+          output[ni] = 0;
+          ref_output[ni] = 0;
+        }
+
+        fwd_txfm_func(input, output, txfm_size, &fwd_txfm_cfg, bd);
+        reference_hybrid_2d(ref_input, ref_output, txfm_size, type0, type1);
+
+        for (int ni = 0; ni < sqr_txfm_size; ++ni) {
+          ref_output[ni] = round(ref_output[ni] * amplify_factor);
+          EXPECT_LE(fabs(output[ni] - ref_output[ni]) / amplify_factor, 30);
+        }
+        avg_abs_error += compute_avg_abs_error<int32_t, double>(
+            output, ref_output, sqr_txfm_size);
+      }
+
+      avg_abs_error /= amplify_factor;
+      avg_abs_error /= count;
+      // max_abs_avg_error comes from upper bound of avg_abs_error
+      // printf("type0: %d type1: %d txfm_size: %d accuracy_avg_abs_error:
+      // %f\n", type0, type1, txfm_size, avg_abs_error);
+      double max_abs_avg_error = 1.5;
+      EXPECT_LE(avg_abs_error, max_abs_avg_error);
+    }
+
+    delete[] input;
+    delete[] output;
+    delete[] ref_input;
+    delete[] ref_output;
+  }
+}
+
+}  // anonymous namespace
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index c4dce60..03e34e0 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -89,7 +89,6 @@
   // 1: an ext intra mode is used; 0: otherwise.
   uint8_t use_ext_intra_mode[PLANE_TYPES];
   EXT_INTRA_MODE ext_intra_mode[PLANE_TYPES];
-  uint8_t ext_intra_angle[PLANE_TYPES];
 } EXT_INTRA_MODE_INFO;
 #endif  // CONFIG_EXT_INTRA
 
@@ -124,6 +123,7 @@
 
 #if CONFIG_EXT_INTRA
   EXT_INTRA_MODE_INFO ext_intra_mode_info;
+  int8_t angle_delta[2];
 #endif  // CONFIG_EXT_INTRA
 
   // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
@@ -328,14 +328,14 @@
 #endif  // CONFIG_EXT_TX
 
 #if CONFIG_EXT_INTRA
-// 0: use both directional and filter modes; 1: use directional modes only.
-#define DR_ONLY 0
-// 0: use slow exhaustive search; 1: use fast sub-optimal search.
+#define ALLOW_FILTER_INTRA_MODES 1
+#define ANGLE_STEP 3
+#define MAX_ANGLE_DELTAS 3
 #define ANGLE_FAST_SEARCH 1
-// A parameter to adjust early termination in the fast search of angles.
-#define RD_ADJUSTER 1.4
-// Number of different angles that are supported
-#define EXT_INTRA_ANGLES 128
+
+static const uint8_t mode_to_angle_map[INTRA_MODES] = {
+    0, 90, 180, 45, 135, 111, 157, 203, 67, 0,  // angles in degrees
+};
 
 static const TX_TYPE filter_intra_mode_to_tx_type_lookup[FILTER_INTRA_MODES] = {
   DCT_DCT,    // FILTER_DC
@@ -349,13 +349,6 @@
   ADST_DCT,   // FILTER_D63
   ADST_ADST,  // FILTER_TM
 };
-
-// Maps the angle index to the actual prediction angle (in degrees).
-// Angle index is in the range [0, EXT_INTRA_ANGLES); the actual prediction
-// angle is in the range (0, 270).
-static INLINE int prediction_angle_map(int angle_in) {
-  return (10 + 2 * angle_in);
-}
 #endif  // CONFIG_EXT_INTRA
 
 static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
@@ -363,33 +356,44 @@
                                   int block_idx, TX_SIZE tx_size) {
   const MODE_INFO *const mi = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
-#if CONFIG_EXT_INTRA
-  const int use_ext_intra_mode_info =
-      mbmi->ext_intra_mode_info.use_ext_intra_mode[plane_type];
-  const EXT_INTRA_MODE ext_intra_mode =
-      mbmi->ext_intra_mode_info.ext_intra_mode[plane_type];
 
-  if (!is_inter_block(mbmi) && use_ext_intra_mode_info) {
-    if (!xd->lossless[mbmi->segment_id] && tx_size < TX_32X32
+#if CONFIG_EXT_INTRA
+  if (!is_inter_block(mbmi)) {
+    const int use_ext_intra_mode_info =
+        mbmi->ext_intra_mode_info.use_ext_intra_mode[plane_type];
+    const EXT_INTRA_MODE ext_intra_mode =
+        mbmi->ext_intra_mode_info.ext_intra_mode[plane_type];
+    const PREDICTION_MODE mode = (plane_type == PLANE_TYPE_Y) ?
+        get_y_mode(mi, block_idx) : mbmi->uv_mode;
+
+    if (xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+      return DCT_DCT;
+
 #if CONFIG_EXT_TX
-        && !(mbmi->sb_type >= BLOCK_8X8 && plane_type == PLANE_TYPE_Y)
+    if (mbmi->sb_type >= BLOCK_8X8 && plane_type == PLANE_TYPE_Y)
+      return mbmi->tx_type;
 #endif  // CONFIG_EXT_TX
-    ) {
-      if (ext_intra_mode > FILTER_TM_PRED) {
-        int angle = mbmi->ext_intra_mode_info.ext_intra_angle[plane_type];
-        angle = prediction_angle_map(angle);
-        assert(angle > 0 && angle < 270);
-        if (angle == 135)
-          return ADST_ADST;
-        else if (angle < 45 || angle > 225)
-          return DCT_DCT;
-        else if (angle < 135)
-          return ADST_DCT;
-        else
-          return DCT_ADST;
-      } else {
-        return filter_intra_mode_to_tx_type_lookup[ext_intra_mode];
-      }
+
+    if (use_ext_intra_mode_info)
+      return filter_intra_mode_to_tx_type_lookup[ext_intra_mode];
+
+    if (mode == DC_PRED) {
+      return DCT_DCT;
+    } else if (mode == TM_PRED) {
+      return ADST_ADST;
+    } else {
+      int angle = mode_to_angle_map[mode];
+      if (mbmi->sb_type >= BLOCK_8X8)
+        angle += mbmi->angle_delta[plane_type] * ANGLE_STEP;
+      assert(angle > 0 && angle < 270);
+      if (angle == 135)
+        return ADST_ADST;
+      else if (angle < 45 || angle > 225)
+        return DCT_DCT;
+      else if (angle < 135)
+        return ADST_DCT;
+      else
+        return DCT_ADST;
     }
   }
 #endif  // CONFIG_EXT_INTRA
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index f2502b9..ceb55df 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -774,7 +774,23 @@
 #if CONFIG_EXT_TX
 const vpx_tree_index vp10_ext_tx_inter_tree[EXT_TX_SETS_INTER]
                                            [TREE_SIZE(TX_TYPES)] = {
-  {
+  { // ToDo(yaowu): remove unused entry 0.
+    -IDTX, 2,
+    -DCT_DCT, 4,
+    -DST_DST, 6,
+    8, 18,
+    10, 12,
+    -DST_DCT, -DCT_DST,
+    14, 16,
+    -ADST_DCT, -DCT_ADST,
+    -FLIPADST_DCT, -DCT_FLIPADST,
+    20, 26,
+    22, 24,
+    -DST_ADST, -ADST_DST,
+    -DST_FLIPADST, -FLIPADST_DST,
+    28, 30,
+    -ADST_ADST, -FLIPADST_FLIPADST,
+    -ADST_FLIPADST, -FLIPADST_ADST,
   }, {
     -IDTX, 2,
     -DCT_DCT, 4,
@@ -809,7 +825,23 @@
 
 const vpx_tree_index vp10_ext_tx_intra_tree[EXT_TX_SETS_INTRA]
                                            [TREE_SIZE(TX_TYPES)] = {
-  {
+  {  // ToDo(yaowu): remove unused entry 0.
+    -IDTX, 2,
+    -DCT_DCT, 4,
+    -DST_DST, 6,
+    8, 18,
+    10, 12,
+    -DST_DCT, -DCT_DST,
+    14, 16,
+    -ADST_DCT, -DCT_ADST,
+    -FLIPADST_DCT, -DCT_FLIPADST,
+    20, 26,
+    22, 24,
+    -DST_ADST, -ADST_DST,
+    -DST_FLIPADST, -FLIPADST_DST,
+    28, 30,
+    -ADST_ADST, -FLIPADST_FLIPADST,
+    -ADST_FLIPADST, -FLIPADST_ADST,
   }, {
     -IDTX, 2,
     -DCT_DCT, 4,
@@ -832,8 +864,17 @@
 
 static const vpx_prob
 default_inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1] = {
-  {
-    // unused
+  { // ToDo(yaowu): remove unused entry 0.
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+    128 },
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+    128 },
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+    128 },
+#if EXT_TX_SIZES == 4
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+    128 },
+#endif
   }, {
     { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
       128 },
@@ -865,8 +906,94 @@
 static const vpx_prob
 default_intra_ext_tx_prob[EXT_TX_SETS_INTRA][EXT_TX_SIZES]
                          [INTRA_MODES][TX_TYPES - 1] = {
-  {
-    // unused
+  { // ToDo(yaowu): remove unused entry 0.
+    {
+      { 8, 11, 24, 112, 87, 137, 127, 134,
+      128, 86, 128, 124, 125, 133, 176, 123, },
+      { 10, 9, 39, 106, 73, 155, 163, 228,
+      35, 62, 129, 127, 133, 114, 213, 234, },
+      { 10, 9, 14, 88, 91, 127, 151, 51,
+      210, 89, 126, 58, 52, 116, 217, 24, },
+      { 9, 6, 29, 113, 98, 131, 149, 210,
+      119, 60, 124, 93, 90, 143, 170, 197, },
+      { 8, 8, 38, 101, 111, 166, 167, 141,
+      130, 105, 128, 75, 75, 118, 197, 117, },
+      { 7, 8, 39, 91, 101, 153, 166, 200,
+      99, 77, 123, 90, 83, 144, 224, 192, },
+      { 7, 10, 26, 86, 119, 154, 130, 101,
+      152, 91, 129, 75, 79, 137, 219, 77, },
+      { 10, 13, 20, 86, 102, 162, 112, 76,
+      171, 86, 134, 122, 106, 124, 196, 44, },
+      { 8, 9, 33, 108, 100, 144, 148, 215,
+      77, 60, 125, 125, 128, 126, 198, 220, },
+      { 3, 10, 29, 111, 69, 141, 204, 141,
+      139, 93, 120, 75, 77, 163, 242, 124, },
+    }, {
+      { 2, 53, 18, 147, 96, 98, 136, 133,
+      131, 120, 153, 163, 169, 137, 173, 124, },
+      { 4, 18, 34, 133, 54, 130, 179, 228,
+      28, 72, 153, 164, 168, 118, 227, 239, },
+      { 4, 18, 13, 125, 72, 110, 176, 36,
+      221, 104, 148, 75, 72, 117, 225, 19, },
+      { 8, 33, 24, 162, 113, 99, 147, 226,
+      103, 85, 153, 143, 153, 124, 155, 210, },
+      { 2, 15, 35, 107, 127, 158, 192, 128,
+      126, 116, 151, 95, 88, 182, 241, 119, },
+      { 3, 15, 36, 112, 100, 146, 194, 189,
+      90, 98, 152, 99, 100, 165, 235, 175, },
+      { 3, 16, 29, 109, 103, 140, 182, 76,
+      173, 104, 147, 82, 85, 159, 235, 70, },
+      { 9, 24, 14, 120, 86, 156, 161, 34,
+      177, 121, 142, 128, 128, 126, 185, 37, },
+      { 5, 24, 29, 152, 98, 99, 174, 228,
+      82, 76, 147, 149, 128, 132, 191, 225, },
+      { 2, 15, 29, 111, 77, 126, 200, 135,
+      117, 93, 152, 96, 84, 191, 245, 135, },
+    }, {
+      { 2, 69, 13, 173, 111, 69, 137, 159,
+      159, 146, 151, 193, 203, 131, 180, 123, },
+      { 1, 12, 33, 164, 32, 98, 204, 242,
+      23, 99, 149, 215, 232, 110, 239, 245, },
+      { 1, 17, 9, 136, 82, 83, 171, 28,
+      231, 128, 135, 76, 64, 118, 235, 17, },
+      { 4, 41, 17, 195, 131, 58, 161, 237,
+      141, 97, 153, 189, 191, 117, 182, 202, },
+      { 2, 17, 36, 104, 149, 137, 217, 139,
+      191, 119, 125, 107, 115, 223, 249, 110, },
+      { 2, 14, 24, 127, 91, 135, 219, 198,
+      113, 91, 164, 125, 173, 211, 250, 116, },
+      { 3, 19, 24, 120, 102, 130, 209, 81,
+      187, 95, 143, 102, 50, 190, 244, 56, },
+      { 4, 27, 10, 128, 91, 157, 181, 33,
+      181, 150, 141, 141, 166, 114, 215, 25, },
+      { 2, 34, 27, 187, 102, 77, 210, 245,
+      113, 107, 136, 184, 188, 121, 210, 234, },
+      { 1, 15, 22, 141, 59, 94, 208, 133,
+      154, 95, 152, 112, 105, 191, 242, 111, },
+#if EXT_TX_SIZES == 4
+    }, {
+      { 2, 69, 13, 173, 111, 69, 137, 159,
+      159, 146, 151, 193, 203, 131, 180, 123, },
+      { 1, 12, 33, 164, 32, 98, 204, 242,
+      23, 99, 149, 215, 232, 110, 239, 245, },
+      { 1, 17, 9, 136, 82, 83, 171, 28,
+      231, 128, 135, 76, 64, 118, 235, 17, },
+      { 4, 41, 17, 195, 131, 58, 161, 237,
+      141, 97, 153, 189, 191, 117, 182, 202, },
+      { 2, 17, 36, 104, 149, 137, 217, 139,
+      191, 119, 125, 107, 115, 223, 249, 110, },
+      { 2, 14, 24, 127, 91, 135, 219, 198,
+      113, 91, 164, 125, 173, 211, 250, 116, },
+      { 3, 19, 24, 120, 102, 130, 209, 81,
+      187, 95, 143, 102, 50, 190, 244, 56, },
+      { 4, 27, 10, 128, 91, 157, 181, 33,
+      181, 150, 141, 141, 166, 114, 215, 25, },
+      { 2, 34, 27, 187, 102, 77, 210, 245,
+      113, 107, 136, 184, 188, 121, 210, 234, },
+      { 1, 15, 22, 141, 59, 94, 208, 133,
+      154, 95, 152, 112, 105, 191, 242, 111, },
+#endif
+    },
   }, {
     {
       {   8,  11,  24, 112,  87, 137, 127, 134,
@@ -967,7 +1094,7 @@
 #endif
 
 #if CONFIG_EXT_INTRA
-static  const vpx_prob default_ext_intra_probs[2] = {200, 200};
+static  const vpx_prob default_ext_intra_probs[2] = {230, 230};
 #endif  // CONFIG_EXT_INTRA
 
 static void init_mode_probs(FRAME_CONTEXT *fc) {
diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h
index 00eacc5..2b5c948 100644
--- a/vp10/common/entropymode.h
+++ b/vp10/common/entropymode.h
@@ -32,11 +32,6 @@
 #define PALETTE_BLOCK_SIZES (BLOCK_64X64 - BLOCK_8X8 + 1)
 #define PALETTE_Y_MODE_CONTEXTS 3
 
-#if CONFIG_EXT_INTRA
-// Probability that an ext_intra mode is a directional prediction mode
-#define DR_EXT_INTRA_PROB 144
-#endif  // CONFIG_EXT_INTRA
-
 struct VP10Common;
 
 struct tx_probs {
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index 56e9c90..3f9395e 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -168,7 +168,6 @@
   FILTER_D207_PRED,
   FILTER_D63_PRED,
   FILTER_TM_PRED,
-  EXT_DR_PRED,
   EXT_INTRA_MODES,
 } EXT_INTRA_MODE;
 
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index 28aa915..87dc13a 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -264,7 +264,7 @@
 }
 
 #if CONFIG_MISC_FIXES
-static inline void memset16(uint16_t *dst, int val, int n) {
+static INLINE void memset16(uint16_t *dst, int val, int n) {
   while (n--)
     *dst++ = val;
 }
@@ -405,32 +405,11 @@
   }
 }
 
-static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                               const uint8_t *above, const uint8_t *left) {
-  int r;
-  (void) left;
-
-  for (r = 0; r < bs; r++) {
-    memcpy(dst, above, bs);
-    dst += stride;
-  }
-}
-
-static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                               const uint8_t *above, const uint8_t *left) {
-  int r;
-  (void) above;
-
-  for (r = 0; r < bs; r++) {
-    memset(dst, left[r], bs);
-    dst += stride;
-  }
-}
-
-static void dr_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
                          const uint8_t *above, const uint8_t *left, int angle) {
   double t = 0;
   int dx, dy;
+  int bs = 4 << tx_size;
 
   if (angle != 90 && angle != 180)
     t = tan(angle * PI / 180.0);
@@ -448,9 +427,9 @@
     dy = -((int)(256 * t));
     dr_prediction_z3(dst, stride, bs, above, left, dx, dy);
   } else if (angle == 90) {
-    v_predictor(dst, stride, bs, above, left);
+    pred[V_PRED][tx_size](dst, stride, above, left);
   } else if (angle == 180) {
-    h_predictor(dst, stride, bs, above, left);
+    pred[H_PRED][tx_size](dst, stride, above, left);
   }
 }
 
@@ -915,11 +894,7 @@
   int i;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-#if CONFIG_MISC_FIXES
-  DECLARE_ALIGNED(16, uint16_t, left_col[32]);
-#else
   DECLARE_ALIGNED(16, uint16_t, left_col[64]);
-#endif
   DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]);
   uint16_t *above_row = above_data + 16;
   const uint16_t *const_above_row = above_row;
@@ -946,37 +921,38 @@
       &xd->mi[0]->mbmi.ext_intra_mode_info;
   const EXT_INTRA_MODE ext_intra_mode =
       ext_intra_mode_info->ext_intra_mode[plane != 0];
-  const int angle =
-      prediction_angle_map(ext_intra_mode_info->ext_intra_angle[plane != 0]);
+  int p_angle = 0;
+
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    p_angle = mode_to_angle_map[mode] +
+        xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+#if CONFIG_MISC_FIXES
+    if (p_angle <= 90)
+      need_above = 1, need_left = 0;
+    else if (p_angle < 180)
+      need_above = 1, need_left = 1;
+    else
+      need_above = 0, need_left = 1;
+#else
+    if (p_angle < 90)
+      need_above = 0, need_aboveright = 1, need_left = 0;
+    else if (p_angle == 90)
+      need_above = 1, need_aboveright = 0, need_left = 0;
+    else if (p_angle < 180)
+      need_above = 1, need_aboveright = 0, need_left = 1;
+    else
+      need_above = 0, need_aboveright = 0, need_left = 1;
+#endif  // CONFIG_MISC_FIXES
+  }
 
   if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
     EXT_INTRA_MODE ext_intra_mode =
         ext_intra_mode_info->ext_intra_mode[plane != 0];
-    if (ext_intra_mode <= FILTER_TM_PRED) {
-      need_left = ext_intra_extend_modes[ext_intra_mode] & NEED_LEFT;
-      need_above = ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVE;
-      need_aboveright =
-          ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVERIGHT;
-    } else {
-      assert(angle > 0 && angle < 270);
-#if CONFIG_MISC_FIXES
-      if (angle <= 90)
-        need_above = 1, need_left = 0;
-      else if (angle < 180)
-        need_above = 1, need_left = 1;
-      else
-        need_above = 0, need_left = 1;
-#else
-      if (angle < 90)
-        need_above = 0, need_aboveright = 1, need_left = 0;
-      else if (angle == 90)
-        need_above = 1, need_aboveright = 0, need_left = 0;
-      else if (angle < 180)
-        need_above = 1, need_aboveright = 0, need_left = 1;
-      else
-        need_above = 0, need_aboveright = 0, need_left = 1;
-#endif  // CONFIG_MISC_FIXES
-    }
+    need_left = ext_intra_extend_modes[ext_intra_mode] & NEED_LEFT;
+    need_above = ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVE;
+    need_aboveright =
+        ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVERIGHT;
   }
 #endif  // CONFIG_EXT_INTRA
 
@@ -993,10 +969,10 @@
 #if CONFIG_EXT_INTRA
     int need_bottom;
     if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
-      if (ext_intra_mode <= FILTER_TM_PRED)
         need_bottom = 0;
-      else
-        need_bottom = angle > 180;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+        need_bottom = p_angle > 180;
     } else {
       need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
     }
@@ -1024,10 +1000,10 @@
 #if CONFIG_EXT_INTRA
     int need_right;
     if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
-      if (ext_intra_mode <= FILTER_TM_PRED)
-        need_right = 1;
-      else
-        need_right = angle < 90;
+      need_right = 1;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+      need_right = p_angle < 90;
     } else {
       need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
     }
@@ -1052,7 +1028,9 @@
   (void)need_aboveright;
 #if CONFIG_EXT_INTRA
   if (ext_intra_mode_info->use_ext_intra_mode[plane != 0] ||
-      (extend_modes[mode] & NEED_ABOVELEFT)) {
+      (extend_modes[mode] & NEED_ABOVELEFT) ||
+      (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8)) {
     above_row[-1] = n_top_px > 0 ?
         (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
   }
@@ -1185,13 +1163,15 @@
 
 #if CONFIG_EXT_INTRA
   if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
-    if (ext_intra_mode <= FILTER_TM_PRED)
-      highbd_filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
-                                                     const_above_row, left_col,
-                                                     bd);
-    else
-      highbd_dr_predictor(dst, dst_stride, bs, const_above_row, left_col,
-                          angle, bd);
+    highbd_filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
+        const_above_row, left_col, bd);
+    return;
+  }
+
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    highbd_dr_predictor(dst, dst_stride, bs, const_above_row, left_col,
+                        p_angle, bd);
     return;
   }
 #endif  // CONFIG_EXT_INTRA
@@ -1247,37 +1227,39 @@
       &xd->mi[0]->mbmi.ext_intra_mode_info;
   const EXT_INTRA_MODE ext_intra_mode =
       ext_intra_mode_info->ext_intra_mode[plane != 0];
-  const int angle =
-      prediction_angle_map(ext_intra_mode_info->ext_intra_angle[plane != 0]);
+  int p_angle = 0;
+
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    p_angle = mode_to_angle_map[mode] +
+        xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+
+#if CONFIG_MISC_FIXES
+    if (p_angle <= 90)
+      need_above = 1, need_left = 0;
+    else if (p_angle < 180)
+      need_above = 1, need_left = 1;
+    else
+      need_above = 0, need_left = 1;
+#else
+    if (p_angle < 90)
+      need_above = 0, need_aboveright = 1, need_left = 0;
+    else if (p_angle == 90)
+      need_above = 1, need_aboveright = 0, need_left = 0;
+    else if (p_angle < 180)
+      need_above = 1, need_aboveright = 0, need_left = 1;
+    else
+      need_above = 0, need_aboveright = 0, need_left = 1;
+#endif  // CONFIG_MISC_FIXES
+  }
 
   if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
     EXT_INTRA_MODE ext_intra_mode =
         ext_intra_mode_info->ext_intra_mode[plane != 0];
-    if (ext_intra_mode <= FILTER_TM_PRED) {
-      need_left = ext_intra_extend_modes[ext_intra_mode] & NEED_LEFT;
-      need_above = ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVE;
-      need_aboveright =
-          ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVERIGHT;
-    } else {
-      assert(angle > 0 && angle < 270);
-#if CONFIG_MISC_FIXES
-      if (angle <= 90)
-        need_above = 1, need_left = 0;
-      else if (angle < 180)
-        need_above = 1, need_left = 1;
-      else
-        need_above = 0, need_left = 1;
-#else
-      if (angle < 90)
-        need_above = 0, need_aboveright = 1, need_left = 0;
-      else if (angle == 90)
-        need_above = 1, need_aboveright = 0, need_left = 0;
-      else if (angle < 180)
-        need_above = 1, need_aboveright = 0, need_left = 1;
-      else
-        need_above = 0, need_aboveright = 0, need_left = 1;
-#endif  // CONFIG_MISC_FIXES
-    }
+    need_left = ext_intra_extend_modes[ext_intra_mode] & NEED_LEFT;
+    need_above = ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVE;
+    need_aboveright =
+        ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVERIGHT;
   }
 #endif  // CONFIG_EXT_INTRA
 
@@ -1318,10 +1300,10 @@
 #if CONFIG_EXT_INTRA
     int need_bottom;
     if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
-      if (ext_intra_mode <= FILTER_TM_PRED)
-        need_bottom = 0;
-      else
-        need_bottom = angle > 180;
+      need_bottom = 0;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+      need_bottom = p_angle > 180;
     } else {
       need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
     }
@@ -1373,10 +1355,10 @@
 #if CONFIG_EXT_INTRA
     int need_right;
     if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
-      if (ext_intra_mode <= FILTER_TM_PRED)
-        need_right = 1;
-      else
-        need_right = angle < 90;
+      need_right = 1;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+      need_right = p_angle < 90;
     } else {
       need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
     }
@@ -1428,7 +1410,9 @@
   (void)need_aboveright;
 #if CONFIG_EXT_INTRA
   if (ext_intra_mode_info->use_ext_intra_mode[plane != 0] ||
-      (extend_modes[mode] & NEED_ABOVELEFT)) {
+      (extend_modes[mode] & NEED_ABOVELEFT) ||
+      (mode != DC_PRED && mode != TM_PRED &&
+          xd->mi[0]->mbmi.sb_type >= BLOCK_8X8)) {
     above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
   }
 #else
@@ -1486,11 +1470,14 @@
 
 #if CONFIG_EXT_INTRA
   if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
-    if (ext_intra_mode <= FILTER_TM_PRED)
-      filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
-                                              const_above_row, left_col);
-    else
-      dr_predictor(dst, dst_stride, bs, const_above_row, left_col, angle);
+    filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
+        const_above_row, left_col);
+    return;
+  }
+
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    dr_predictor(dst, dst_stride, tx_size, const_above_row, left_col, p_angle);
     return;
   }
 #endif  // CONFIG_EXT_INTRA
@@ -1510,10 +1497,10 @@
 }
 
 void vp10_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in,
-                             TX_SIZE tx_size, PREDICTION_MODE mode,
-                             const uint8_t *ref, int ref_stride,
-                             uint8_t *dst, int dst_stride,
-                             int aoff, int loff, int plane) {
+                              TX_SIZE tx_size, PREDICTION_MODE mode,
+                              const uint8_t *ref, int ref_stride,
+                              uint8_t *dst, int dst_stride,
+                              int aoff, int loff, int plane) {
   const int txw = (1 << tx_size);
   const int have_top = loff || xd->up_available;
   const int have_left = aoff || xd->left_available;
diff --git a/vp10/common/vp10_inv_txfm2d_cfg.h b/vp10/common/vp10_inv_txfm2d_cfg.h
new file mode 100644
index 0000000..8cd76b5
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm2d_cfg.h
@@ -0,0 +1,377 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_INV_TXFM2D_CFG_H_
+#define VP10_INV_TXFM2D_CFG_H_
+#include "vp10/common/vp10_inv_txfm1d.h"
+
+//  ---------- config inv_dct_dct_4: col idct4, row idct4 ----------
+static const int8_t inv_shift_dct_dct_4[2] = {1, -5};
+static const int8_t inv_stage_range_col_dct_dct_4[4] = {17, 17, 16, 16};
+static const int8_t inv_stage_range_row_dct_dct_4[4] = {16, 16, 16, 16};
+static const int8_t inv_cos_bit_col_dct_dct_4[4] = {15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_dct_dct_4[4] = {15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 4,
+    .stage_num_row = 4,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_dct_dct_4,
+    .stage_range_col = inv_stage_range_col_dct_dct_4,
+    .stage_range_row = inv_stage_range_row_dct_dct_4,
+    .cos_bit_col = inv_cos_bit_col_dct_dct_4,
+    .cos_bit_row = inv_cos_bit_row_dct_dct_4,
+    .txfm_func_col = vp10_idct4_new,
+    .txfm_func_row = vp10_idct4_new};
+
+//  ---------- config inv_dct_dct_8: col idct8, row idct8 ----------
+static const int8_t inv_shift_dct_dct_8[2] = {0, -5};
+static const int8_t inv_stage_range_col_dct_dct_8[6] = {17, 17, 17, 17, 16, 16};
+static const int8_t inv_stage_range_row_dct_dct_8[6] = {17, 17, 17, 17, 17, 17};
+static const int8_t inv_cos_bit_col_dct_dct_8[6] = {15, 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_dct_dct_8[6] = {15, 15, 15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 6,
+    .stage_num_row = 6,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_dct_dct_8,
+    .stage_range_col = inv_stage_range_col_dct_dct_8,
+    .stage_range_row = inv_stage_range_row_dct_dct_8,
+    .cos_bit_col = inv_cos_bit_col_dct_dct_8,
+    .cos_bit_row = inv_cos_bit_row_dct_dct_8,
+    .txfm_func_col = vp10_idct8_new,
+    .txfm_func_row = vp10_idct8_new};
+
+//  ---------- config inv_dct_dct_16: col idct16, row idct16 ----------
+static const int8_t inv_shift_dct_dct_16[2] = {0, -6};
+static const int8_t inv_stage_range_col_dct_dct_16[8] = {18, 18, 18, 18,
+                                                         18, 18, 17, 17};
+static const int8_t inv_stage_range_row_dct_dct_16[8] = {18, 18, 18, 18,
+                                                         18, 18, 18, 18};
+static const int8_t inv_cos_bit_col_dct_dct_16[8] = {14, 14, 14, 14,
+                                                     14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_dct_dct_16[8] = {14, 14, 14, 14,
+                                                     14, 14, 14, 14};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 8,
+    .stage_num_row = 8,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_dct_dct_16,
+    .stage_range_col = inv_stage_range_col_dct_dct_16,
+    .stage_range_row = inv_stage_range_row_dct_dct_16,
+    .cos_bit_col = inv_cos_bit_col_dct_dct_16,
+    .cos_bit_row = inv_cos_bit_row_dct_dct_16,
+    .txfm_func_col = vp10_idct16_new,
+    .txfm_func_row = vp10_idct16_new};
+
+//  ---------- config inv_dct_dct_32: col idct32, row idct32 ----------
+static const int8_t inv_shift_dct_dct_32[2] = {-1, -6};
+static const int8_t inv_stage_range_col_dct_dct_32[10] = {18, 18, 18, 18, 18,
+                                                          18, 18, 18, 17, 17};
+static const int8_t inv_stage_range_row_dct_dct_32[10] = {19, 19, 19, 19, 19,
+                                                          19, 19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_dct_dct_32[10] = {14, 14, 14, 14, 14,
+                                                      14, 14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_dct_dct_32[10] = {13, 13, 13, 13, 13,
+                                                      13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 10,
+    .stage_num_row = 10,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_dct_dct_32,
+    .stage_range_col = inv_stage_range_col_dct_dct_32,
+    .stage_range_row = inv_stage_range_row_dct_dct_32,
+    .cos_bit_col = inv_cos_bit_col_dct_dct_32,
+    .cos_bit_row = inv_cos_bit_row_dct_dct_32,
+    .txfm_func_col = vp10_idct32_new,
+    .txfm_func_row = vp10_idct32_new};
+
+//  ---------- config inv_dct_adst_4: col idct4, row iadst4 ----------
+static const int8_t inv_shift_dct_adst_4[2] = {1, -5};
+static const int8_t inv_stage_range_col_dct_adst_4[4] = {17, 17, 16, 16};
+static const int8_t inv_stage_range_row_dct_adst_4[6] = {16, 16, 16,
+                                                         16, 16, 16};
+static const int8_t inv_cos_bit_col_dct_adst_4[4] = {15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_dct_adst_4[6] = {15, 15, 15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 4,
+    .stage_num_row = 6,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_dct_adst_4,
+    .stage_range_col = inv_stage_range_col_dct_adst_4,
+    .stage_range_row = inv_stage_range_row_dct_adst_4,
+    .cos_bit_col = inv_cos_bit_col_dct_adst_4,
+    .cos_bit_row = inv_cos_bit_row_dct_adst_4,
+    .txfm_func_col = vp10_idct4_new,
+    .txfm_func_row = vp10_iadst4_new};
+
+//  ---------- config inv_dct_adst_8: col idct8, row iadst8 ----------
+static const int8_t inv_shift_dct_adst_8[2] = {-1, -4};
+static const int8_t inv_stage_range_col_dct_adst_8[6] = {16, 16, 16,
+                                                         16, 15, 15};
+static const int8_t inv_stage_range_row_dct_adst_8[8] = {17, 17, 17, 17,
+                                                         17, 17, 17, 17};
+static const int8_t inv_cos_bit_col_dct_adst_8[6] = {15, 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_dct_adst_8[8] = {15, 15, 15, 15,
+                                                     15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 6,
+    .stage_num_row = 8,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_dct_adst_8,
+    .stage_range_col = inv_stage_range_col_dct_adst_8,
+    .stage_range_row = inv_stage_range_row_dct_adst_8,
+    .cos_bit_col = inv_cos_bit_col_dct_adst_8,
+    .cos_bit_row = inv_cos_bit_row_dct_adst_8,
+    .txfm_func_col = vp10_idct8_new,
+    .txfm_func_row = vp10_iadst8_new};
+
+//  ---------- config inv_dct_adst_16: col idct16, row iadst16 ----------
+static const int8_t inv_shift_dct_adst_16[2] = {1, -7};
+static const int8_t inv_stage_range_col_dct_adst_16[8] = {19, 19, 19, 19,
+                                                          19, 19, 18, 18};
+static const int8_t inv_stage_range_row_dct_adst_16[10] = {18, 18, 18, 18, 18,
+                                                           18, 18, 18, 18, 18};
+static const int8_t inv_cos_bit_col_dct_adst_16[8] = {13, 13, 13, 13,
+                                                      13, 13, 13, 14};
+static const int8_t inv_cos_bit_row_dct_adst_16[10] = {14, 14, 14, 14, 14,
+                                                       14, 14, 14, 14, 14};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 8,
+    .stage_num_row = 10,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_dct_adst_16,
+    .stage_range_col = inv_stage_range_col_dct_adst_16,
+    .stage_range_row = inv_stage_range_row_dct_adst_16,
+    .cos_bit_col = inv_cos_bit_col_dct_adst_16,
+    .cos_bit_row = inv_cos_bit_row_dct_adst_16,
+    .txfm_func_col = vp10_idct16_new,
+    .txfm_func_row = vp10_iadst16_new};
+
+//  ---------- config inv_dct_adst_32: col idct32, row iadst32 ----------
+static const int8_t inv_shift_dct_adst_32[2] = {-1, -6};
+static const int8_t inv_stage_range_col_dct_adst_32[10] = {18, 18, 18, 18, 18,
+                                                           18, 18, 18, 17, 17};
+static const int8_t inv_stage_range_row_dct_adst_32[12] = {
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_dct_adst_32[10] = {14, 14, 14, 14, 14,
+                                                       14, 14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_dct_adst_32[12] = {13, 13, 13, 13, 13, 13,
+                                                       13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 10,
+    .stage_num_row = 12,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_dct_adst_32,
+    .stage_range_col = inv_stage_range_col_dct_adst_32,
+    .stage_range_row = inv_stage_range_row_dct_adst_32,
+    .cos_bit_col = inv_cos_bit_col_dct_adst_32,
+    .cos_bit_row = inv_cos_bit_row_dct_adst_32,
+    .txfm_func_col = vp10_idct32_new,
+    .txfm_func_row = vp10_iadst32_new};
+
+//  ---------- config inv_adst_adst_4: col iadst4, row iadst4 ----------
+static const int8_t inv_shift_adst_adst_4[2] = {0, -4};
+static const int8_t inv_stage_range_col_adst_adst_4[6] = {16, 16, 16,
+                                                          16, 15, 15};
+static const int8_t inv_stage_range_row_adst_adst_4[6] = {16, 16, 16,
+                                                          16, 16, 16};
+static const int8_t inv_cos_bit_col_adst_adst_4[6] = {15, 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_adst_adst_4[6] = {15, 15, 15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 6,
+    .stage_num_row = 6,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_adst_adst_4,
+    .stage_range_col = inv_stage_range_col_adst_adst_4,
+    .stage_range_row = inv_stage_range_row_adst_adst_4,
+    .cos_bit_col = inv_cos_bit_col_adst_adst_4,
+    .cos_bit_row = inv_cos_bit_row_adst_adst_4,
+    .txfm_func_col = vp10_iadst4_new,
+    .txfm_func_row = vp10_iadst4_new};
+
+//  ---------- config inv_adst_adst_8: col iadst8, row iadst8 ----------
+static const int8_t inv_shift_adst_adst_8[2] = {-1, -4};
+static const int8_t inv_stage_range_col_adst_adst_8[8] = {16, 16, 16, 16,
+                                                          16, 16, 15, 15};
+static const int8_t inv_stage_range_row_adst_adst_8[8] = {17, 17, 17, 17,
+                                                          17, 17, 17, 17};
+static const int8_t inv_cos_bit_col_adst_adst_8[8] = {15, 15, 15, 15,
+                                                      15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_adst_adst_8[8] = {15, 15, 15, 15,
+                                                      15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 8,
+    .stage_num_row = 8,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_adst_adst_8,
+    .stage_range_col = inv_stage_range_col_adst_adst_8,
+    .stage_range_row = inv_stage_range_row_adst_adst_8,
+    .cos_bit_col = inv_cos_bit_col_adst_adst_8,
+    .cos_bit_row = inv_cos_bit_row_adst_adst_8,
+    .txfm_func_col = vp10_iadst8_new,
+    .txfm_func_row = vp10_iadst8_new};
+
+//  ---------- config inv_adst_adst_16: col iadst16, row iadst16 ----------
+static const int8_t inv_shift_adst_adst_16[2] = {0, -6};
+static const int8_t inv_stage_range_col_adst_adst_16[10] = {18, 18, 18, 18, 18,
+                                                            18, 18, 18, 17, 17};
+static const int8_t inv_stage_range_row_adst_adst_16[10] = {18, 18, 18, 18, 18,
+                                                            18, 18, 18, 18, 18};
+static const int8_t inv_cos_bit_col_adst_adst_16[10] = {14, 14, 14, 14, 14,
+                                                        14, 14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_adst_adst_16[10] = {14, 14, 14, 14, 14,
+                                                        14, 14, 14, 14, 14};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 10,
+    .stage_num_row = 10,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_adst_adst_16,
+    .stage_range_col = inv_stage_range_col_adst_adst_16,
+    .stage_range_row = inv_stage_range_row_adst_adst_16,
+    .cos_bit_col = inv_cos_bit_col_adst_adst_16,
+    .cos_bit_row = inv_cos_bit_row_adst_adst_16,
+    .txfm_func_col = vp10_iadst16_new,
+    .txfm_func_row = vp10_iadst16_new};
+
+//  ---------- config inv_adst_adst_32: col iadst32, row iadst32 ----------
+static const int8_t inv_shift_adst_adst_32[2] = {-1, -6};
+static const int8_t inv_stage_range_col_adst_adst_32[12] = {
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17};
+static const int8_t inv_stage_range_row_adst_adst_32[12] = {
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_adst_adst_32[12] = {14, 14, 14, 14, 14, 14,
+                                                        14, 14, 14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_adst_adst_32[12] = {13, 13, 13, 13, 13, 13,
+                                                        13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 12,
+    .stage_num_row = 12,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_adst_adst_32,
+    .stage_range_col = inv_stage_range_col_adst_adst_32,
+    .stage_range_row = inv_stage_range_row_adst_adst_32,
+    .cos_bit_col = inv_cos_bit_col_adst_adst_32,
+    .cos_bit_row = inv_cos_bit_row_adst_adst_32,
+    .txfm_func_col = vp10_iadst32_new,
+    .txfm_func_row = vp10_iadst32_new};
+
+//  ---------- config inv_adst_dct_4: col iadst4, row idct4 ----------
+static const int8_t inv_shift_adst_dct_4[2] = {1, -5};
+static const int8_t inv_stage_range_col_adst_dct_4[6] = {17, 17, 17, 17, 16, 16};
+static const int8_t inv_stage_range_row_adst_dct_4[4] = {16, 16, 16, 16};
+static const int8_t inv_cos_bit_col_adst_dct_4[6] = {15, 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_adst_dct_4[4] = {15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 6,
+    .stage_num_row = 4,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_adst_dct_4,
+    .stage_range_col = inv_stage_range_col_adst_dct_4,
+    .stage_range_row = inv_stage_range_row_adst_dct_4,
+    .cos_bit_col = inv_cos_bit_col_adst_dct_4,
+    .cos_bit_row = inv_cos_bit_row_adst_dct_4,
+    .txfm_func_col = vp10_iadst4_new,
+    .txfm_func_row = vp10_idct4_new};
+
+//  ---------- config inv_adst_dct_8: col iadst8, row idct8 ----------
+static const int8_t inv_shift_adst_dct_8[2] = {-1, -4};
+static const int8_t inv_stage_range_col_adst_dct_8[8] = {16, 16, 16, 16,
+                                                   16, 16, 15, 15};
+static const int8_t inv_stage_range_row_adst_dct_8[6] = {17, 17, 17, 17, 17, 17};
+static const int8_t inv_cos_bit_col_adst_dct_8[8] = {15, 15, 15, 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_adst_dct_8[6] = {15, 15, 15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 8,
+    .stage_num_row = 6,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_adst_dct_8,
+    .stage_range_col = inv_stage_range_col_adst_dct_8,
+    .stage_range_row = inv_stage_range_row_adst_dct_8,
+    .cos_bit_col = inv_cos_bit_col_adst_dct_8,
+    .cos_bit_row = inv_cos_bit_row_adst_dct_8,
+    .txfm_func_col = vp10_iadst8_new,
+    .txfm_func_row = vp10_idct8_new};
+
+//  ---------- config inv_adst_dct_16: col iadst16, row idct16 ----------
+static const int8_t inv_shift_adst_dct_16[2] = {-1, -5};
+static const int8_t inv_stage_range_col_adst_dct_16[10] = {17, 17, 17, 17, 17,
+                                                     17, 17, 17, 16, 16};
+static const int8_t inv_stage_range_row_adst_dct_16[8] = {18, 18, 18, 18,
+                                                    18, 18, 18, 18};
+static const int8_t inv_cos_bit_col_adst_dct_16[10] = {15, 15, 15, 15, 15,
+                                                 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_adst_dct_16[8] = {14, 14, 14, 14, 14, 14, 14, 14};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 10,
+    .stage_num_row = 8,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_adst_dct_16,
+    .stage_range_col = inv_stage_range_col_adst_dct_16,
+    .stage_range_row = inv_stage_range_row_adst_dct_16,
+    .cos_bit_col = inv_cos_bit_col_adst_dct_16,
+    .cos_bit_row = inv_cos_bit_row_adst_dct_16,
+    .txfm_func_col = vp10_iadst16_new,
+    .txfm_func_row = vp10_idct16_new};
+
+//  ---------- config inv_adst_dct_32: col iadst32, row idct32 ----------
+static const int8_t inv_shift_adst_dct_32[2] = {-1, -6};
+static const int8_t inv_stage_range_col_adst_dct_32[12] = {18, 18, 18, 18, 18, 18,
+                                                     18, 18, 18, 18, 17, 17};
+static const int8_t inv_stage_range_row_adst_dct_32[10] = {19, 19, 19, 19, 19,
+                                                     19, 19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_adst_dct_32[12] = {14, 14, 14, 14, 14, 14,
+                                                 14, 14, 14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_adst_dct_32[10] = {13, 13, 13, 13, 13,
+                                                 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 12,
+    .stage_num_row = 10,
+    // stage_range_* and cos_bit_* arrays are sized to the stage counts above.
+    .shift = inv_shift_adst_dct_32,
+    .stage_range_col = inv_stage_range_col_adst_dct_32,
+    .stage_range_row = inv_stage_range_row_adst_dct_32,
+    .cos_bit_col = inv_cos_bit_col_adst_dct_32,
+    .cos_bit_row = inv_cos_bit_row_adst_dct_32,
+    .txfm_func_col = vp10_iadst32_new,
+    .txfm_func_row = vp10_idct32_new};
+
+#endif  // VP10_INV_TXFM2D_CFG_H_
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index cc0f3f0..a8868d4 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -346,18 +346,16 @@
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   FRAME_COUNTS *counts = xd->counts;
+
+#if !ALLOW_FILTER_INTRA_MODES
+  return;
+#endif
   if (mbmi->mode == DC_PRED) {
     mbmi->ext_intra_mode_info.use_ext_intra_mode[0] =
         vpx_read(r, cm->fc->ext_intra_probs[0]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
-      if (DR_ONLY ? 1 : vpx_read(r, DR_EXT_INTRA_PROB)) {
-        mbmi->ext_intra_mode_info.ext_intra_mode[0] = EXT_DR_PRED;
-        mbmi->ext_intra_mode_info.ext_intra_angle[0] =
-            read_uniform(r, EXT_INTRA_ANGLES);
-      } else {
-        mbmi->ext_intra_mode_info.ext_intra_mode[0] =
-            read_uniform(r, FILTER_INTRA_MODES);
-      }
+      mbmi->ext_intra_mode_info.ext_intra_mode[0] =
+          read_uniform(r, FILTER_INTRA_MODES);
     }
     if (counts)
       ++counts->ext_intra[0][mbmi->ext_intra_mode_info.use_ext_intra_mode[0]];
@@ -366,14 +364,8 @@
     mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
         vpx_read(r, cm->fc->ext_intra_probs[1]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1]) {
-      if (DR_ONLY ? 1 : vpx_read(r, DR_EXT_INTRA_PROB)) {
-        mbmi->ext_intra_mode_info.ext_intra_mode[1] = EXT_DR_PRED;
-        mbmi->ext_intra_mode_info.ext_intra_angle[1] =
-            read_uniform(r, EXT_INTRA_ANGLES);
-      } else {
-        mbmi->ext_intra_mode_info.ext_intra_mode[1] =
-            read_uniform(r, FILTER_INTRA_MODES);
-      }
+      mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+          read_uniform(r, FILTER_INTRA_MODES);
     }
     if (counts)
       ++counts->ext_intra[1][mbmi->ext_intra_mode_info.use_ext_intra_mode[1]];
@@ -426,9 +418,20 @@
     default:
       mbmi->mode = read_intra_mode(r,
           get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#if CONFIG_EXT_INTRA
+      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+        mbmi->angle_delta[0] =
+            read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+#endif  // CONFIG_EXT_INTRA
   }
 
   mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+#if CONFIG_EXT_INTRA
+  if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED &&
+      bsize >= BLOCK_8X8)
+    mbmi->angle_delta[1] =
+        read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+#endif
 
   mbmi->palette_mode_info.palette_size[0] = 0;
   mbmi->palette_mode_info.palette_size[1] = 0;
@@ -591,8 +594,6 @@
                                       cm->fc->switchable_interp_prob[ctx]);
   if (counts)
     ++counts->switchable_interp[ctx][type];
-  // printf("%d/%d -> %d, %d\n", cm->current_video_frame, cm->show_frame,
-  //        xd->mi[0]->mbmi.sb_type, xd->mi[0]->mbmi.interp_filter);
   return type;
 }
 
@@ -626,9 +627,22 @@
       break;
     default:
       mbmi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]);
+#if CONFIG_EXT_INTRA
+      mbmi->angle_delta[0] = 0;
+      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+        mbmi->angle_delta[0] =
+            read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+#endif  // CONFIG_EXT_INTRA
   }
 
   mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+#if CONFIG_EXT_INTRA
+  if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED &&
+      bsize >= BLOCK_8X8)
+    mbmi->angle_delta[1] =
+        read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+#endif  // CONFIG_EXT_INTRA
+
   mbmi->palette_mode_info.palette_size[0] = 0;
   mbmi->palette_mode_info.palette_size[1] = 0;
 #if CONFIG_EXT_INTRA
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index beb3414..a7b1f24 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -520,19 +520,15 @@
 static void write_ext_intra_mode_info(const VP10_COMMON *const cm,
                                       const MB_MODE_INFO *const mbmi,
                                       vpx_writer *w) {
+#if !ALLOW_FILTER_INTRA_MODES
+  return;
+#endif
   if (mbmi->mode == DC_PRED) {
     vpx_write(w, mbmi->ext_intra_mode_info.use_ext_intra_mode[0],
               cm->fc->ext_intra_probs[0]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
       EXT_INTRA_MODE mode = mbmi->ext_intra_mode_info.ext_intra_mode[0];
-      int dr_mode = mode > FILTER_TM_PRED;
-      if (!DR_ONLY)
-        vpx_write(w, dr_mode, DR_EXT_INTRA_PROB);
-      if (dr_mode)
-        write_uniform(w, EXT_INTRA_ANGLES,
-                      mbmi->ext_intra_mode_info.ext_intra_angle[0]);
-      else
-        write_uniform(w, FILTER_INTRA_MODES, mode);
+      write_uniform(w, FILTER_INTRA_MODES, mode);
     }
   }
   if (mbmi->uv_mode == DC_PRED) {
@@ -540,14 +536,7 @@
               cm->fc->ext_intra_probs[1]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1]) {
       EXT_INTRA_MODE mode = mbmi->ext_intra_mode_info.ext_intra_mode[1];
-      int dr_mode = mode > FILTER_TM_PRED;
-      if (!DR_ONLY)
-        vpx_write(w, dr_mode, DR_EXT_INTRA_PROB);
-      if (dr_mode)
-        write_uniform(w, EXT_INTRA_ANGLES,
-                      mbmi->ext_intra_mode_info.ext_intra_angle[1]);
-      else
-        write_uniform(w, FILTER_INTRA_MODES, mode);
+      write_uniform(w, FILTER_INTRA_MODES, mode);
     }
   }
 }
@@ -644,6 +633,12 @@
   if (!is_inter) {
     if (bsize >= BLOCK_8X8) {
       write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]);
+#if CONFIG_EXT_INTRA
+      if (mode != DC_PRED && mode != TM_PRED) {
+        write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+                      MAX_ANGLE_DELTAS + mbmi->angle_delta[0]);
+      }
+#endif  // CONFIG_EXT_INTRA
     } else {
       int idx, idy;
       const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -657,6 +652,11 @@
     }
     write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
 #if CONFIG_EXT_INTRA
+    if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED &&
+        bsize >= BLOCK_8X8)
+      write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+                    MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
+
     if (bsize >= BLOCK_8X8)
       write_ext_intra_mode_info(cm, mbmi, w);
 #endif  // CONFIG_EXT_INTRA
@@ -782,6 +782,11 @@
   if (bsize >= BLOCK_8X8) {
     write_intra_mode(w, mbmi->mode,
                      get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#if CONFIG_EXT_INTRA
+    if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+      write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+                    MAX_ANGLE_DELTAS + mbmi->angle_delta[0]);
+#endif  // CONFIG_EXT_INTRA
   } else {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
     const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -797,6 +802,12 @@
   }
 
   write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mbmi->mode]);
+#if CONFIG_EXT_INTRA
+  if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED &&
+      bsize >= BLOCK_8X8)
+    write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+                  MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
+#endif  // CONFIG_EXT_INTRA
 
   if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools &&
       mbmi->mode == DC_PRED)
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index adcd547..5c447b2 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -912,9 +912,9 @@
       if (cpi->sf.tx_size_search_breakout &&
           (rd == INT64_MAX ||
 #if CONFIG_EXT_TX
-           (s == 1 && tx_type != DCT_DCT) ||
+           (s == 1 && tx_type != DCT_DCT && n < start_tx) ||
 #else
-           (s == 1) ||
+           (s == 1 && n < start_tx) ||
 #endif
            (n < (int) max_tx_size && rd > last_rd)))
         break;
@@ -1507,10 +1507,7 @@
   MB_MODE_INFO *mbmi = &mic->mbmi;
   int this_rate, this_rate_tokenonly, s;
   int ext_intra_selected_flag = 0;
-  int i, step, delta, angle, best_angle, best_angle_dir;
-  int deltas[3] = {25, 5, 1};
-  int branches[3] = {2, 2, 2};
-  int64_t this_distortion, this_rd, best_angle_rd = INT64_MAX;
+  int64_t this_distortion, this_rd;
   EXT_INTRA_MODE mode;
   TX_SIZE best_tx_size = TX_4X4;
   EXT_INTRA_MODE_INFO ext_intra_mode_info;
@@ -1522,123 +1519,30 @@
   mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 1;
   mbmi->mode = DC_PRED;
 
-  if (!DR_ONLY) {
-    for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
-      mbmi->ext_intra_mode_info.ext_intra_mode[0] = mode;
-      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-                      &s, NULL, bsize, *best_rd);
-      if (this_rate_tokenonly == INT_MAX)
-        continue;
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    mbmi->ext_intra_mode_info.ext_intra_mode[0] = mode;
+    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                    &s, NULL, bsize, *best_rd);
+    if (this_rate_tokenonly == INT_MAX)
+      continue;
 
-      this_rate = this_rate_tokenonly +
-          vp10_cost_bit(cpi->common.fc->ext_intra_probs[0], 1) +
-          vp10_cost_bit(DR_EXT_INTRA_PROB, 0) +
-          write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost;
-      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+    this_rate = this_rate_tokenonly +
+        vp10_cost_bit(cpi->common.fc->ext_intra_probs[0], 1) +
+        write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost;
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
-      if (this_rd < *best_rd) {
-        *best_rd            = this_rd;
-        best_tx_size        = mic->mbmi.tx_size;
-        ext_intra_mode_info = mbmi->ext_intra_mode_info;
+    if (this_rd < *best_rd) {
+      *best_rd            = this_rd;
+      best_tx_size        = mic->mbmi.tx_size;
+      ext_intra_mode_info = mbmi->ext_intra_mode_info;
 #if CONFIG_EXT_TX
-        best_tx_type        = mic->mbmi.tx_type;
+      best_tx_type        = mic->mbmi.tx_type;
 #endif  // CONFIG_EXT_TX
-        *rate               = this_rate;
-        *rate_tokenonly     = this_rate_tokenonly;
-        *distortion         = this_distortion;
-        *skippable          = s;
-        ext_intra_selected_flag = 1;
-      }
-    }
-  }
-
-  mbmi->ext_intra_mode_info.ext_intra_mode[0] = EXT_DR_PRED;
-  if (ANGLE_FAST_SEARCH) {
-    best_angle = EXT_INTRA_ANGLES / 2;
-    for (step = 0; step < 3; ++step) {
-      delta = deltas[step];
-      for (i = -branches[step]; i <= branches[step]; ++i) {
-        int64_t rd_thresh;
-        if (i == 0 && step != 0)
-          continue;
-        angle = best_angle + i * delta;
-        if (angle < 0)
-          angle = 0;
-        if (angle >= EXT_INTRA_ANGLES)
-          angle = EXT_INTRA_ANGLES - 1;
-        if (angle == best_angle && step != 0)
-          continue;
-        mbmi->ext_intra_mode_info.ext_intra_angle[0] = angle;
-        if (*best_rd == INT64_MAX)
-          rd_thresh = best_angle_rd;
-        else
-          rd_thresh = VPXMIN(best_angle_rd, *best_rd * RD_ADJUSTER);
-        super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-                        &s, NULL, bsize, rd_thresh);
-        if (this_rate_tokenonly == INT_MAX)
-          continue;
-        this_rate = this_rate_tokenonly +
-            vp10_cost_bit(cpi->common.fc->ext_intra_probs[0], 1) +
-            (DR_ONLY ? 0: vp10_cost_bit(DR_EXT_INTRA_PROB, 1)) +
-            write_uniform_cost(EXT_INTRA_ANGLES, angle) + mode_cost;
-        this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-        if (this_rd < *best_rd) {
-          *best_rd            = this_rd;
-          best_tx_size        = mic->mbmi.tx_size;
-          ext_intra_mode_info = mbmi->ext_intra_mode_info;
-#if CONFIG_EXT_TX
-          best_tx_type        = mic->mbmi.tx_type;
-#endif  // CONFIG_EXT_TX
-          *rate               = this_rate;
-          *rate_tokenonly     = this_rate_tokenonly;
-          *distortion         = this_distortion;
-          *skippable          = s;
-          ext_intra_selected_flag = 1;
-        }
-        if (this_rd < best_angle_rd) {
-          best_angle_rd = this_rd;
-          best_angle_dir = i;
-        }
-      }
-
-      best_angle += best_angle_dir * delta;
-      if (best_angle < 0)
-        best_angle = 0;
-      if (best_angle >= EXT_INTRA_ANGLES)
-        best_angle = EXT_INTRA_ANGLES - 1;
-      if (*best_rd < best_angle_rd / RD_ADJUSTER)
-        break;
-    }
-  } else {
-    for (angle = 0; angle < EXT_INTRA_ANGLES; ++angle) {
-      mbmi->ext_intra_mode_info.ext_intra_angle[0] = angle;
-      if (prediction_angle_map(angle) == 90 ||
-          prediction_angle_map(angle) == 180)
-        continue;
-      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-                      &s, NULL, bsize, *best_rd);
-      if (this_rate_tokenonly == INT_MAX)
-        continue;
-
-      this_rate = this_rate_tokenonly +
-          vp10_cost_bit(cpi->common.fc->ext_intra_probs[0], 1) +
-          (DR_ONLY ? 0: vp10_cost_bit(DR_EXT_INTRA_PROB, 1)) +
-          write_uniform_cost(EXT_INTRA_ANGLES, angle) + mode_cost;
-      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-      if (this_rd < *best_rd) {
-        *best_rd            = this_rd;
-        best_tx_size        = mic->mbmi.tx_size;
-        ext_intra_mode_info = mbmi->ext_intra_mode_info;
-#if CONFIG_EXT_TX
-        best_tx_type        = mic->mbmi.tx_type;
-#endif  // CONFIG_EXT_TX
-        *rate               = this_rate;
-        *rate_tokenonly     = this_rate_tokenonly;
-        *distortion         = this_distortion;
-        *skippable          = s;
-        ext_intra_selected_flag = 1;
-      }
+      *rate               = this_rate;
+      *rate_tokenonly     = this_rate_tokenonly;
+      *distortion         = this_distortion;
+      *skippable          = s;
+      ext_intra_selected_flag = 1;
     }
   }
 
@@ -1649,8 +1553,6 @@
         ext_intra_mode_info.use_ext_intra_mode[0];
     mbmi->ext_intra_mode_info.ext_intra_mode[0] =
         ext_intra_mode_info.ext_intra_mode[0];
-    mbmi->ext_intra_mode_info.ext_intra_angle[0] =
-        ext_intra_mode_info.ext_intra_angle[0];
 #if CONFIG_EXT_TX
     mbmi->tx_type = best_tx_type;
 #endif  // CONFIG_EXT_TX
@@ -1659,6 +1561,132 @@
     return 0;
   }
 }
+
+// Picks the best angle_delta[0] for the current directional luma mode and
+// stores it, with the matching tx_size (and tx_type), in mbmi. rate_overhead
+// is added to each candidate's token rate. *rate, *distortion and *skippable
+// are written only when a candidate beats best_rd; *rate_tokenonly is INT_MAX
+// otherwise. Returns the best RD cost, or best_rd unchanged if none beat it.
+static int64_t rd_pick_intra_angle_sby(VP10_COMP *cpi, MACROBLOCK *x,
+                                       int *rate, int *rate_tokenonly,
+                                       int64_t *distortion, int *skippable,
+                                       BLOCK_SIZE bsize, int rate_overhead,
+                                       int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  MB_MODE_INFO *mbmi = &mic->mbmi;
+  int this_rate, this_rate_tokenonly, s;
+  int angle_delta, best_angle_delta = 0;
+  const double rd_adjust = 1.2;
+  int64_t this_distortion, this_rd, sse_dummy;
+  TX_SIZE best_tx_size = mbmi->tx_size;
+#if CONFIG_EXT_TX
+  TX_TYPE best_tx_type = mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
+
+  *rate_tokenonly = INT_MAX;  // "nothing found" marker, as in the UV search
+  if (ANGLE_FAST_SEARCH) {
+    int deltas_level1[3] = {0, -2, 2};
+    int deltas_level2[3][2] = {{-1, 1}, {-3, -1}, {1, 3}};
+    const int level1 = 3, level2 = 2;
+    int i, j, best_i = -1;
+
+    for (i = 0; i < level1; ++i) {
+      mbmi->angle_delta[0] = deltas_level1[i];
+      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                      &s, NULL, bsize,
+                      (i == 0 && best_rd < INT64_MAX) ? best_rd * rd_adjust :
+                          best_rd);
+      if (this_rate_tokenonly == INT_MAX) {
+        if (i == 0)
+          break;  // zero delta failed: skip the remaining candidates
+        continue;
+      }
+      this_rate = this_rate_tokenonly + rate_overhead;
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (i == 0 && best_rd < INT64_MAX && this_rd > best_rd * rd_adjust)
+        break;
+      if (this_rd < best_rd) {
+        best_i              = i;
+        best_rd             = this_rd;
+        best_angle_delta    = mbmi->angle_delta[0];
+        best_tx_size        = mbmi->tx_size;
+#if CONFIG_EXT_TX
+        best_tx_type        = mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
+        *rate               = this_rate;
+        *rate_tokenonly     = this_rate_tokenonly;
+        *distortion         = this_distortion;
+        *skippable          = s;
+      }
+    }
+
+    if (best_i >= 0) {  // refine around the level-1 winner
+      for (j = 0; j < level2; ++j) {
+        mbmi->angle_delta[0] = deltas_level2[best_i][j];
+        super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                        &s, NULL, bsize, best_rd);
+        if (this_rate_tokenonly == INT_MAX)
+          continue;
+        this_rate = this_rate_tokenonly + rate_overhead;
+        this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+        if (this_rd < best_rd) {
+          best_rd             = this_rd;
+          best_angle_delta    = mbmi->angle_delta[0];
+          best_tx_size        = mbmi->tx_size;
+#if CONFIG_EXT_TX
+          best_tx_type        = mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
+          *rate               = this_rate;
+          *rate_tokenonly     = this_rate_tokenonly;
+          *distortion         = this_distortion;
+          *skippable          = s;
+        }
+      }
+    }
+  } else {
+    for (angle_delta = -MAX_ANGLE_DELTAS; angle_delta <= MAX_ANGLE_DELTAS;
+        ++angle_delta) {
+      mbmi->angle_delta[0] = angle_delta;
+      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                      &s, NULL, bsize, best_rd);
+      if (this_rate_tokenonly == INT_MAX)
+        continue;
+      this_rate = this_rate_tokenonly + rate_overhead;
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (this_rd < best_rd) {
+        best_rd             = this_rd;
+        best_angle_delta    = mbmi->angle_delta[0];
+        best_tx_size        = mbmi->tx_size;
+#if CONFIG_EXT_TX
+        best_tx_type        = mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
+        *rate               = this_rate;
+        *rate_tokenonly     = this_rate_tokenonly;
+        *distortion         = this_distortion;
+        *skippable          = s;
+      }
+    }
+  }
+
+  mbmi->tx_size = best_tx_size;
+  mbmi->angle_delta[0] = best_angle_delta;
+#if CONFIG_EXT_TX
+  mbmi->tx_type = best_tx_type;
+#endif  // CONFIG_EXT_TX
+
+  if (*rate_tokenonly < INT_MAX) {
+    txfm_rd_in_plane(x,
+#if CONFIG_VAR_TX
+                     cpi,
+#endif
+                     &this_rate_tokenonly, &this_distortion, &s,
+                     &sse_dummy, INT64_MAX, 0, bsize, mbmi->tx_size,
+                     cpi->sf.use_fast_coef_costing);
+  }
+
+  return best_rd;
+}
 #endif  // CONFIG_EXT_INTRA
 
 // This function is used only for intra_only frames
@@ -1676,6 +1704,7 @@
   TX_SIZE best_tx = TX_4X4;
 #if CONFIG_EXT_INTRA
   EXT_INTRA_MODE_INFO ext_intra_mode_info;
+  int is_directional_mode, rate_overhead, best_angle_delta = 0;
 #endif  // CONFIG_EXT_INTRA
 #if CONFIG_EXT_TX
   TX_TYPE best_tx_type = DCT_DCT;
@@ -1696,6 +1725,7 @@
 #if CONFIG_EXT_INTRA
   ext_intra_mode_info.use_ext_intra_mode[0] = 0;
   mic->mbmi.ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mic->mbmi.angle_delta[0] = 0;
 #endif  // CONFIG_EXT_INTRA
   memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
   palette_mode_info.palette_size[0] = 0;
@@ -1708,9 +1738,24 @@
   /* Y Search for intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     mic->mbmi.mode = mode;
-
+#if CONFIG_EXT_INTRA
+    is_directional_mode = (mode != DC_PRED && mode != TM_PRED);
+    if (is_directional_mode) {
+      rate_overhead = bmode_costs[mode] +
+          write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0);
+      this_rate_tokenonly = INT_MAX;  // kept if no angle beats best_rd
+      this_rd = rd_pick_intra_angle_sby(cpi, x, &this_rate,
+                                        &this_rate_tokenonly, &this_distortion,
+                                        &s, bsize, rate_overhead, best_rd);
+    } else {
+      mic->mbmi.angle_delta[0] = 0;
+      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                      &s, NULL, bsize, best_rd);
+    }
+#else  // the search above already produced this mode's rate/distortion
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-        &s, NULL, bsize, best_rd);
+                    &s, NULL, bsize, best_rd);
+#endif  // CONFIG_EXT_INTRA
 
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -1721,8 +1766,12 @@
           vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
                                                          [palette_ctx], 0);
 #if CONFIG_EXT_INTRA
-    if (mode == DC_PRED)
+    if (mode == DC_PRED && ALLOW_FILTER_INTRA_MODES)
       this_rate += vp10_cost_bit(cpi->common.fc->ext_intra_probs[0], 0);
+    if (is_directional_mode)
+      this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                      MAX_ANGLE_DELTAS +
+                                      mic->mbmi.angle_delta[0]);
 #endif  // CONFIG_EXT_INTRA
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
@@ -1730,6 +1779,9 @@
       mode_selected   = mode;
       best_rd         = this_rd;
       best_tx         = mic->mbmi.tx_size;
+#if CONFIG_EXT_INTRA
+      best_angle_delta = mic->mbmi.angle_delta[0];
+#endif  // CONFIG_EXT_INTRA
 #if CONFIG_EXT_TX
       best_tx_type    = mic->mbmi.tx_type;
 #endif  // CONFIG_EXT_TX
@@ -1746,7 +1798,7 @@
                               &best_tx, &mode_selected, &best_rd);
 
 #if CONFIG_EXT_INTRA
-  if (!palette_mode_info.palette_size[0] > 0) {
+  if (palette_mode_info.palette_size[0] == 0 && ALLOW_FILTER_INTRA_MODES) {
     if (rd_pick_ext_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
                               skippable, bsize, bmode_costs[DC_PRED],
                               &best_rd)) {
@@ -1764,13 +1816,14 @@
   if (ext_intra_mode_info.use_ext_intra_mode[0]) {
     mic->mbmi.ext_intra_mode_info.ext_intra_mode[0] =
         ext_intra_mode_info.ext_intra_mode[0];
-    mic->mbmi.ext_intra_mode_info.ext_intra_angle[0] =
-        ext_intra_mode_info.ext_intra_angle[0];
   }
 #endif  // CONFIG_EXT_INTRA
 
   mic->mbmi.mode = mode_selected;
   mic->mbmi.tx_size = best_tx;
+#if CONFIG_EXT_INTRA
+  mic->mbmi.angle_delta[0] = best_angle_delta;
+#endif  // CONFIG_EXT_INTRA
 #if CONFIG_EXT_TX
   mic->mbmi.tx_type = best_tx_type;
 #endif  // CONFIG_EXT_TX
@@ -2468,127 +2521,38 @@
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   int ext_intra_selected_flag = 0;
   int this_rate_tokenonly, this_rate, s;
-  int64_t this_distortion, this_sse, this_rd, best_angle_rd = INT64_MAX;
+  int64_t this_distortion, this_sse, this_rd;
   EXT_INTRA_MODE mode;
-  int i, step, delta, angle, best_angle, best_angle_dir;
-  int deltas[3] = {25, 5, 1};
-  int branches[3] = {2, 2, 2};
   EXT_INTRA_MODE_INFO ext_intra_mode_info;
 
   vp10_zero(ext_intra_mode_info);
   mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 1;
   mbmi->uv_mode = DC_PRED;
 
-  if (!DR_ONLY) {
-    for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
-      mbmi->ext_intra_mode_info.ext_intra_mode[1] = mode;
-      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
-                            &this_distortion, &s, &this_sse, bsize, *best_rd))
-        continue;
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    mbmi->ext_intra_mode_info.ext_intra_mode[1] = mode;
+    if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                          &this_distortion, &s, &this_sse, bsize, *best_rd))
+      continue;
 
-      this_rate = this_rate_tokenonly +
-          vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 1) +
-          vp10_cost_bit(DR_EXT_INTRA_PROB, 0) +
-          cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
-          write_uniform_cost(FILTER_INTRA_MODES, mode);
-      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-      if (this_rd < *best_rd) {
-        *best_rd        = this_rd;
-        *rate           = this_rate;
-        *rate_tokenonly = this_rate_tokenonly;
-        *distortion     = this_distortion;
-        *skippable      = s;
-        ext_intra_mode_info = mbmi->ext_intra_mode_info;
-        ext_intra_selected_flag = 1;
-        if (!x->select_tx_size)
-          swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
-      }
+    this_rate = this_rate_tokenonly +
+        vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 1) +
+        cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
+        write_uniform_cost(FILTER_INTRA_MODES, mode);
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+    if (this_rd < *best_rd) {
+      *best_rd        = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+      ext_intra_mode_info = mbmi->ext_intra_mode_info;
+      ext_intra_selected_flag = 1;
+      if (!x->select_tx_size)
+        swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
     }
   }
 
-  mbmi->ext_intra_mode_info.ext_intra_mode[1] = EXT_DR_PRED;
-  if (ANGLE_FAST_SEARCH) {
-    best_angle = EXT_INTRA_ANGLES / 2;
-    for (step = 0; step < 3; ++step) {
-      delta = deltas[step];
-      for (i = -branches[step]; i <= branches[step]; ++i) {
-        int64_t rd_thresh;
-        if (i == 0 && step != 0)
-          continue;
-        angle = best_angle + i * delta;
-        if (angle < 0)
-          angle = 0;
-        if (angle >= EXT_INTRA_ANGLES)
-          angle = EXT_INTRA_ANGLES - 1;
-        if (angle == best_angle && step != 0)
-          continue;
-        mbmi->ext_intra_mode_info.ext_intra_angle[1] = angle;
-        if (*best_rd == INT64_MAX)
-          rd_thresh = best_angle_rd;
-        else
-          rd_thresh = VPXMIN(best_angle_rd, *best_rd * RD_ADJUSTER);
-        if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-                              &s, &this_sse, bsize, rd_thresh))
-          continue;
-        this_rate = this_rate_tokenonly +
-            vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 1) +
-            (DR_ONLY ? 0: vp10_cost_bit(DR_EXT_INTRA_PROB, 1)) +
-            cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
-            write_uniform_cost(EXT_INTRA_ANGLES, angle);
-        this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-        if (this_rd < *best_rd) {
-          *best_rd        = this_rd;
-          *rate           = this_rate;
-          *rate_tokenonly = this_rate_tokenonly;
-          *distortion     = this_distortion;
-          *skippable      = s;
-          ext_intra_mode_info = mbmi->ext_intra_mode_info;
-          ext_intra_selected_flag = 1;
-          if (!x->select_tx_size)
-            swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
-        }
-        if (this_rd < best_angle_rd) {
-          best_angle_rd = this_rd;
-          best_angle_dir = i;
-        }
-      }
-      best_angle += best_angle_dir * delta;
-      if (best_angle < 0)
-        best_angle = 0;
-      if (best_angle >= EXT_INTRA_ANGLES)
-        best_angle = EXT_INTRA_ANGLES - 1;
-      if (*best_rd < best_angle_rd / RD_ADJUSTER)
-        break;
-    }
-  } else {
-    for (angle = 0; angle < EXT_INTRA_ANGLES; ++angle) {
-      mbmi->ext_intra_mode_info.ext_intra_angle[1] = angle;
-      if (prediction_angle_map(angle) == 90 ||
-          prediction_angle_map(angle) == 180)
-        continue;
-      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
-                            &this_distortion, &s, &this_sse, bsize, *best_rd))
-        continue;
-
-      this_rate = this_rate_tokenonly +
-          vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 1) +
-          (DR_ONLY ? 0: vp10_cost_bit(DR_EXT_INTRA_PROB, 1)) +
-          cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
-          write_uniform_cost(EXT_INTRA_ANGLES, angle);
-      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-      if (this_rd < *best_rd) {
-        *best_rd        = this_rd;
-        *rate           = this_rate;
-        *rate_tokenonly = this_rate_tokenonly;
-        *distortion     = this_distortion;
-        *skippable      = s;
-        ext_intra_mode_info = mbmi->ext_intra_mode_info;
-        ext_intra_selected_flag = 1;
-        if (!x->select_tx_size)
-          swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
-      }
-    }
-  }
 
   if (ext_intra_selected_flag) {
     mbmi->uv_mode = DC_PRED;
@@ -2596,14 +2560,105 @@
         ext_intra_mode_info.use_ext_intra_mode[1];
     mbmi->ext_intra_mode_info.ext_intra_mode[1] =
         ext_intra_mode_info.ext_intra_mode[1];
-    mbmi->ext_intra_mode_info.ext_intra_angle[1] =
-        ext_intra_mode_info.ext_intra_angle[1];
-
     return 1;
   } else {
     return 0;
   }
 }
+
+// Picks the best angle_delta[1] for the current directional chroma mode and
+// stores it in mbmi. rate_overhead is added to each candidate's token rate.
+// *rate, *distortion and *skippable are written only when a candidate beats
+// best_rd; *rate_tokenonly is INT_MAX otherwise. Returns 1 on success, else 0.
+static int rd_pick_intra_angle_sbuv(VP10_COMP *cpi, MACROBLOCK *x,
+                                    PICK_MODE_CONTEXT *ctx, int *rate,
+                                    int *rate_tokenonly, int64_t *distortion,
+                                    int *skippable, BLOCK_SIZE bsize,
+                                    int rate_overhead, int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  int this_rate_tokenonly, this_rate, s;
+  int64_t this_distortion, this_sse, this_rd;
+  int angle_delta, best_angle_delta = 0;
+  const double rd_adjust = 1.2;
+
+  (void)ctx;
+  *rate_tokenonly = INT_MAX;
+  if (ANGLE_FAST_SEARCH) {
+    int deltas_level1[3] = {0, -2, 2};
+    int deltas_level2[3][2] = {{-1, 1}, {-3, -1}, {1, 3}};
+    const int level1 = 3, level2 = 2;
+    int i, j, best_i = -1;
+
+    for (i = 0; i < level1; ++i) {
+      mbmi->angle_delta[1] = deltas_level1[i];
+      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                            &this_distortion, &s, &this_sse, bsize,
+                            (i == 0 && best_rd < INT64_MAX) ?
+                                best_rd * rd_adjust : best_rd)) {
+        if (i == 0)
+          break;  // zero delta failed: skip the remaining candidates
+        continue;
+      }
+      this_rate = this_rate_tokenonly + rate_overhead;
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (i == 0 && best_rd < INT64_MAX && this_rd > best_rd * rd_adjust)
+        break;
+      if (this_rd < best_rd) {
+        best_i           = i;
+        best_rd          = this_rd;
+        best_angle_delta = mbmi->angle_delta[1];
+        *rate            = this_rate;
+        *rate_tokenonly  = this_rate_tokenonly;
+        *distortion      = this_distortion;
+        *skippable       = s;
+      }
+    }
+
+    if (best_i >= 0) {  // refine around the level-1 winner
+      for (j = 0; j < level2; ++j) {
+        mbmi->angle_delta[1] = deltas_level2[best_i][j];
+        if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                              &this_distortion, &s, &this_sse, bsize, best_rd))
+          continue;
+        this_rate = this_rate_tokenonly + rate_overhead;
+        this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+        if (this_rd < best_rd) {
+          best_rd          = this_rd;
+          best_angle_delta = mbmi->angle_delta[1];
+          *rate            = this_rate;
+          *rate_tokenonly  = this_rate_tokenonly;
+          *distortion      = this_distortion;
+          *skippable       = s;
+        }
+      }
+    }
+  } else {
+    for (angle_delta = -MAX_ANGLE_DELTAS; angle_delta <= MAX_ANGLE_DELTAS;
+        ++angle_delta) {
+      mbmi->angle_delta[1] = angle_delta;
+      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                            &this_distortion, &s, &this_sse, bsize, best_rd))
+        continue;
+      this_rate = this_rate_tokenonly + rate_overhead;
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (this_rd < best_rd) {
+        best_rd          = this_rd;
+        best_angle_delta = mbmi->angle_delta[1];
+        *rate            = this_rate;
+        *rate_tokenonly  = this_rate_tokenonly;
+        *distortion      = this_distortion;
+        *skippable       = s;
+      }
+    }
+  }
+
+  mbmi->angle_delta[1] = best_angle_delta;
+  if (*rate_tokenonly != INT_MAX)
+    super_block_uvrd(cpi, x, &this_rate_tokenonly,  // re-run winner, no RD cap
+                     &this_distortion, &s, &this_sse, bsize, INT64_MAX);
+  return *rate_tokenonly != INT_MAX;
+}
 #endif  // CONFIG_EXT_INTRA
 
 static int64_t rd_pick_intra_sbuv_mode(VP10_COMP *cpi, MACROBLOCK *x,
@@ -2619,6 +2674,7 @@
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
 #if CONFIG_EXT_INTRA
+  int is_directional_mode, rate_overhead, best_angle_delta = 0;
   EXT_INTRA_MODE_INFO ext_intra_mode_info;
 
   ext_intra_mode_info.use_ext_intra_mode[1] = 0;
@@ -2631,20 +2687,44 @@
       continue;
 
     mbmi->uv_mode = mode;
-
+#if CONFIG_EXT_INTRA
+    is_directional_mode = (mode != DC_PRED && mode != TM_PRED);
+    rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] +
+        write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0);
+    mbmi->angle_delta[1] = 0;
+    if (mbmi->sb_type >= BLOCK_8X8 && is_directional_mode) {
+      if (!rd_pick_intra_angle_sbuv(cpi, x, ctx, &this_rate,
+                                    &this_rate_tokenonly, &this_distortion, &s,
+                                    bsize, rate_overhead, best_rd))
+        continue;
+    } else {
+      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                            &this_distortion, &s, &this_sse, bsize, best_rd))
+        continue;
+    }
+    this_rate = this_rate_tokenonly +
+        cpi->intra_uv_mode_cost[mbmi->mode][mode];
+    if (mbmi->sb_type >= BLOCK_8X8 && is_directional_mode)
+      this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                      MAX_ANGLE_DELTAS +
+                                      mbmi->angle_delta[1]);
+    if (mode == DC_PRED && 0)
+      this_rate += vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 0);
+#else
     if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
                           &this_distortion, &s, &this_sse, bsize, best_rd))
       continue;
     this_rate = this_rate_tokenonly +
         cpi->intra_uv_mode_cost[xd->mi[0]->mbmi.mode][mode];
-#if CONFIG_EXT_INTRA
-    if (mode == DC_PRED)
-      this_rate += vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 0);
 #endif  // CONFIG_EXT_INTRA
+
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
       mode_selected   = mode;
+#if CONFIG_EXT_INTRA
+      best_angle_delta = mbmi->angle_delta[1];
+#endif  // CONFIG_EXT_INTRA
       best_rd         = this_rd;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
@@ -2656,7 +2736,7 @@
   }
 
 #if CONFIG_EXT_INTRA
-  if (mbmi->sb_type >= BLOCK_8X8) {
+  if (mbmi->sb_type >= BLOCK_8X8 && ALLOW_FILTER_INTRA_MODES) {
     if (rd_pick_ext_intra_sbuv(cpi, x, ctx, rate, rate_tokenonly, distortion,
                                skippable, bsize, &best_rd)) {
       mode_selected   = mbmi->uv_mode;
@@ -2669,6 +2749,7 @@
   if (ext_intra_mode_info.use_ext_intra_mode[1])
     mbmi->ext_intra_mode_info.ext_intra_mode[1] =
         ext_intra_mode_info.ext_intra_mode[1];
+  mbmi->angle_delta[1] = best_angle_delta;
 #endif  // CONFIG_EXT_INTRA
   mbmi->uv_mode = mode_selected;
   return best_rd;
@@ -4478,6 +4559,9 @@
   PREDICTION_MODE mode_uv[TX_SIZES];
 #if CONFIG_EXT_INTRA
   EXT_INTRA_MODE_INFO ext_intra_mode_info_uv[TX_SIZES];
+  int8_t uv_angle_delta[TX_SIZES];
+  int is_directional_mode;
+  int rate_overhead, rate_dummy;
 #endif  // CONFIG_EXT_INTRA
   const int intra_cost_penalty = vp10_get_intra_cost_penalty(
       cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
@@ -4762,17 +4846,31 @@
       TX_SIZE uv_tx;
       struct macroblockd_plane *const pd = &xd->plane[1];
       memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
-      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
-                      NULL, bsize, best_rd);
+
 #if CONFIG_EXT_INTRA
+      is_directional_mode = (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED);
+      if (is_directional_mode) {
+        rate_overhead = write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0) +
+            cpi->mbmode_cost[mbmi->mode];
+        rate_y = INT_MAX;
+        this_rd =
+            rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
+                                    &skippable, bsize, rate_overhead, best_rd);
+      } else {
+        mbmi->angle_delta[0] = 0;
+        super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                        NULL, bsize, best_rd);
+      }
+
       // TODO(huisu): ext-intra is turned off in lossless mode for now to
       // avoid a unit test failure
-      if (mbmi->mode == DC_PRED && !xd->lossless[mbmi->segment_id]) {
+      if (mbmi->mode == DC_PRED && !xd->lossless[mbmi->segment_id] &&
+          ALLOW_FILTER_INTRA_MODES) {
         MB_MODE_INFO mbmi_copy = *mbmi;
-        int rate_dummy;
 
         if (rate_y != INT_MAX) {
-          int this_rate = rate_y + cpi->mbmode_cost[mbmi->mode] +
+          int this_rate = rate_y +
+              cpi->mbmode_cost[mbmi->mode] +
               vp10_cost_bit(cm->fc->ext_intra_probs[0], 0);
           this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, distortion_y);
         } else {
@@ -4784,7 +4882,11 @@
                                    cpi->mbmode_cost[mbmi->mode], &this_rd))
           *mbmi = mbmi_copy;
       }
+#else
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                      NULL, bsize, best_rd);
 #endif  // CONFIG_EXT_INTRA
+
       if (rate_y == INT_MAX)
         continue;
       uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
@@ -4795,6 +4897,7 @@
                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
 #if CONFIG_EXT_INTRA
         ext_intra_mode_info_uv[uv_tx] = mbmi->ext_intra_mode_info;
+        uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
 #endif  // CONFIG_EXT_INTRA
       }
 
@@ -4803,32 +4906,29 @@
       skippable = skippable && skip_uv[uv_tx];
       mbmi->uv_mode = mode_uv[uv_tx];
 #if CONFIG_EXT_INTRA
+      mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
       mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
           ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1];
       if (ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1]) {
         mbmi->ext_intra_mode_info.ext_intra_mode[1] =
             ext_intra_mode_info_uv[uv_tx].ext_intra_mode[1];
-        mbmi->ext_intra_mode_info.ext_intra_angle[1] =
-            ext_intra_mode_info_uv[uv_tx].ext_intra_angle[1];
       }
 #endif  // CONFIG_EXT_INTRA
 
       rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
 #if CONFIG_EXT_INTRA
-      if (mbmi->mode == DC_PRED) {
+      if (is_directional_mode)
+        rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                    MAX_ANGLE_DELTAS +
+                                    mbmi->angle_delta[0]);
+
+      if (mbmi->mode == DC_PRED && ALLOW_FILTER_INTRA_MODES) {
         rate2 += vp10_cost_bit(cm->fc->ext_intra_probs[0],
                                mbmi->ext_intra_mode_info.use_ext_intra_mode[0]);
         if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
           EXT_INTRA_MODE ext_intra_mode =
               mbmi->ext_intra_mode_info.ext_intra_mode[0];
-          int angle = mbmi->ext_intra_mode_info.ext_intra_angle[0];
-          if (!DR_ONLY)
-              rate2 += vp10_cost_bit(DR_EXT_INTRA_PROB,
-                                     ext_intra_mode > FILTER_TM_PRED);
-          if (ext_intra_mode > FILTER_TM_PRED)
-            rate2 += write_uniform_cost(EXT_INTRA_ANGLES, angle);
-          else
-            rate2 += write_uniform_cost(FILTER_INTRA_MODES, ext_intra_mode);
+          rate2 += write_uniform_cost(FILTER_INTRA_MODES, ext_intra_mode);
         }
       }
 #endif  // CONFIG_EXT_INTRA
diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index 4b7a784..461815c 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -71,6 +71,7 @@
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm2d.h
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm2d.c
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm2d_cfg.h
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d_cfg.h
 
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.h
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.c
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 5bf71ef..f41ee09 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -312,7 +312,7 @@
 $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
 
 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad/;
+specialize qw/vp9_diamond_search_sad avx/;
 
 add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
 specialize qw/vp9_full_range_search/;
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 4e88819..f5da07e 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -243,7 +243,7 @@
     decrease_ref_count(old_idx, frame_bufs, pool);
 
     // Release the reference frame in reference map.
-    if ((mask & 1) && old_idx >= 0) {
+    if (mask & 1) {
       decrease_ref_count(old_idx, frame_bufs, pool);
     }
     cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
@@ -350,7 +350,7 @@
         decrease_ref_count(old_idx, frame_bufs, pool);
 
         // Release the reference frame in reference map.
-        if ((mask & 1) && old_idx >= 0) {
+        if (mask & 1) {
           decrease_ref_count(old_idx, frame_bufs, pool);
         }
         ++ref_index;
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 4a5188f..afa4009 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -128,7 +128,7 @@
 
 static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
                                       BufferPool *const pool) {
-  if (idx >= 0) {
+  if (idx >= 0 && frame_bufs[idx].ref_count > 0) {
     --frame_bufs[idx].ref_count;
     // A worker may only get a free framebuffer index when calling get_free_fb.
     // But the private buffer is not set up until finish decoding header.
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 0791677..2270a06 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -411,8 +411,11 @@
   assert(cr->sb_index < sbs_in_frame);
   i = cr->sb_index;
   cr->target_num_seg_blocks = 0;
-  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
     consec_zero_mv_thresh = 100;
+   if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium)
+     consec_zero_mv_thresh = 80;
+  }
   qindex_thresh =
       cpi->oxcf.content == VP9E_CONTENT_SCREEN
       ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 4615554..1a14ea9 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -123,72 +123,66 @@
 static void pack_mb_tokens(vpx_writer *w,
                            TOKENEXTRA **tp, const TOKENEXTRA *const stop,
                            vpx_bit_depth_t bit_depth) {
-  TOKENEXTRA *p = *tp;
-
-  while (p < stop && p->token != EOSB_TOKEN) {
-    const int t = p->token;
-    const struct vp9_token *const a = &vp9_coef_encodings[t];
-    int i = 0;
-    int v = a->value;
-    int n = a->len;
+  const TOKENEXTRA *p;  // read cursor; *tp is only written back on exit
+  const vp9_extra_bit *const extra_bits =  // per-token extra-bit table
 #if CONFIG_VP9_HIGHBITDEPTH
-    const vp9_extra_bit *b;
-    if (bit_depth == VPX_BITS_12)
-      b = &vp9_extra_bits_high12[t];
-    else if (bit_depth == VPX_BITS_10)
-      b = &vp9_extra_bits_high10[t];
-    else
-      b = &vp9_extra_bits[t];
+    (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12 :
+    (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 :
+    vp9_extra_bits;
 #else
-    const vp9_extra_bit *const b = &vp9_extra_bits[t];
+    vp9_extra_bits;
     (void) bit_depth;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    /* skip one or two nodes */
-    if (p->skip_eob_node) {
-      n -= p->skip_eob_node;
-      i = 2 * p->skip_eob_node;
+  for (p = *tp; p < stop && p->token != EOSB_TOKEN; ++p) {
+    if (p->token == EOB_TOKEN) {  // EOB: a single 0 on node 0, nothing else
+      vpx_write(w, 0, p->context_tree[0]);
+      continue;
     }
-
-    // TODO(jbb): expanding this can lead to big gains.  It allows
-    // much better branch prediction and would enable us to avoid numerous
-    // lookups and compares.
-
-    // If we have a token that's in the constrained set, the coefficient tree
-    // is split into two treed writes.  The first treed write takes care of the
-    // unconstrained nodes.  The second treed write takes care of the
-    // constrained nodes.
-    if (t >= TWO_TOKEN && t < EOB_TOKEN) {
-      int len = UNCONSTRAINED_NODES - p->skip_eob_node;
-      int bits = v >> (n - len);
-      vp9_write_tree(w, vp9_coef_tree, p->context_tree, bits, len, i);
-      vp9_write_tree(w, vp9_coef_con_tree,
-                     vp9_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
-                     v, n - len, 0);
-    } else {
-      vp9_write_tree(w, vp9_coef_tree, p->context_tree, v, n, i);
-    }
-
-    if (b->base_val) {
-      const int e = p->extra, l = b->len;
-
-      if (l) {
-        const unsigned char *pb = b->prob;
-        int v = e >> 1;
-        int n = l;              /* number of bits in v, assumed nonzero */
-
-        do {
-          const int bb = (v >> --n) & 1;
-          vpx_write(w, bb, *pb++);
-        } while (n);
+    vpx_write(w, 1, p->context_tree[0]);  // node 0: not EOB
+    while (p->token == ZERO_TOKEN) {  // consume a run of zero tokens
+      vpx_write(w, 0, p->context_tree[1]);
+      ++p;  // the token after a zero is coded without the node-0 bit
+      if (p == stop || p->token == EOSB_TOKEN) {  // input ended inside the run
+        *tp = (TOKENEXTRA*)(uintptr_t)p + (p->token == EOSB_TOKEN);
+        return;  // NOTE(review): line above reads p->token even if p == stop
       }
-
-      vpx_write_bit(w, e & 1);
     }
-    ++p;
-  }
 
-  *tp = p + (p->token == EOSB_TOKEN);
+    {  // p now refers to a non-zero coefficient token (see asserts)
+      const int t = p->token;
+      const vpx_prob *const context_tree = p->context_tree;
+      assert(t != ZERO_TOKEN);
+      assert(t != EOB_TOKEN);
+      assert(t != EOSB_TOKEN);
+      vpx_write(w, 1, context_tree[1]);  // node 1: not ZERO
+      if (t == ONE_TOKEN) {
+        vpx_write(w, 0, context_tree[2]);
+        vpx_write_bit(w, p->extra & 1);  // low bit of extra (presumably sign)
+      } else {  // t >= TWO_TOKEN && t < EOB_TOKEN
+        const struct vp9_token *const a = &vp9_coef_encodings[t];
+        const int v = a->value;
+        const int n = a->len;
+        const int e = p->extra;
+        vpx_write(w, 1, context_tree[2]);  // node 2: larger than ONE
+        vp9_write_tree(w, vp9_coef_con_tree,
+                       vp9_pareto8_full[context_tree[PIVOT_NODE] - 1], v,
+                       n - UNCONSTRAINED_NODES, 0);  // constrained nodes only
+        if (t >= CATEGORY1_TOKEN) {  // these tokens also emit b->len bits
+          const vp9_extra_bit *const b = &extra_bits[t];
+          const unsigned char *pb = b->prob;
+          int v = e >> 1;  // shadows the outer v: extra without its low bit
+          int n = b->len;  // number of bits in v, assumed nonzero
+          do {  // most-significant bit first, one probability per bit
+            const int bb = (v >> --n) & 1;
+            vpx_write(w, bb, *pb++);
+          } while (n);
+        }
+        vpx_write_bit(w, e & 1);
+      }
+    }
+  }
+  *tp = (TOKENEXTRA*)(uintptr_t)p + (p->token == EOSB_TOKEN);
 }
 
 static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index c592832..c382b77 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -323,7 +323,7 @@
   struct buf_2d src = mb->plane[0].src;
   int is_skin = 0;
 
-  if (bs <= BLOCK_16X16 && denoiser->denoising_level >= kDenMedium) {
+  if (bs <= BLOCK_16X16 && denoiser->denoising_level >= kDenLow) {
     // Take center pixel in block to determine is_skin.
     const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
     const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
@@ -349,7 +349,7 @@
     denoiser->increase_denoising = 0;
   }
 
-  if (denoiser->denoising_level >= kDenMedium)
+  if (denoiser->denoising_level >= kDenLow)
     decision = perform_motion_compensation(denoiser, mb, bs,
                                            denoiser->increase_denoising,
                                            mi_row, mi_col, ctx,
@@ -524,6 +524,7 @@
 #endif
   denoiser->increase_denoising = 0;
   denoiser->frame_buffer_initialized = 1;
+  denoiser->denoising_level = kDenLow;
   return 0;
 }
 
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index 8bed9e8..bc676e9 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -27,6 +27,7 @@
 } VP9_DENOISER_DECISION;
 
 typedef enum vp9_denoiser_level {
+  kDenLowLow,
   kDenLow,
   kDenMedium,
   kDenHigh
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 9d66839..0475883 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -492,14 +492,14 @@
     // Increase base variance threshold if estimated noise level is high.
     if (cpi->noise_estimate.enabled) {
       if (cpi->noise_estimate.level == kHigh)
-        threshold_base = threshold_base << 2;
+        threshold_base = 3 * threshold_base;
       else
         if (cpi->noise_estimate.level == kMedium)
           threshold_base = threshold_base << 1;
     }
-    thresholds[1] = threshold_base;
     if (cm->width <= 352 && cm->height <= 288) {
-      thresholds[0] = threshold_base >> 2;
+      thresholds[0] = threshold_base >> 3;
+      thresholds[1] = threshold_base >> 1;
       thresholds[2] = threshold_base << 3;
     } else {
       thresholds[0] = threshold_base;
@@ -526,7 +526,7 @@
       cpi->vbp_bsize_min = BLOCK_8X8;
     } else {
       if (cm->width <= 352 && cm->height <= 288)
-        cpi->vbp_threshold_sad = 100;
+        cpi->vbp_threshold_sad = 10;
       else
         cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 ?
             (cpi->y_dequant[q][1] << 1) : 1000;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index d86a7a7..eebd7c5 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1570,7 +1570,30 @@
 #endif
 #define log2f(x) (log (x) / (float) M_LOG2_E)
 
+/***********************************************************************
+ * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts'    *
+ ***********************************************************************
+ * The following 2 functions ('cal_nmvjointsadcost' and                *
+ * 'cal_nmvsadcosts') are used to calculate cost lookup tables         *
+ * used by 'vp9_diamond_search_sad'. The C implementation of the       *
+ * function is generic, but the AVX intrinsics optimised version       *
+ * relies on the following properties of the computed tables:          *
+ * For cal_nmvjointsadcost:                                            *
+ *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]     *
+ * For cal_nmvsadcosts:                                                *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                   *
+ *         (Equal costs for both components)                           *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                  *
+ *         (Cost function is even)                                     *
+ * If these do not hold, then the AVX optimised version of the         *
+ * 'vp9_diamond_search_sad' function cannot be used as it is, in which *
+ * case you can revert to using the C function instead.                *
+ ***********************************************************************/
+
 static void cal_nmvjointsadcost(int *mvjointsadcost) {
+  /*********************************************************************
+   * Warning: Read the comments above before modifying this function   *
+   *********************************************************************/
   mvjointsadcost[0] = 600;
   mvjointsadcost[1] = 300;
   mvjointsadcost[2] = 300;
@@ -1578,6 +1601,9 @@
 }
 
 static void cal_nmvsadcosts(int *mvsadcost[2]) {
+  /*********************************************************************
+   * Warning: Read the comments above before modifying this function   *
+   *********************************************************************/
   int i = 1;
 
   mvsadcost[0][0] = 0;
@@ -1739,6 +1765,10 @@
 
   cpi->first_time_stamp_ever = INT64_MAX;
 
+  /*********************************************************************
+   * Warning: Read the comments around 'cal_nmvjointsadcost' and       *
+   * 'cal_nmvsadcosts' before modifying how these tables are computed. *
+   *********************************************************************/
   cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
   cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
   cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 9744e43..b9a104a 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -101,11 +101,8 @@
 }
 
 void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
-  int len, ss_count = 1;
-
-  cfg->ss_mv[0].col = 0;
-  cfg->ss_mv[0].row = 0;
-  cfg->ss_os[0] = 0;
+  int len;
+  int ss_count = 0;
 
   for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
     // Generate offsets for 4 search sites per step.
@@ -117,16 +114,13 @@
     }
   }
 
-  cfg->ss_count = ss_count;
   cfg->searches_per_step = 4;
+  cfg->total_steps = ss_count / cfg->searches_per_step;
 }
 
 void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
-  int len, ss_count = 1;
-
-  cfg->ss_mv[0].col = 0;
-  cfg->ss_mv[0].row = 0;
-  cfg->ss_os[0] = 0;
+  int len;
+  int ss_count = 0;
 
   for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
     // Generate offsets for 8 search sites per step.
@@ -141,8 +135,8 @@
     }
   }
 
-  cfg->ss_count = ss_count;
   cfg->searches_per_step = 8;
+  cfg->total_steps = ss_count / cfg->searches_per_step;
 }
 
 /*
@@ -1612,8 +1606,8 @@
   const uint8_t *best_address;
 
   unsigned int bestsad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
+  int best_site = -1;
+  int last_site = -1;
 
   int ref_row;
   int ref_col;
@@ -1626,7 +1620,7 @@
 //  const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
   const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step];
   const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step];
-  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+  const int tot_steps = cfg->total_steps - search_param;
 
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
@@ -1644,7 +1638,7 @@
   bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
                 + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
-  i = 1;
+  i = 0;
 
   for (step = 0; step < tot_steps; step++) {
     int all_in = 1, t;
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index e48259f..1c101f2 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -33,10 +33,10 @@
 
 typedef struct search_site_config {
   // motion search sites
-  MV  ss_mv[8 * MAX_MVSEARCH_STEPS + 1];        // Motion vector
-  intptr_t ss_os[8 * MAX_MVSEARCH_STEPS + 1];   // Offset
-  int ss_count;
+  MV  ss_mv[8 * MAX_MVSEARCH_STEPS];        // Motion vector of each site
+  intptr_t ss_os[8 * MAX_MVSEARCH_STEPS];   // Offset of each site
   int searches_per_step;
+  int total_steps;  // Site count / searches_per_step, set by the init functions
 } search_site_config;
 
 void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride);
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index 8ba7de7..b41ffd0 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -29,11 +29,14 @@
   ne->value = 0;
   ne->count = 0;
   ne->thresh = 90;
+  ne->last_w = 0;
+  ne->last_h = 0;
   if (width * height >= 1920 * 1080) {
     ne->thresh = 200;
   } else if (width * height >= 1280 * 720) {
     ne->thresh = 130;
   }
+  ne->num_frames_estimate = 20;
 }
 
 int enable_noise_estimation(VP9_COMP *const cpi) {
@@ -86,10 +89,9 @@
   // Estimate of noise level every frame_period frames.
   int frame_period = 10;
   int thresh_consec_zeromv = 8;
-  unsigned int thresh_sum_diff = 128;
+  unsigned int thresh_sum_diff = 100;
   unsigned int thresh_sum_spatial = (200 * 200) << 8;
   unsigned int thresh_spatial_var = (32 * 32) << 8;
-  int num_frames_estimate = 20;
   int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7;
   // Estimate is between current source and last source.
   YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
@@ -100,11 +102,17 @@
   ne->enabled = enable_noise_estimation(cpi);
   if (!ne->enabled ||
       cm->current_video_frame % frame_period != 0 ||
-      last_source == NULL) {
+      last_source == NULL ||
+      ne->last_w != cm->width ||
+      ne->last_h != cm->height) {
 #if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0)
     copy_frame(&cpi->denoiser.last_source, cpi->Source);
 #endif
+    if (last_source != NULL) {
+      ne->last_w = cm->width;
+      ne->last_h = cm->height;
+    }
     return;
   } else {
     int num_samples = 0;
@@ -127,6 +135,17 @@
     const int uv_width_shift = y_width_shift >> 1;
     const int uv_height_shift = y_height_shift >> 1;
     int mi_row, mi_col;
+    int num_low_motion = 0;
+    int frame_low_motion = 1;
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+        int bl_index = mi_row * cm->mi_cols + mi_col;
+        if (cr->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+          num_low_motion++;
+      }
+    }
+    if (num_low_motion < ((3 * cm->mi_rows * cm->mi_cols) >> 3))
+      frame_low_motion = 0;
     for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
       for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
         // 16x16 blocks, 1/4 sample of frame.
@@ -146,7 +165,8 @@
           const uint8_t vsource =
             src_v[uv_height_shift * src_uvstride + uv_width_shift];
           int is_skin = vp9_skin_pixel(ysource, usource, vsource);
-          if (cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
+          if (frame_low_motion &&
+              cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
               cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
               cr->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
               cr->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
@@ -185,6 +205,8 @@
       src_u += (src_uvstride << 2) - (cm->mi_cols << 2);
       src_v += (src_uvstride << 2) - (cm->mi_cols << 2);
     }
+    ne->last_w = cm->width;
+    ne->last_h = cm->height;
     // Update noise estimate if we have at a minimum number of block samples,
     // and avg_est > 0 (avg_est == 0 can happen if the application inputs
     // duplicate frames).
@@ -192,18 +214,21 @@
       // Normalize.
       avg_est = avg_est / num_samples;
       // Update noise estimate.
-      ne->value = (int)((3 * ne->value + avg_est) >> 2);
+      ne->value = (int)((15 * ne->value + avg_est) >> 4);
       ne->count++;
-      if (ne->count == num_frames_estimate) {
+      if (ne->count == ne->num_frames_estimate) {
         // Reset counter and check noise level condition.
+        ne->num_frames_estimate = 30;
         ne->count = 0;
         if (ne->value > (ne->thresh << 1))
           ne->level = kHigh;
         else
           if (ne->value > ne->thresh)
             ne->level = kMedium;
-          else
+          else if (ne->value > (ne->thresh >> 1))
             ne->level = kLow;
+          else
+            ne->level = kLowLow;
       }
     }
   }
diff --git a/vp9/encoder/vp9_noise_estimate.h b/vp9/encoder/vp9_noise_estimate.h
index 2acc2ea..0d22ef0 100644
--- a/vp9/encoder/vp9_noise_estimate.h
+++ b/vp9/encoder/vp9_noise_estimate.h
@@ -24,6 +24,7 @@
 #endif
 
 typedef enum noise_level {
+  kLowLow,
   kLow,
   kMedium,
   kHigh
@@ -35,6 +36,9 @@
   int value;
   int thresh;
   int count;
+  int last_w;
+  int last_h;
+  int num_frames_estimate;
 } NOISE_ESTIMATE;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 8c4782d..9db044f 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1483,18 +1483,30 @@
     this_rdc.rate += ref_frame_cost[ref_frame];
     this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
 
-    // Bias against non-zero (above some threshold) motion for large blocks.
-    // This is temporary fix to avoid selection of large mv for big blocks.
-    if (cpi->oxcf.speed > 5 &&
-        cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
-        (frame_mv[this_mode][ref_frame].as_mv.row > 64 ||
-        frame_mv[this_mode][ref_frame].as_mv.row < -64 ||
-        frame_mv[this_mode][ref_frame].as_mv.col > 64 ||
-        frame_mv[this_mode][ref_frame].as_mv.col < -64)) {
-      if (bsize == BLOCK_64X64)
-        this_rdc.rdcost = this_rdc.rdcost << 1;
-      else if (bsize >= BLOCK_32X32)
-        this_rdc.rdcost = 3 * this_rdc.rdcost >> 1;
+    if (cpi->oxcf.speed >= 5 &&
+        cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
+      // Bias against non-zero (above some threshold) motion for large blocks.
+      // This is temporary fix to avoid selection of large mv for big blocks.
+      if (frame_mv[this_mode][ref_frame].as_mv.row > 64 ||
+          frame_mv[this_mode][ref_frame].as_mv.row < -64 ||
+          frame_mv[this_mode][ref_frame].as_mv.col > 64 ||
+          frame_mv[this_mode][ref_frame].as_mv.col < -64) {
+        if (bsize == BLOCK_64X64)
+          this_rdc.rdcost = this_rdc.rdcost << 1;
+        else if (bsize >= BLOCK_32X32)
+          this_rdc.rdcost = 3 * this_rdc.rdcost >> 1;
+      }
+      // If noise estimation is enabled, and estimated level is above threshold,
+      // add a bias to LAST reference with small motion, for large blocks.
+      if (cpi->noise_estimate.enabled &&
+          cpi->noise_estimate.level >= kMedium &&
+          bsize >= BLOCK_32X32 &&
+          ref_frame == LAST_FRAME &&
+          frame_mv[this_mode][ref_frame].as_mv.row < 8 &&
+          frame_mv[this_mode][ref_frame].as_mv.row > -8 &&
+          frame_mv[this_mode][ref_frame].as_mv.col < 8 &&
+          frame_mv[this_mode][ref_frame].as_mv.col > -8)
+        this_rdc.rdcost = 7 * this_rdc.rdcost >> 3;
     }
 
     // Skipping checking: test to see if this block can be reconstructed by
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index c6fe76c..0377cb5 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1873,7 +1873,7 @@
 
   // Resize based on average buffer underflow and QP over some window.
   // Ignore samples close to key frame, since QP is usually high after key.
-  if (cpi->rc.frames_since_key > 1 * cpi->framerate) {
+  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
     const int window = (int)(4 * cpi->framerate);
     cpi->resize_avg_qp += cm->base_qindex;
     if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
new file mode 100644
index 0000000..2ed3f1a
--- /dev/null
+++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -0,0 +1,322 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#if defined(_MSC_VER)
+# include <intrin.h>
+#endif
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__  // Branch hints: map onto __builtin_expect when available.
+# define __likely__(v)    __builtin_expect(v, 1)
+# define __unlikely__(v)  __builtin_expect(v, 0)
+#else  // Other compilers: the hints collapse to the bare expression.
+# define __likely__(v)    (v)
+# define __unlikely__(v)  (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+  int_mv result;  // Filled component-wise through the as_mv view.
+  result.as_mv.row = row;
+  result.as_mv.col = col;
+  return result;  // Callers in this file read the packed form via .as_int.
+}
+
+static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
+  // Simplified from the C implementation: returns 0 for a zero vector and 1
+  // for any non-zero one. Valid only because x->nmvjointsadcost[1], [2] and
+  // [3] are equal (asserted in vp9_diamond_search_sad_avx).
+  return mv.as_int == 0 ? 0 : 1;
+}
+
+static INLINE int mv_cost(const int_mv mv,  // Joint cost + row + col cost.
+                          const int *joint_cost, int *const comp_cost[2]) {
+  return joint_cost[get_mv_joint(mv)] +  // Only entries 0 and 1 are read.
+         comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col];
+}
+
+static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
+                          int error_per_bit) {
+  const int_mv diff = pack_int_mv(mv.as_mv.row - ref->row,  // delta vs. ref
+                                  mv.as_mv.col - ref->col);
+  return ROUND_POWER_OF_TWO(mv_cost(diff, x->nmvjointsadcost,  // rounded >> 8
+                                    x->nmvsadcost) * error_per_bit, 8);
+}
+
+/*****************************************************************************
+ * Diamond search over SAD + MV cost, evaluating 4 candidates per SIMD pass. *
+ * It relies on 3 properties of the cost lookup tables that are constructed  *
+ * by 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in vp9_encoder.c.          *
+ * For the joint cost:                                                       *
+ *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
+ * For the component costs:                                                  *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
+ *         (Equal costs for both components)                                 *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
+ *         (Cost function is even)                                           *
+ * If these do not hold, then this function cannot be used without           *
+ * modification, in which case you can revert to using the C implementation, *
+ * which does not rely on these properties.                                  *
+ *****************************************************************************/
+int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
+                               const search_site_config *cfg,
+                               MV *ref_mv, MV *best_mv, int search_param,
+                               int sad_per_bit, int *num00,
+                               const vp9_variance_fn_ptr_t *fn_ptr,
+                               const MV *center_mv) {
+  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
+  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
+  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
+  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);
+
+  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
+
+  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
+  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
+  const int tot_steps = cfg->total_steps - search_param;
+
+  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
+                                        center_mv->col >> 3);
+  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
+
+  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
+  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
+
+  int_mv bmv = pack_int_mv(ref_row, ref_col);
+  int_mv new_bmv = bmv;
+  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);
+
+  const int what_stride = x->plane[0].src.stride;
+  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
+  const uint8_t *const what = x->plane[0].src.buf;
+  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
+                                 ref_row * in_what_stride + ref_col;
+
+  // Work out the start point for the search
+  const uint8_t *best_address = in_what;
+  const uint8_t *new_best_address = best_address;
+#if ARCH_X86_64
+  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
+#else
+  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
+#endif
+
+  unsigned int best_sad;
+
+  int i;
+  int j;
+  int step;
+
+  // Check the prerequisite cost function properties that are easy to check
+  // in an assert. See the function-level documentation for details on all
+  // prerequisites.
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
+
+  // Cost of the starting position: SAD plus the weighted MV cost
+  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
+  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
+
+  *num00 = 0;  // Counts the steps after which the best is still in_what.
+
+  for (i = 0, step = 0; step < tot_steps; step++) {  // i runs on across steps
+    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {  // 4 at a time
+      __m128i v_sad_d;
+      __m128i v_cost_d;
+      __m128i v_outside_d;
+      __m128i v_inside_d;
+      __m128i v_diff_mv_w;
+#if ARCH_X86_64
+      __m128i v_blocka[2];
+#else
+      __m128i v_blocka[1];
+#endif
+
+      // Compute the candidate motion vectors
+      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
+      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
+      // Clamp them to the search bounds
+      __m128i v_these_mv_clamp_w = v_these_mv_w;
+      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
+      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
+      // The ones that did not change are inside the search area
+      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
+
+      // If none of the four candidates is inside the bounds, skip the batch
+      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
+        continue;
+      }
+
+      // The inverse mask indicates which of the MVs are outside
+      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
+      // Shift right to keep the sign bit clear, we will use this later
+      // to set the cost to the maximum value.
+      v_outside_d = _mm_srli_epi32(v_outside_d, 1);
+
+      // Compute the difference MV
+      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
+      // We utilise the fact that the cost function is even, and use the
+      // absolute difference. This allows us to use unsigned indexes later
+      // and reduces cache pressure somewhat as only a half of the table
+      // is ever referenced.
+      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
+
+      // Compute the SIMD pointer offsets.
+      {
+#if ARCH_X86_64  //  sizeof(intptr_t) == 8
+        // Load the offsets
+        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
+        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
+        // Set the ones falling outside to zero
+        v_bo10_q = _mm_and_si128(v_bo10_q,
+                                 _mm_cvtepi32_epi64(v_inside_d));
+        v_bo32_q = _mm_and_si128(v_bo32_q,
+                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
+        // Compute the candidate addresses
+        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
+        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
+#else  // ARCH_X86 //  sizeof(intptr_t) == 4
+        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
+        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
+        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
+#endif
+      }
+
+      fn_ptr->sdx4df(what, what_stride,
+                     (const uint8_t **)&v_blocka[0], in_what_stride,
+                     (uint32_t*)&v_sad_d);
+
+      // Look up the component cost of the residual motion vector
+      {
+        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
+        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
+        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
+        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
+        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
+        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
+        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
+        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
+
+        // Note: This is a use case for vpgather in AVX2
+        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
+        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
+        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
+        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
+
+        __m128i v_cost_10_d, v_cost_32_d;
+
+        v_cost_10_d = _mm_cvtsi32_si128(cost0);
+        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
+
+        v_cost_32_d = _mm_cvtsi32_si128(cost2);
+        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
+
+        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
+      }
+
+      // Now add in the joint cost
+      {
+        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
+                                                _mm_setzero_si128());
+        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
+                                                       v_joint_cost_0_d,
+                                                       v_sel_d);
+        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
+      }
+
+      // Multiply by sad_per_bit
+      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
+      // ROUND_POWER_OF_TWO(v_cost_d, 8)
+      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
+      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
+      // Add the cost to the sad
+      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
+
+      // Make the motion vectors outside the search area have max cost
+      // by or'ing in the comparison mask, this way the minimum search won't
+      // pick them.
+      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
+
+      // Find the minimum value and index horizontally in v_sad_d
+      {
+        // Try speculatively on 16 bits, so we can use the minpos intrinsic
+        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
+        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
+
+        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
+        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
+
+        // If the local best value is not saturated, just use it, otherwise
+        // find the horizontal minimum again the hard way on 32 bits.
+        // This is executed rarely.
+        if (__unlikely__(local_best_sad == 0xffff)) {
+          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
+
+          v_loval_d = v_sad_d;
+          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
+          v_hival_d = _mm_srli_si128(v_loval_d, 8);
+          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
+
+          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
+
+          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
+          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
+          v_hival_d = _mm_srli_si128(v_loval_d, 4);
+          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
+
+          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
+
+          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
+          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
+
+          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
+          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
+        }
+
+        // Update the global minimum if the local minimum is smaller
+        if (__likely__(local_best_sad < best_sad)) {
+          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+          best_sad = local_best_sad;
+        }
+      }
+    }
+
+    bmv = new_bmv;  // Commit this step's winner as the next search centre.
+    best_address = new_best_address;
+
+    v_bmv_w = _mm_set1_epi32(bmv.as_int);
+#if ARCH_X86_64
+    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
+#else
+    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
+#endif
+
+    if (__unlikely__(best_address == in_what)) {
+      (*num00)++;
+    }
+  }
+
+  *best_mv = bmv.as_mv;  // Out: the best motion vector found.
+  return best_sad;  // SAD plus weighted MV cost of that vector.
+}
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 3f3bdef..5918240 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -96,6 +96,7 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 endif
diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index 9c5b414..abc0270 100644
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -13,15 +13,21 @@
 SECTION .text
 
 %macro convolve_fn 1-2
-INIT_XMM sse2
+%ifidn %1, avg
+%define AUX_XMM_REGS 4
+%else
+%define AUX_XMM_REGS 0
+%endif
 %ifidn %2, highbd
 %define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
-                                 fx, fxs, fy, fys, w, h, bd
+cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+                                              dst, dst_stride, \
+                                              fx, fxs, fy, fys, w, h, bd
 %else
 %define pavg pavgb
-cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
-                              fx, fxs, fy, fys, w, h
+cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+                                           dst, dst_stride, \
+                                           fx, fxs, fy, fys, w, h
 %endif
   mov r4d, dword wm
 %ifidn %2, highbd
@@ -152,38 +158,11 @@
   jnz .loop16
   RET
 
-INIT_MMX sse
 .w8:
   mov                    r4d, dword hm
   lea                    r5q, [src_strideq*3]
   lea                    r6q, [dst_strideq*3]
 .loop8:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+src_strideq]
-  movu                    m2, [srcq+src_strideq*2]
-  movu                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+dst_strideq]
-  pavg                    m2, [dstq+dst_strideq*2]
-  pavg                    m3, [dstq+r6q]
-%endif
-  mova  [dstq              ], m0
-  mova  [dstq+dst_strideq  ], m1
-  mova  [dstq+dst_strideq*2], m2
-  mova  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop8
-  RET
-
-%ifnidn %2, highbd
-.w4:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop4:
   movh                    m0, [srcq]
   movh                    m1, [srcq+src_strideq]
   movh                    m2, [srcq+src_strideq*2]
@@ -205,11 +184,42 @@
   movh  [dstq+r6q          ], m3
   lea                   dstq, [dstq+dst_strideq*4]
   sub                    r4d, 4
+  jnz .loop8
+  RET
+
+%ifnidn %2, highbd
+.w4:
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
+.loop4:
+  movd                    m0, [srcq]
+  movd                    m1, [srcq+src_strideq]
+  movd                    m2, [srcq+src_strideq*2]
+  movd                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  movd                    m4, [dstq]
+  movd                    m5, [dstq+dst_strideq]
+  movd                    m6, [dstq+dst_strideq*2]
+  movd                    m7, [dstq+r6q]
+  pavg                    m0, m4
+  pavg                    m1, m5
+  pavg                    m2, m6
+  pavg                    m3, m7
+%endif
+  movd  [dstq              ], m0
+  movd  [dstq+dst_strideq  ], m1
+  movd  [dstq+dst_strideq*2], m2
+  movd  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
   jnz .loop4
   RET
 %endif
 %endmacro
 
+INIT_XMM sse2
 convolve_fn copy
 convolve_fn avg
 %if CONFIG_VP9_HIGHBITDEPTH