Merge "seeing a 10x slowing down, revert now for investigation"
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 20b1c8f..92976e1 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -272,10 +272,18 @@
   vp9_fdct16x16_c(in, out, stride);
 }
 
+void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+  vp9_idct16x16_256_add_c(in, dest, stride);
+}
+
 void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
   vp9_fht16x16_c(in, out, stride, tx_type);
 }
 
+void iht16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+  vp9_iht16x16_256_add_c(in, dest, stride, tx_type);
+}
+
 class Trans16x16TestBase {
  public:
   virtual ~Trans16x16TestBase() {}
@@ -378,6 +386,47 @@
     }
   }
 
+  void RunQuantCheck(int dc_thred, int ac_thred) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        input_block[j] = rnd.Rand8() - rnd.Rand8();
+        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+      }
+      if (i == 0)
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = 255;
+      if (i == 1)
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = -255;
+
+      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+
+      // clear reconstructed pixel buffers
+      vpx_memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
+      vpx_memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
+
+      // quantization with maximum allowed step sizes
+      output_ref_block[0] = (output_ref_block[0] / dc_thred) * dc_thred;
+      for (int j = 1; j < kNumCoeffs; ++j)
+        output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred;
+      inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
+      REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
+
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_EQ(ref[j], dst[j]);
+    }
+  }
+
   void RunInvAccuracyCheck() {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 1000;
@@ -414,6 +463,7 @@
   int pitch_;
   int tx_type_;
   fht_t fwd_txfm_ref;
+  iht_t inv_txfm_ref;
 };
 
 class Trans16x16DCT
@@ -428,6 +478,7 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 16;
     fwd_txfm_ref = fdct16x16_ref;
+    inv_txfm_ref = idct16x16_ref;
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
@@ -455,6 +506,12 @@
   RunMemCheck();
 }
 
+TEST_P(Trans16x16DCT, QuantCheck) {
+  // Use maximally allowed quantization step sizes for DC and AC
+  // coefficients respectively.
+  RunQuantCheck(1336, 1828);
+}
+
 TEST_P(Trans16x16DCT, InvAccuracyCheck) {
   RunInvAccuracyCheck();
 }
@@ -471,6 +528,7 @@
     tx_type_  = GET_PARAM(2);
     pitch_    = 16;
     fwd_txfm_ref = fht16x16_ref;
+    inv_txfm_ref = iht16x16_ref;
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
@@ -498,6 +556,12 @@
   RunMemCheck();
 }
 
+TEST_P(Trans16x16HT, QuantCheck) {
+  // The encoder skips any non-DC intra prediction modes,
+  // when the quantization step size goes beyond 988.
+  RunQuantCheck(549, 988);
+}
+
 using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/test/sad_test.cc b/test/sad_test.cc
index adb191f..a692891 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -29,22 +29,12 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 
-#if CONFIG_VP8_ENCODER
 typedef unsigned int (*sad_m_by_n_fn_t)(const unsigned char *source_ptr,
                                         int source_stride,
                                         const unsigned char *reference_ptr,
                                         int reference_stride,
                                         unsigned int max_sad);
 typedef std::tr1::tuple<int, int, sad_m_by_n_fn_t> sad_m_by_n_test_param_t;
-#endif
-#if CONFIG_VP9_ENCODER
-typedef unsigned int (*sad_m_by_n_fn_vp9_t)(const unsigned char *source_ptr,
-                                            int source_stride,
-                                            const unsigned char *reference_ptr,
-                                            int reference_stride);
-typedef std::tr1::tuple<int, int, sad_m_by_n_fn_vp9_t>
-                  sad_m_by_n_test_param_vp9_t;
-#endif
 
 typedef void (*sad_n_by_n_by_4_fn_t)(const uint8_t *src_ptr,
                                      int src_stride,
@@ -97,7 +87,7 @@
 
   // Sum of Absolute Differences. Given two blocks, calculate the absolute
   // difference between two pixels in the same relative location; accumulate.
-  unsigned int ReferenceSAD(unsigned int max_sad, int block_idx) {
+  unsigned int ReferenceSAD(unsigned int max_sad, int block_idx = 0) {
     unsigned int sad = 0;
     const uint8_t* const reference = GetReference(block_idx);
 
@@ -138,9 +128,39 @@
   ACMRandom rnd_;
 };
 
-class SADx4Test
-    : public SADTestBase,
-      public ::testing::WithParamInterface<sad_n_by_n_by_4_test_param_t> {
+class SADTest : public SADTestBase,
+    public ::testing::WithParamInterface<sad_m_by_n_test_param_t> {
+ public:
+  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  unsigned int SAD(unsigned int max_sad, int block_idx = 0) {
+    unsigned int ret;
+    const uint8_t* const reference = GetReference(block_idx);
+
+    REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                            reference, reference_stride_,
+                                            max_sad));
+    return ret;
+  }
+
+  void CheckSad(unsigned int max_sad) {
+    unsigned int reference_sad, exp_sad;
+
+    reference_sad = ReferenceSAD(max_sad);
+    exp_sad = SAD(max_sad);
+
+    if (reference_sad <= max_sad) {
+      ASSERT_EQ(exp_sad, reference_sad);
+    } else {
+      // Alternative implementations are not required to check max_sad
+      ASSERT_GE(exp_sad, reference_sad);
+    }
+  }
+};
+
+class SADx4Test : public SADTestBase,
+    public ::testing::WithParamInterface<sad_n_by_n_by_4_test_param_t> {
  public:
   SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
 
@@ -158,173 +178,23 @@
     unsigned int reference_sad, exp_sad[4];
 
     SADs(exp_sad);
-    for (int block = 0; block < 4; ++block) {
+    for (int block = 0; block < 4; block++) {
       reference_sad = ReferenceSAD(UINT_MAX, block);
 
-      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+      EXPECT_EQ(exp_sad[block], reference_sad) << "block " << block;
     }
   }
 };
 
-#if CONFIG_VP8_ENCODER
-class SADTest
-    : public SADTestBase,
-      public ::testing::WithParamInterface<sad_m_by_n_test_param_t> {
- public:
-  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
-
- protected:
-  unsigned int SAD(unsigned int max_sad, int block_idx) {
-    unsigned int ret;
-    const uint8_t* const reference = GetReference(block_idx);
-
-    REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
-                                            reference, reference_stride_,
-                                            max_sad));
-    return ret;
-  }
-
-  void CheckSAD(unsigned int max_sad) {
-    unsigned int reference_sad, exp_sad;
-
-    reference_sad = ReferenceSAD(max_sad, 0);
-    exp_sad = SAD(max_sad, 0);
-
-    if (reference_sad <= max_sad) {
-      ASSERT_EQ(exp_sad, reference_sad);
-    } else {
-      // Alternative implementations are not required to check max_sad
-      ASSERT_GE(exp_sad, reference_sad);
-    }
-  }
-};
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-class SADVP9Test
-    : public SADTestBase,
-      public ::testing::WithParamInterface<sad_m_by_n_test_param_vp9_t> {
- public:
-  SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
-
- protected:
-  unsigned int SAD(int block_idx) {
-    unsigned int ret;
-    const uint8_t* const reference = GetReference(block_idx);
-
-    REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
-                                            reference, reference_stride_));
-    return ret;
-  }
-
-  void CheckSAD() {
-    unsigned int reference_sad, exp_sad;
-
-    reference_sad = ReferenceSAD(UINT_MAX, 0);
-    exp_sad = SAD(0);
-
-    ASSERT_EQ(reference_sad, exp_sad);
-  }
-};
-#endif  // CONFIG_VP9_ENCODER
-
 uint8_t* SADTestBase::source_data_ = NULL;
 uint8_t* SADTestBase::reference_data_ = NULL;
 
-#if CONFIG_VP8_ENCODER
 TEST_P(SADTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, 255);
-  CheckSAD(UINT_MAX);
+  CheckSad(UINT_MAX);
 }
 
-TEST_P(SADTest, MaxSrc) {
-  FillConstant(source_data_, source_stride_, 255);
-  FillConstant(reference_data_, reference_stride_, 0);
-  CheckSAD(UINT_MAX);
-}
-
-TEST_P(SADTest, ShortRef) {
-  int tmp_stride = reference_stride_;
-  reference_stride_ >>= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSAD(UINT_MAX);
-  reference_stride_ = tmp_stride;
-}
-
-TEST_P(SADTest, UnalignedRef) {
-  // The reference frame, but not the source frame, may be unaligned for
-  // certain types of searches.
-  const int tmp_stride = reference_stride_;
-  reference_stride_ -= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSAD(UINT_MAX);
-  reference_stride_ = tmp_stride;
-}
-
-TEST_P(SADTest, ShortSrc) {
-  const int tmp_stride = source_stride_;
-  source_stride_ >>= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSAD(UINT_MAX);
-  source_stride_ = tmp_stride;
-}
-
-TEST_P(SADTest, MaxSAD) {
-  // Verify that, when max_sad is set, the implementation does not return a
-  // value lower than the reference.
-  FillConstant(source_data_, source_stride_, 255);
-  FillConstant(reference_data_, reference_stride_, 0);
-  CheckSAD(128);
-}
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-TEST_P(SADVP9Test, MaxRef) {
-  FillConstant(source_data_, source_stride_, 0);
-  FillConstant(reference_data_, reference_stride_, 255);
-  CheckSAD();
-}
-
-TEST_P(SADVP9Test, MaxSrc) {
-  FillConstant(source_data_, source_stride_, 255);
-  FillConstant(reference_data_, reference_stride_, 0);
-  CheckSAD();
-}
-
-TEST_P(SADVP9Test, ShortRef) {
-  const int tmp_stride = reference_stride_;
-  reference_stride_ >>= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSAD();
-  reference_stride_ = tmp_stride;
-}
-
-TEST_P(SADVP9Test, UnalignedRef) {
-  // The reference frame, but not the source frame, may be unaligned for
-  // certain types of searches.
-  const int tmp_stride = reference_stride_;
-  reference_stride_ -= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSAD();
-  reference_stride_ = tmp_stride;
-}
-
-TEST_P(SADVP9Test, ShortSrc) {
-  const int tmp_stride = source_stride_;
-  source_stride_ >>= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSAD();
-  source_stride_ = tmp_stride;
-}
-#endif  // CONFIG_VP9_ENCODER
-
 TEST_P(SADx4Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(GetReference(0), reference_stride_, 255);
@@ -334,6 +204,12 @@
   CheckSADs();
 }
 
+TEST_P(SADTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSad(UINT_MAX);
+}
+
 TEST_P(SADx4Test, MaxSrc) {
   FillConstant(source_data_, source_stride_, 255);
   FillConstant(GetReference(0), reference_stride_, 0);
@@ -343,6 +219,15 @@
   CheckSADs();
 }
 
+TEST_P(SADTest, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSad(UINT_MAX);
+  reference_stride_ = tmp_stride;
+}
+
 TEST_P(SADx4Test, ShortRef) {
   int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
@@ -355,6 +240,17 @@
   reference_stride_ = tmp_stride;
 }
 
+TEST_P(SADTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSad(UINT_MAX);
+  reference_stride_ = tmp_stride;
+}
+
 TEST_P(SADx4Test, UnalignedRef) {
   // The reference frame, but not the source frame, may be unaligned for
   // certain types of searches.
@@ -369,6 +265,15 @@
   reference_stride_ = tmp_stride;
 }
 
+TEST_P(SADTest, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSad(UINT_MAX);
+  source_stride_ = tmp_stride;
+}
+
 TEST_P(SADx4Test, ShortSrc) {
   int tmp_stride = source_stride_;
   source_stride_ >>= 1;
@@ -381,6 +286,14 @@
   source_stride_ = tmp_stride;
 }
 
+TEST_P(SADTest, MaxSAD) {
+  // Verify that, when max_sad is set, the implementation does not return a
+  // value lower than the reference.
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSad(128);
+}
+
 using std::tr1::make_tuple;
 
 //------------------------------------------------------------------------------
@@ -391,27 +304,27 @@
 const sad_m_by_n_fn_t sad_16x8_c = vp8_sad16x8_c;
 const sad_m_by_n_fn_t sad_8x8_c = vp8_sad8x8_c;
 const sad_m_by_n_fn_t sad_4x4_c = vp8_sad4x4_c;
+#endif
+#if CONFIG_VP9_ENCODER
+const sad_m_by_n_fn_t sad_64x64_c_vp9 = vp9_sad64x64_c;
+const sad_m_by_n_fn_t sad_32x32_c_vp9 = vp9_sad32x32_c;
+const sad_m_by_n_fn_t sad_16x16_c_vp9 = vp9_sad16x16_c;
+const sad_m_by_n_fn_t sad_8x16_c_vp9 = vp9_sad8x16_c;
+const sad_m_by_n_fn_t sad_16x8_c_vp9 = vp9_sad16x8_c;
+const sad_m_by_n_fn_t sad_8x8_c_vp9 = vp9_sad8x8_c;
+const sad_m_by_n_fn_t sad_8x4_c_vp9 = vp9_sad8x4_c;
+const sad_m_by_n_fn_t sad_4x8_c_vp9 = vp9_sad4x8_c;
+const sad_m_by_n_fn_t sad_4x4_c_vp9 = vp9_sad4x4_c;
+#endif
 const sad_m_by_n_test_param_t c_tests[] = {
+#if CONFIG_VP8_ENCODER
   make_tuple(16, 16, sad_16x16_c),
   make_tuple(8, 16, sad_8x16_c),
   make_tuple(16, 8, sad_16x8_c),
   make_tuple(8, 8, sad_8x8_c),
   make_tuple(4, 4, sad_4x4_c),
-};
-INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
-#endif  // CONFIG_VP8_ENCODER
-
+#endif
 #if CONFIG_VP9_ENCODER
-const sad_m_by_n_fn_vp9_t sad_64x64_c_vp9 = vp9_sad64x64_c;
-const sad_m_by_n_fn_vp9_t sad_32x32_c_vp9 = vp9_sad32x32_c;
-const sad_m_by_n_fn_vp9_t sad_16x16_c_vp9 = vp9_sad16x16_c;
-const sad_m_by_n_fn_vp9_t sad_8x16_c_vp9 = vp9_sad8x16_c;
-const sad_m_by_n_fn_vp9_t sad_16x8_c_vp9 = vp9_sad16x8_c;
-const sad_m_by_n_fn_vp9_t sad_8x8_c_vp9 = vp9_sad8x8_c;
-const sad_m_by_n_fn_vp9_t sad_8x4_c_vp9 = vp9_sad8x4_c;
-const sad_m_by_n_fn_vp9_t sad_4x8_c_vp9 = vp9_sad4x8_c;
-const sad_m_by_n_fn_vp9_t sad_4x4_c_vp9 = vp9_sad4x4_c;
-const sad_m_by_n_test_param_vp9_t c_vp9_tests[] = {
   make_tuple(64, 64, sad_64x64_c_vp9),
   make_tuple(32, 32, sad_32x32_c_vp9),
   make_tuple(16, 16, sad_16x16_c_vp9),
@@ -421,9 +334,11 @@
   make_tuple(8, 4, sad_8x4_c_vp9),
   make_tuple(4, 8, sad_4x8_c_vp9),
   make_tuple(4, 4, sad_4x4_c_vp9),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(C, SADVP9Test, ::testing::ValuesIn(c_vp9_tests));
+INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
+#if CONFIG_VP9_ENCODER
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c;
 const sad_n_by_n_by_4_fn_t sad_64x32x4d_c = vp9_sad64x32x4d_c;
 const sad_n_by_n_by_4_fn_t sad_32x64x4d_c = vp9_sad32x64x4d_c;
@@ -460,8 +375,8 @@
 const sad_m_by_n_fn_t sad_16x16_armv6 = vp8_sad16x16_armv6;
 INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values(
                         make_tuple(16, 16, sad_16x16_armv6)));
-#endif  // CONFIG_VP8_ENCODER
-#endif  // HAVE_MEDIA
+#endif
+#endif
 
 #if HAVE_NEON
 #if CONFIG_VP8_ENCODER
@@ -476,8 +391,8 @@
                         make_tuple(16, 8, sad_16x8_neon),
                         make_tuple(8, 8, sad_8x8_neon),
                         make_tuple(4, 4, sad_4x4_neon)));
-#endif  // CONFIG_VP8_ENCODER
-#endif  // HAVE_NEON
+#endif
+#endif
 
 //------------------------------------------------------------------------------
 // x86 functions
@@ -488,39 +403,40 @@
 const sad_m_by_n_fn_t sad_16x8_mmx = vp8_sad16x8_mmx;
 const sad_m_by_n_fn_t sad_8x8_mmx = vp8_sad8x8_mmx;
 const sad_m_by_n_fn_t sad_4x4_mmx = vp8_sad4x4_mmx;
+#endif
+#if CONFIG_VP9_ENCODER
+const sad_m_by_n_fn_t sad_16x16_mmx_vp9 = vp9_sad16x16_mmx;
+const sad_m_by_n_fn_t sad_8x16_mmx_vp9 = vp9_sad8x16_mmx;
+const sad_m_by_n_fn_t sad_16x8_mmx_vp9 = vp9_sad16x8_mmx;
+const sad_m_by_n_fn_t sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;
+const sad_m_by_n_fn_t sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;
+#endif
+
 const sad_m_by_n_test_param_t mmx_tests[] = {
+#if CONFIG_VP8_ENCODER
   make_tuple(16, 16, sad_16x16_mmx),
   make_tuple(8, 16, sad_8x16_mmx),
   make_tuple(16, 8, sad_16x8_mmx),
   make_tuple(8, 8, sad_8x8_mmx),
   make_tuple(4, 4, sad_4x4_mmx),
-};
-INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
-#endif  // CONFIG_VP8_ENCODER
-
+#endif
 #if CONFIG_VP9_ENCODER
-const sad_m_by_n_fn_vp9_t sad_16x16_mmx_vp9 = vp9_sad16x16_mmx;
-const sad_m_by_n_fn_vp9_t sad_8x16_mmx_vp9 = vp9_sad8x16_mmx;
-const sad_m_by_n_fn_vp9_t sad_16x8_mmx_vp9 = vp9_sad16x8_mmx;
-const sad_m_by_n_fn_vp9_t sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;
-const sad_m_by_n_fn_vp9_t sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;
-const sad_m_by_n_test_param_vp9_t mmx_vp9_tests[] = {
   make_tuple(16, 16, sad_16x16_mmx_vp9),
   make_tuple(8, 16, sad_8x16_mmx_vp9),
   make_tuple(16, 8, sad_16x8_mmx_vp9),
   make_tuple(8, 8, sad_8x8_mmx_vp9),
   make_tuple(4, 4, sad_4x4_mmx_vp9),
+#endif
 };
-INSTANTIATE_TEST_CASE_P(MMX, SADVP9Test, ::testing::ValuesIn(mmx_vp9_tests));
-#endif  // CONFIG_VP9_ENCODER
-#endif  // HAVE_MMX
+INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
+#endif
 
 #if HAVE_SSE
 #if CONFIG_VP9_ENCODER
 #if CONFIG_USE_X86INC
-const sad_m_by_n_fn_vp9_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;
-const sad_m_by_n_fn_vp9_t sad_4x8_sse_vp9 = vp9_sad4x8_sse;
-INSTANTIATE_TEST_CASE_P(SSE, SADVP9Test, ::testing::Values(
+const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;
+const sad_m_by_n_fn_t sad_4x8_sse_vp9 = vp9_sad4x8_sse;
+INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(
                         make_tuple(4, 4, sad_4x4_sse_vp9),
                         make_tuple(4, 8, sad_4x8_sse_vp9)));
 
@@ -540,30 +456,32 @@
 const sad_m_by_n_fn_t sad_16x8_wmt = vp8_sad16x8_wmt;
 const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt;
 const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;
+#endif
+#if CONFIG_VP9_ENCODER
+#if CONFIG_USE_X86INC
+const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
+const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
+const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
+const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
+const sad_m_by_n_fn_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
+const sad_m_by_n_fn_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
+const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
+const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
+const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
+const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
+const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
+#endif
+#endif
 const sad_m_by_n_test_param_t sse2_tests[] = {
+#if CONFIG_VP8_ENCODER
   make_tuple(16, 16, sad_16x16_wmt),
   make_tuple(8, 16, sad_8x16_wmt),
   make_tuple(16, 8, sad_16x8_wmt),
   make_tuple(8, 8, sad_8x8_wmt),
   make_tuple(4, 4, sad_4x4_wmt),
-};
-INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
-#endif  // CONFIG_VP8_ENCODER
-
+#endif
 #if CONFIG_VP9_ENCODER
 #if CONFIG_USE_X86INC
-const sad_m_by_n_fn_vp9_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
-const sad_m_by_n_fn_vp9_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
-const sad_m_by_n_fn_vp9_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
-const sad_m_by_n_fn_vp9_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
-const sad_m_by_n_fn_vp9_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
-const sad_m_by_n_fn_vp9_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
-const sad_m_by_n_fn_vp9_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
-const sad_m_by_n_fn_vp9_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
-const sad_m_by_n_fn_vp9_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
-const sad_m_by_n_fn_vp9_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
-const sad_m_by_n_fn_vp9_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
-const sad_m_by_n_test_param_vp9_t sse2_vp9_tests[] = {
   make_tuple(64, 64, sad_64x64_sse2_vp9),
   make_tuple(64, 32, sad_64x32_sse2_vp9),
   make_tuple(32, 64, sad_32x64_sse2_vp9),
@@ -575,9 +493,13 @@
   make_tuple(8, 16, sad_8x16_sse2_vp9),
   make_tuple(8, 8, sad_8x8_sse2_vp9),
   make_tuple(8, 4, sad_8x4_sse2_vp9),
+#endif
+#endif
 };
-INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::ValuesIn(sse2_vp9_tests));
+INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
+#if CONFIG_VP9_ENCODER
+#if CONFIG_USE_X86INC
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2;
@@ -601,9 +523,9 @@
                         make_tuple(8, 16, sad_8x16x4d_sse2),
                         make_tuple(8, 8, sad_8x8x4d_sse2),
                         make_tuple(8, 4, sad_8x4x4d_sse2)));
-#endif  // CONFIG_USE_X86INC
-#endif  // CONFIG_VP9_ENCODER
-#endif  // HAVE_SSE2
+#endif
+#endif
+#endif
 
 #if HAVE_SSE3
 #if CONFIG_VP8_ENCODER
@@ -618,8 +540,8 @@
                         make_tuple(8, 16, sad_8x16x4d_sse3),
                         make_tuple(8, 8, sad_8x8x4d_sse3),
                         make_tuple(4, 4, sad_4x4x4d_sse3)));
-#endif  // CONFIG_VP8_ENCODER
-#endif  // HAVE_SSE3
+#endif
+#endif
 
 #if HAVE_SSSE3
 #if CONFIG_USE_X86INC
@@ -627,8 +549,8 @@
 const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;
 INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
                         make_tuple(16, 16, sad_16x16_sse3)));
-#endif  // CONFIG_VP8_ENCODER
-#endif  // CONFIG_USE_X86INC
-#endif  // HAVE_SSSE3
+#endif
+#endif
+#endif
 
 }  // namespace
diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx
index 6fdeb87..7201a67 100644
--- a/third_party/googletest/README.libvpx
+++ b/third_party/googletest/README.libvpx
@@ -12,4 +12,4 @@
 generation.
 
 Local Modifications:
-None.
\ No newline at end of file
+Removed unused declarations of kPathSeparatorString to have warning free build.
\ No newline at end of file
diff --git a/third_party/googletest/src/src/gtest-all.cc b/third_party/googletest/src/src/gtest-all.cc
index a9a03b2..8d90627 100644
--- a/third_party/googletest/src/src/gtest-all.cc
+++ b/third_party/googletest/src/src/gtest-all.cc
@@ -7904,7 +7904,6 @@
 // of them.
 const char kPathSeparator = '\\';
 const char kAlternatePathSeparator = '/';
-const char kPathSeparatorString[] = "\\";
 const char kAlternatePathSeparatorString[] = "/";
 # if GTEST_OS_WINDOWS_MOBILE
 // Windows CE doesn't have a current directory. You should not use
@@ -7918,7 +7917,6 @@
 # endif  // GTEST_OS_WINDOWS_MOBILE
 #else
 const char kPathSeparator = '/';
-const char kPathSeparatorString[] = "/";
 const char kCurrentDirectoryString[] = "./";
 #endif  // GTEST_OS_WINDOWS
 
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index c300cde..95cb40a 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -528,82 +528,82 @@
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
 
-add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad64x64/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad32x64/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad64x32/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad32x16/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad16x32/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad32x32/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad16x16 mmx/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad8x8 mmx/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad8x4/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad4x8/, "$sse_x86inc";
 
-add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
 specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc";
 
-add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad32x64_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad64x32_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad32x16_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad32x32_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad4x8_avg/, "$sse_x86inc";
 
-add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad4x4_avg/, "$sse_x86inc";
 
 add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
diff --git a/vp9/common/x86/vp9_idct_intrin_ssse3.c b/vp9/common/x86/vp9_idct_intrin_ssse3.c
index e5d3cb5..0930e78 100644
--- a/vp9/common/x86/vp9_idct_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_idct_intrin_ssse3.c
@@ -16,7 +16,7 @@
 #include <tmmintrin.h>  // SSSE3
 #include "vp9/common/x86/vp9_idct_intrin_sse2.h"
 
-static void idct16_8col(__m128i *in) {
+static void idct16_8col(__m128i *in, int round) {
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -36,6 +36,8 @@
   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
 
   __m128i v[16], u[16], s[16], t[16];
 
@@ -266,28 +268,80 @@
   t[15] = _mm_add_epi16(s[12], s[15]);
 
   // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
+  if (round == 1) {
+    s[0] = _mm_add_epi16(t[0], t[7]);
+    s[1] = _mm_add_epi16(t[1], t[6]);
+    s[2] = _mm_add_epi16(t[2], t[5]);
+    s[3] = _mm_add_epi16(t[3], t[4]);
+    s[4] = _mm_sub_epi16(t[3], t[4]);
+    s[5] = _mm_sub_epi16(t[2], t[5]);
+    s[6] = _mm_sub_epi16(t[1], t[6]);
+    s[7] = _mm_sub_epi16(t[0], t[7]);
+    s[8] = t[8];
+    s[9] = t[9];
 
-  u[0] = _mm_sub_epi16(t[13], t[10]);
-  u[1] = _mm_add_epi16(t[13], t[10]);
-  u[2] = _mm_sub_epi16(t[12], t[11]);
-  u[3] = _mm_add_epi16(t[12], t[11]);
+    u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+    u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+    u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+    u[3] = _mm_unpackhi_epi16(t[11], t[12]);
 
-  s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
-  s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
-  s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
-  s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
-  s[14] = t[14];
-  s[15] = t[15];
+    v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+    v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+    v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+    v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+    v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+    v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+    v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+    v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+    u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+    u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+    u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+    u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+    u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+    u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+    u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+    u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+    u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+    u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+    u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+    u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+    u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+    u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+    u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+    u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+    s[10] = _mm_packs_epi32(u[0], u[1]);
+    s[13] = _mm_packs_epi32(u[2], u[3]);
+    s[11] = _mm_packs_epi32(u[4], u[5]);
+    s[12] = _mm_packs_epi32(u[6], u[7]);
+    s[14] = t[14];
+    s[15] = t[15];
+  } else {
+    s[0] = _mm_add_epi16(t[0], t[7]);
+    s[1] = _mm_add_epi16(t[1], t[6]);
+    s[2] = _mm_add_epi16(t[2], t[5]);
+    s[3] = _mm_add_epi16(t[3], t[4]);
+    s[4] = _mm_sub_epi16(t[3], t[4]);
+    s[5] = _mm_sub_epi16(t[2], t[5]);
+    s[6] = _mm_sub_epi16(t[1], t[6]);
+    s[7] = _mm_sub_epi16(t[0], t[7]);
+    s[8] = t[8];
+    s[9] = t[9];
+
+    u[0] = _mm_sub_epi16(t[13], t[10]);
+    u[1] = _mm_add_epi16(t[13], t[10]);
+    u[2] = _mm_sub_epi16(t[12], t[11]);
+    u[3] = _mm_add_epi16(t[12], t[11]);
+
+    s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
+    s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
+    s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
+    s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
+    s[14] = t[14];
+    s[15] = t[15];
+  }
 
   // stage 7
   in[0] = _mm_add_epi16(s[0], s[15]);
@@ -308,10 +362,10 @@
   in[15] = _mm_sub_epi16(s[0], s[15]);
 }
 
-static void idct16_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_sse2(__m128i *in0, __m128i *in1, int round) {
   array_transpose_16x16(in0, in1);
-  idct16_8col(in0);
-  idct16_8col(in1);
+  idct16_8col(in0, round);
+  idct16_8col(in1, round);
 }
 
 void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest,
@@ -322,8 +376,8 @@
   input += 8;
   load_buffer_8x16(input, in1);
 
-  idct16_sse2(in0, in1);
-  idct16_sse2(in0, in1);
+  idct16_sse2(in0, in1, 0);
+  idct16_sse2(in0, in1, 1);
 
   write_buffer_8x16(dest, in0, stride);
   dest += 8;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5d3735e..2e4cd86 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1291,14 +1291,14 @@
   if (row8x8_remaining >= MI_BLOCK_SIZE &&
       col8x8_remaining >= MI_BLOCK_SIZE) {
     this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride,
-                                            pre, pre_stride);
+                                            pre, pre_stride, 0x7fffffff);
     threshold = (1 << 12);
   } else {
     int r, c;
     for (r = 0; r < row8x8_remaining; r += 2)
       for (c = 0; c < col8x8_remaining; c += 2)
-        this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride,
-                                                 pre, pre_stride);
+        this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride, pre,
+                                                 pre_stride, 0x7fffffff);
     threshold = (row8x8_remaining * col8x8_remaining) << 6;
   }
 
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 842bc5b..041e583 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -72,7 +72,8 @@
   x->mv_row_max = tmp_row_max;
 
   return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-          xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+          xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+          INT_MAX);
 }
 
 static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
@@ -85,7 +86,8 @@
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
   err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                     INT_MAX);
   dst_mv->as_int = 0;
 
   // Test last reference frame using the previous best mv as the
@@ -121,7 +123,8 @@
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
   err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                     INT_MAX);
 
   dst_mv->as_int = 0;
 
@@ -144,7 +147,7 @@
                             xd->plane[0].dst.buf, xd->plane[0].dst.stride,
                             0, 0, 0);
     err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+                       xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err);
 
     // find best
     if (err < best_err) {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 9d2b2a4..4f7d6f1 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -524,7 +524,9 @@
 
   // Work out the start point for the search
   bestsad = vfp->sdf(what->buf, what->stride,
-                     get_buf_from_mv(in_what, ref_mv), in_what->stride);
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride,
+                     0x7fffffff) + mvsad_err_cost(x, ref_mv, &fcenter_mv,
+                                                  sad_per_bit);
 
   // Search all possible scales upto the search param around the center point
   // pick the scale of the point that is best as the starting scale of
@@ -540,7 +542,7 @@
                               bc + candidates[t][i].col};
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride);
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       } else {
@@ -551,7 +553,7 @@
             continue;
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride);
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       }
@@ -583,7 +585,7 @@
                                 bc + candidates[s][i].col};
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         } else {
@@ -594,7 +596,7 @@
               continue;
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         }
@@ -621,7 +623,7 @@
                                 bc + candidates[s][next_chkpts_indices[i]].col};
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         } else {
@@ -632,7 +634,7 @@
               continue;
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         }
@@ -659,7 +661,7 @@
                               bc + neighbors[i].col};
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride);
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       } else {
@@ -670,7 +672,7 @@
             continue;
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride);
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       }
@@ -892,7 +894,8 @@
   *best_mv = *ref_mv;
   *num00 = 11;
   best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+                         get_buf_from_mv(in_what, ref_mv), in_what->stride,
+                         0x7fffffff) +
                  mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   start_row = MAX(-range, x->mv_row_min - ref_mv->row);
   start_col = MAX(-range, x->mv_col_min - ref_mv->col);
@@ -926,7 +929,7 @@
         for (i = 0; i < end_col - c; ++i) {
           const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
           unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-              get_buf_from_mv(in_what, &mv), in_what->stride);
+              get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
             if (sad < best_sad) {
@@ -972,7 +975,7 @@
 
   // Check the starting position
   best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         best_address, in_what->stride) +
+                         best_address, in_what->stride, 0x7fffffff) +
       mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
   i = 1;
@@ -983,7 +986,8 @@
                      best_mv->col + ss[i].mv.col};
       if (is_mv_in(x, &mv)) {
        int sad = fn_ptr->sdf(what->buf, what->stride,
-                             best_address + ss[i].offset, in_what->stride);
+                             best_address + ss[i].offset, in_what->stride,
+                             best_sad);
         if (sad < best_sad) {
           sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
           if (sad < best_sad) {
@@ -1008,7 +1012,7 @@
         if (is_mv_in(x, &this_mv)) {
           int sad = fn_ptr->sdf(what->buf, what->stride,
                                 best_address + ss[best_site].offset,
-                                in_what->stride);
+                                in_what->stride, best_sad);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
             if (sad < best_sad) {
@@ -1073,7 +1077,7 @@
   best_address = in_what;
 
   // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
                 + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
   i = 1;
@@ -1125,7 +1129,7 @@
         if (is_mv_in(x, &this_mv)) {
           const uint8_t *const check_here = ss[i].offset + best_address;
           unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                             in_what_stride);
+                                             in_what_stride, bestsad);
 
           if (thissad < bestsad) {
             thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
@@ -1150,7 +1154,7 @@
         if (is_mv_in(x, &this_mv)) {
           const uint8_t *const check_here = ss[best_site].offset + best_address;
           unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                             in_what_stride);
+                                             in_what_stride, bestsad);
           if (thissad < bestsad) {
             thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
                                       mvjsadcost, mvsadcost, sad_per_bit);
@@ -1249,7 +1253,7 @@
   const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   int best_sad = fn_ptr->sdf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   *best_mv = *ref_mv;
 
@@ -1257,7 +1261,7 @@
     for (c = col_min; c < col_max; ++c) {
       const MV mv = {r, c};
       const int sad = fn_ptr->sdf(what->buf, what->stride,
-          get_buf_from_mv(in_what, &mv), in_what->stride) +
+          get_buf_from_mv(in_what, &mv), in_what->stride, best_sad) +
               mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
       if (sad < best_sad) {
         best_sad = sad;
@@ -1282,7 +1286,7 @@
   const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   *best_mv = *ref_mv;
 
@@ -1316,7 +1320,7 @@
 
     while (c < col_max) {
       unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-                                     check_here, in_what->stride);
+                                     check_here, in_what->stride, best_sad);
       if (sad < best_sad) {
         const MV mv = {r, c};
         sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
@@ -1347,7 +1351,7 @@
   const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   *best_mv = *ref_mv;
 
@@ -1405,7 +1409,7 @@
 
     while (c < col_max) {
       unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-                                     check_here, in_what->stride);
+                                     check_here, in_what->stride, best_sad);
       if (sad < best_sad) {
         const MV mv = {r, c};
         sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
@@ -1434,7 +1438,7 @@
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
                                      get_buf_from_mv(in_what, ref_mv),
-                                     in_what->stride) +
+                                     in_what->stride, 0x7fffffff) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -1446,7 +1450,7 @@
                      ref_mv->col + neighbors[j].col};
       if (is_mv_in(x, &mv)) {
         unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-            get_buf_from_mv(in_what, &mv), in_what->stride);
+            get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
         if (sad < best_sad) {
           sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
           if (sad < best_sad) {
@@ -1479,7 +1483,7 @@
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, best_address,
-                                    in_what->stride) +
+                                    in_what->stride, 0x7fffffff) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -1520,7 +1524,7 @@
         if (is_mv_in(x, &mv)) {
           unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
                                          get_buf_from_mv(in_what, &mv),
-                                         in_what->stride);
+                                         in_what->stride, best_sad);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
             if (sad < best_sad) {
@@ -1559,7 +1563,8 @@
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride,
+      second_pred, 0x7fffffff) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -1572,7 +1577,8 @@
 
       if (is_mv_in(x, &mv)) {
         unsigned int sad = fn_ptr->sdaf(what->buf, what->stride,
-            get_buf_from_mv(in_what, &mv), in_what->stride, second_pred);
+            get_buf_from_mv(in_what, &mv), in_what->stride,
+            second_pred, best_sad);
         if (sad < best_sad) {
           sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
           if (sad < best_sad) {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 11633a7..913b8ea 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -280,8 +280,7 @@
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
       int rate_mv = 0;
 
-      if (cpi->sf.disable_inter_mode_mask[bsize] &
-          (1 << INTER_OFFSET(this_mode)))
+      if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
         continue;
 
       if (rd_less_than_thresh(best_rd, rd_threshes[mode_idx[this_mode]],
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index d402d7b..1557c8d 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1675,9 +1675,9 @@
 static int check_best_zero_mv(
     const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-    int disable_inter_mode_mask, int this_mode,
+    int inter_mode_mask, int this_mode,
     const MV_REFERENCE_FRAME ref_frames[2]) {
-  if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) &&
+  if ((inter_mode_mask & (1 << ZEROMV)) &&
       (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
       (ref_frames[1] == NONE ||
@@ -1743,7 +1743,7 @@
   ENTROPY_CONTEXT t_above[2], t_left[2];
   int subpelmv = 1, have_ref = 0;
   const int has_second_rf = has_second_ref(mbmi);
-  const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
+  const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
 
   vp9_zero(*bsi);
 
@@ -1792,11 +1792,11 @@
 
         mode_idx = INTER_OFFSET(this_mode);
         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
-        if (disable_inter_mode_mask & (1 << mode_idx))
+        if (!(inter_mode_mask & (1 << this_mode)))
           continue;
 
         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                disable_inter_mode_mask,
+                                inter_mode_mask,
                                 this_mode, mbmi->ref_frame))
           continue;
 
@@ -2129,7 +2129,8 @@
 
     // Find sad for current vector.
     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
-                                           ref_y_ptr, ref_y_stride);
+                                           ref_y_ptr, ref_y_stride,
+                                           0x7fffffff);
 
     // Note if it is the best so far.
     if (this_sad < best_sad) {
@@ -3063,7 +3064,7 @@
   const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
   const int intra_y_mode_mask =
       cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
-  int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
+  int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
   vp9_zero(best_mbmode);
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -3130,7 +3131,7 @@
     const int inter_non_zero_mode_mask = 0x1F7F7;
     mode_skip_mask |= inter_non_zero_mode_mask;
     mode_skip_mask &= ~(1 << THR_ZEROMV);
-    disable_inter_mode_mask = ~(1 << INTER_OFFSET(ZEROMV));
+    inter_mode_mask = (1 << ZEROMV);
   }
 
   // Disable this drop out case if the ref frame
@@ -3182,7 +3183,8 @@
       mode_index = THR_ZEROMV;
     mode_skip_mask = ~(1 << mode_index);
     mode_skip_start = MAX_MODES;
-    disable_inter_mode_mask = 0;
+    inter_mode_mask = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
+                      (1 << NEWMV);
   }
 
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
@@ -3229,8 +3231,7 @@
 
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
-    if (ref_frame != INTRA_FRAME &&
-        disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode)))
+    if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
       continue;
     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
 
@@ -3279,7 +3280,7 @@
           !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
         const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                disable_inter_mode_mask, this_mode, ref_frames))
+                                inter_mode_mask, this_mode, ref_frames))
           continue;
       }
     }
@@ -3665,7 +3666,6 @@
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
-  int ref_frame_mask = 0;
   int mode_skip_mask = 0;
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
@@ -3700,17 +3700,6 @@
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  for (ref_frame = LAST_FRAME;
-       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
-    int i;
-    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-      if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
-        ref_frame_mask |= (1 << ref_frame);
-        break;
-      }
-    }
-  }
-
   for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c
index d062636..892e905 100644
--- a/vp9/encoder/vp9_sad.c
+++ b/vp9/encoder/vp9_sad.c
@@ -35,12 +35,14 @@
 
 #define sadMxN(m, n) \
 unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
-                                  const uint8_t *ref, int ref_stride) { \
+                                  const uint8_t *ref, int ref_stride, \
+                                  unsigned int max_sad) { \
   return sad(src, src_stride, ref, ref_stride, m, n); \
 } \
 unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
-                                      const uint8_t *second_pred) { \
+                                      const uint8_t *second_pred, \
+                                      unsigned int max_sad) { \
   uint8_t comp_pred[m * n]; \
   vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
   return sad(src, src_stride, comp_pred, m, m, n); \
@@ -52,7 +54,8 @@
                                 unsigned int *sads) { \
   int i; \
   for (i = 0; i < k; ++i) \
-    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
+    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride, \
+                                   0x7fffffff); \
 }
 
 #define sadMxNx4D(m, n) \
@@ -61,7 +64,8 @@
                              unsigned int *sads) { \
   int i; \
   for (i = 0; i < 4; ++i) \
-    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
+    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride, \
+                                   0x7fffffff); \
 }
 
 // 64x64
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 4c3d34d..15b9861 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -28,6 +28,12 @@
 };
 
 enum {
+  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV)
+};
+
+enum {
   DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
                               (1 << THR_COMP_LA) |
                               (1 << THR_ALTR) |
@@ -234,10 +240,10 @@
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->frame_parameter_update = 0;
     sf->search_method = FAST_HEX;
-    sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV);
-    sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV));
-    sf->disable_inter_mode_mask[BLOCK_64X32] = ~(1 << INTER_OFFSET(NEARESTMV));
-    sf->disable_inter_mode_mask[BLOCK_64X64] = ~(1 << INTER_OFFSET(NEARESTMV));
+    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
+    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
     sf->max_intra_bsize = BLOCK_32X32;
     sf->allow_skip_recode = 1;
   }
@@ -268,7 +274,7 @@
   if (speed >= 7) {
     int i;
     for (i = 0; i < BLOCK_SIZES; ++i)
-      sf->disable_inter_mode_mask[i] = ~(1 << INTER_OFFSET(NEARESTMV));
+      sf->inter_mode_mask[i] = INTER_NEAREST;
   }
 }
 
@@ -326,7 +332,7 @@
   sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
   sf->use_nonrd_pick_mode = 0;
   for (i = 0; i < BLOCK_SIZES; ++i)
-    sf->disable_inter_mode_mask[i] = 0;
+    sf->inter_mode_mask[i] = INTER_ALL;
   sf->max_intra_bsize = BLOCK_64X64;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index a54599e..3e7cd27 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -331,8 +331,8 @@
   int use_nonrd_pick_mode;
 
   // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
-  // modes are disabled in order from LSB to MSB for each BLOCK_SIZE.
-  int disable_inter_mode_mask[BLOCK_SIZES];
+  // modes are used in order from LSB to MSB for each BLOCK_SIZE.
+  int inter_mode_mask[BLOCK_SIZES];
 
   // This feature controls whether we do the expensive context update and
   // calculation in the rd coefficient costing loop.
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 4a194b7..c47fe13 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -25,13 +25,15 @@
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
                                     const uint8_t *ref_ptr,
-                                    int ref_stride);
+                                    int ref_stride,
+                                    unsigned int max_sad);
 
 typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
                                         int source_stride,
                                         const uint8_t *ref_ptr,
                                         int ref_stride,
-                                        const uint8_t *second_pred);
+                                        const uint8_t *second_pred,
+                                        unsigned int max_sad);
 
 typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
                                    int source_stride,
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 6e56c84..48110b4 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -38,7 +38,6 @@
   vpx_decrypt_cb          decrypt_cb;
   void                   *decrypt_state;
   vpx_image_t             img;
-  int                     img_avail;
   int                     invert_tile_order;
 
   // External frame buffer info to save for VP9 common.
@@ -245,15 +244,11 @@
 static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
                                   const uint8_t **data, unsigned int data_sz,
                                   void *user_priv, int64_t deadline) {
-  YV12_BUFFER_CONFIG sd;
-  vp9_ppflags_t flags = {0, 0, 0};
+  vp9_ppflags_t flags = {0};
   VP9_COMMON *cm = NULL;
 
   (void)deadline;
 
-  vp9_zero(sd);
-  ctx->img_avail = 0;
-
   // Determine the stream parameters. Note that we rely on peek_si to
   // validate that we have a buffer that does not wrap around the top
   // of the heap.
@@ -288,13 +283,6 @@
   if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
     set_ppflags(ctx, &flags);
 
-  if (vp9_get_raw_frame(ctx->pbi, &sd, &flags))
-    return update_error_state(ctx, &cm->error);
-
-  yuvconfig2image(&ctx->img, &sd, user_priv);
-  ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
-  ctx->img_avail = 1;
-
   return VPX_CODEC_OK;
 }
 
@@ -423,15 +411,20 @@
                                       vpx_codec_iter_t *iter) {
   vpx_image_t *img = NULL;
 
-  if (ctx->img_avail) {
-    // iter acts as a flip flop, so an image is only returned on the first
-    // call to get_frame.
-    if (!(*iter)) {
+  // iter acts as a flip flop, so an image is only returned on the first
+  // call to get_frame.
+  if (*iter == NULL && ctx->pbi != NULL) {
+    YV12_BUFFER_CONFIG sd;
+    vp9_ppflags_t flags = {0, 0, 0};
+
+    if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) {
+      VP9_COMMON *cm = &ctx->pbi->common;
+      yuvconfig2image(&ctx->img, &sd, NULL);
+      ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
       img = &ctx->img;
       *iter = img;
     }
   }
-  ctx->img_avail = 0;
 
   return img;
 }
diff --git a/vpx/exports_enc b/vpx/exports_enc
index 155faf6..07f0280 100644
--- a/vpx/exports_enc
+++ b/vpx/exports_enc
@@ -8,7 +8,6 @@
 text vpx_codec_set_cx_data_buf
 text vpx_svc_dump_statistics
 text vpx_svc_encode
-text vpx_svc_free
 text vpx_svc_get_buffer
 text vpx_svc_get_encode_frame_count
 text vpx_svc_get_frame_size
@@ -22,4 +21,4 @@
 text vpx_svc_set_scale_factors
 text vpx_svc_get_layer_resolution
 text vpx_svc_get_rc_stats_buffer_size
-text vpx_svc_get_rc_stats_buffer
\ No newline at end of file
+text vpx_svc_get_rc_stats_buffer