Merge "Re-factor and simplify arnr filter."

commit: af4f390fff466f27e2b9f9f9e3d524e0f041e00b [log] [tgz]
author: Adrian Grange <agrange@google.com> Fri Jul 11 10:52:09 2014 -0700
committer: Gerrit Code Review <gerrit@gerrit.golo.chromium.org> Fri Jul 11 10:52:09 2014 -0700
tree: 077d7d5deb0035b02ac26f8e4e43870961cab18c
parent: a75d55df1b362cb608f1ea6b70154cf166c32a3f [diff]
parent: e3e6e06155e3bdefac9775b6b50d84aaf1aca06c [diff]
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index 2f88b53..1b9c943 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc

@@ -7,8 +7,6 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <climits>
-#include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -17,11 +15,12 @@
 
 namespace {
 
-class AqSegmentTest : public ::libvpx_test::EncoderTest,
-    public ::libvpx_test::CodecTestWith2Params<
-        libvpx_test::TestMode, int> {
+class AqSegmentTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~AqSegmentTest() {}
 
   virtual void SetUp() {
     InitializeConfig();
@@ -39,10 +38,6 @@
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
-    }
-  }
   int set_cpu_used_;
   int aq_mode_;
 };
@@ -107,13 +102,8 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-using std::tr1::make_tuple;
-
-#define VP9_FACTORY \
-  static_cast<const libvpx_test::CodecFactory*> (&libvpx_test::kVP9)
-
 VP9_INSTANTIATE_TEST_CASE(AqSegmentTest,
                           ::testing::Values(::libvpx_test::kRealTime,
                                             ::libvpx_test::kOnePassGood),
-                                            ::testing::Range(3, 9));
+                          ::testing::Range(3, 9));
 }  // namespace

diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 6af2abb..2fcb2df 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc

@@ -264,7 +264,7 @@
   uint8_t* const out = output();
   DECLARE_ALIGNED(256, const int16_t, filter8[8]) = {0, 0, 0, 128, 0, 0, 0, 0};
 
-  REGISTER_STATE_CHECK(
+  ASM_REGISTER_STATE_CHECK(
       UUT_->h8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
                 Width(), Height()));
 
@@ -281,7 +281,7 @@
   uint8_t* const out = output();
   DECLARE_ALIGNED(256, const int16_t, filter8[8]) = {0, 0, 0, 128, 0, 0, 0, 0};
 
-  REGISTER_STATE_CHECK(
+  ASM_REGISTER_STATE_CHECK(
       UUT_->v8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
                 Width(), Height()));
 
@@ -298,7 +298,7 @@
   uint8_t* const out = output();
   DECLARE_ALIGNED(256, const int16_t, filter8[8]) = {0, 0, 0, 128, 0, 0, 0, 0};
 
-  REGISTER_STATE_CHECK(
+  ASM_REGISTER_STATE_CHECK(
       UUT_->hv8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
                  Width(), Height()));
 
@@ -356,17 +356,17 @@
                            Width(), Height());
 
         if (filters == eighttap_smooth || (filter_x && filter_y))
-          REGISTER_STATE_CHECK(
+          ASM_REGISTER_STATE_CHECK(
               UUT_->hv8_(in, kInputStride, out, kOutputStride,
                          filters[filter_x], 16, filters[filter_y], 16,
                          Width(), Height()));
         else if (filter_y)
-          REGISTER_STATE_CHECK(
+          ASM_REGISTER_STATE_CHECK(
               UUT_->v8_(in, kInputStride, out, kOutputStride,
                         kInvalidFilter, 16, filters[filter_y], 16,
                         Width(), Height()));
         else
-          REGISTER_STATE_CHECK(
+          ASM_REGISTER_STATE_CHECK(
               UUT_->h8_(in, kInputStride, out, kOutputStride,
                         filters[filter_x], 16, kInvalidFilter, 16,
                         Width(), Height()));
@@ -414,17 +414,17 @@
                                    Width(), Height());
 
         if (filters == eighttap_smooth || (filter_x && filter_y))
-          REGISTER_STATE_CHECK(
+          ASM_REGISTER_STATE_CHECK(
               UUT_->hv8_avg_(in, kInputStride, out, kOutputStride,
                              filters[filter_x], 16, filters[filter_y], 16,
                              Width(), Height()));
         else if (filter_y)
-          REGISTER_STATE_CHECK(
+          ASM_REGISTER_STATE_CHECK(
               UUT_->v8_avg_(in, kInputStride, out, kOutputStride,
                             filters[filter_x], 16, filters[filter_y], 16,
                             Width(), Height()));
         else
-          REGISTER_STATE_CHECK(
+          ASM_REGISTER_STATE_CHECK(
               UUT_->h8_avg_(in, kInputStride, out, kOutputStride,
                             filters[filter_x], 16, filters[filter_y], 16,
                             Width(), Height()));
@@ -494,9 +494,10 @@
    */
 
   /* Test the horizontal filter. */
-  REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
-                                 kChangeFilters[kInitialSubPelOffset],
-                                 kInputPixelStep, NULL, 0, Width(), Height()));
+  ASM_REGISTER_STATE_CHECK(
+      UUT_->h8_(in, kInputStride, out, kOutputStride,
+                kChangeFilters[kInitialSubPelOffset],
+                kInputPixelStep, NULL, 0, Width(), Height()));
 
   for (int x = 0; x < Width(); ++x) {
     const int kFilterPeriodAdjust = (x >> 3) << 3;
@@ -508,9 +509,10 @@
   }
 
   /* Test the vertical filter. */
-  REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
-                                 NULL, 0, kChangeFilters[kInitialSubPelOffset],
-                                 kInputPixelStep, Width(), Height()));
+  ASM_REGISTER_STATE_CHECK(
+      UUT_->v8_(in, kInputStride, out, kOutputStride,
+                NULL, 0, kChangeFilters[kInitialSubPelOffset],
+                kInputPixelStep, Width(), Height()));
 
   for (int y = 0; y < Height(); ++y) {
     const int kFilterPeriodAdjust = (y >> 3) << 3;
@@ -522,12 +524,11 @@
   }
 
   /* Test the horizontal and vertical filters in combination. */
-  REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                                  kChangeFilters[kInitialSubPelOffset],
-                                  kInputPixelStep,
-                                  kChangeFilters[kInitialSubPelOffset],
-                                  kInputPixelStep,
-                                  Width(), Height()));
+  ASM_REGISTER_STATE_CHECK(
+      UUT_->hv8_(in, kInputStride, out, kOutputStride,
+                 kChangeFilters[kInitialSubPelOffset], kInputPixelStep,
+                 kChangeFilters[kInitialSubPelOffset], kInputPixelStep,
+                 Width(), Height()));
 
   for (int y = 0; y < Height(); ++y) {
     const int kFilterPeriodAdjustY = (y >> 3) << 3;
@@ -560,10 +561,10 @@
   for (int frac = 0; frac < 16; ++frac) {
     for (int step = 1; step <= 32; ++step) {
       /* Test the horizontal and vertical filters in combination. */
-      REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                                      eighttap[frac], step,
-                                      eighttap[frac], step,
-                                      Width(), Height()));
+      ASM_REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
+                                          eighttap[frac], step,
+                                          eighttap[frac], step,
+                                          Width(), Height()));
 
       CheckGuardBlocks();
 

diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index 961a0b8..9dca601 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc

@@ -7,8 +7,6 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <climits>
-#include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -20,9 +18,9 @@
 
 const int kMaxPSNR = 100;
 
-class CpuSpeedTest : public ::libvpx_test::EncoderTest,
-    public ::libvpx_test::CodecTestWith2Params<
-        libvpx_test::TestMode, int> {
+class CpuSpeedTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   CpuSpeedTest()
       : EncoderTest(GET_PARAM(0)),
@@ -60,11 +58,6 @@
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
-    }
-  }
-
   virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
     if (pkt->data.psnr.psnr[0] < min_psnr_)
       min_psnr_ = pkt->data.psnr.psnr[0];
@@ -126,11 +119,11 @@
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
+
 TEST_P(CpuSpeedTest, TestLowBitrate) {
   // Validate that this clip encodes and decodes without a mismatch
   // when passing in a very high min q.  This pushes the encoder to producing
   // lots of small partitions which might will test the other condition.
-
   cfg_.rc_2pass_vbr_minsection_pct = 5;
   cfg_.rc_2pass_vbr_minsection_pct = 2000;
   cfg_.rc_target_bitrate = 200;
@@ -142,11 +135,6 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-using std::tr1::make_tuple;
-
-#define VP9_FACTORY \
-  static_cast<const libvpx_test::CodecFactory*> (&libvpx_test::kVP9)
-
 VP9_INSTANTIATE_TEST_CASE(
     CpuSpeedTest,
     ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,

diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index e6a20fb..b7c2dcc 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc

@@ -311,9 +311,9 @@
         test_input_block[j] = src[j] - dst[j];
       }
 
-      REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
-                                      test_temp_block, pitch_));
-      REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+                                          test_temp_block, pitch_));
+      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
 
       for (int j = 0; j < kNumCoeffs; ++j) {
         const uint32_t diff = dst[j] - src[j];
@@ -344,7 +344,7 @@
         input_block[j] = rnd.Rand8() - rnd.Rand8();
 
       fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
-      REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
 
       // The minimum quant value is 4.
       for (int j = 0; j < kNumCoeffs; ++j)
@@ -375,8 +375,8 @@
       }
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
-      REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
-                                      output_block, pitch_));
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+                                          output_block, pitch_));
 
       // The minimum quant value is 4.
       for (int j = 0; j < kNumCoeffs; ++j) {
@@ -421,7 +421,7 @@
       for (int j = 1; j < kNumCoeffs; ++j)
         output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred;
       inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
-      REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
+      ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
 
       for (int j = 0; j < kNumCoeffs; ++j)
         EXPECT_EQ(ref[j], dst[j]);
@@ -450,7 +450,7 @@
       for (int j = 0; j < kNumCoeffs; ++j)
         coeff[j] = round(out_r[j]);
 
-      REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
+      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
 
       for (int j = 0; j < kNumCoeffs; ++j) {
         const uint32_t diff = dst[j] - src[j];

diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 501c696..02250a9 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc

@@ -112,8 +112,8 @@
       test_input_block[j] = src[j] - dst[j];
     }
 
-    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
-    REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+    ASM_REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
+    ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
 
     for (int j = 0; j < kNumCoeffs; ++j) {
       const uint32_t diff = dst[j] - src[j];
@@ -150,7 +150,7 @@
 
     const int stride = 32;
     vp9_fdct32x32_c(input_block, output_ref_block, stride);
-    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride));
+    ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride));
 
     if (version_ == 0) {
       for (int j = 0; j < kNumCoeffs; ++j)
@@ -189,7 +189,8 @@
 
     const int stride = 32;
     vp9_fdct32x32_c(input_extreme_block, output_ref_block, stride);
-    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, stride));
+    ASM_REGISTER_STATE_CHECK(
+        fwd_txfm_(input_extreme_block, output_block, stride));
 
     // The minimum quant value is 4.
     for (int j = 0; j < kNumCoeffs; ++j) {
@@ -230,7 +231,7 @@
     reference_32x32_dct_2d(in, out_r);
     for (int j = 0; j < kNumCoeffs; ++j)
       coeff[j] = round(out_r[j]);
-    REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+    ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
     for (int j = 0; j < kNumCoeffs; ++j) {
       const int diff = dst[j] - src[j];
       const int error = diff * diff;

diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 8bea4cc..187a1c7 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc

@@ -32,7 +32,7 @@
                                      void *user_priv) {
   vpx_codec_err_t res_dec;
   InitOnce();
-  REGISTER_STATE_CHECK(
+  API_REGISTER_STATE_CHECK(
       res_dec = vpx_codec_decode(&decoder_,
                                  cxdata, static_cast<unsigned int>(size),
                                  user_priv, 0));

diff --git a/test/decode_to_md5.sh b/test/decode_to_md5.sh
index 28e29c6..6cb7d0e 100755
--- a/test/decode_to_md5.sh
+++ b/test/decode_to_md5.sh

@@ -39,7 +39,8 @@
     return 1
   fi
 
-  eval "${decoder}" "${input_file}" "${output_file}" ${devnull}
+  eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+      ${devnull}
 
   [ -e "${output_file}" ] || return 1
 

diff --git a/test/decode_with_drops.sh b/test/decode_with_drops.sh
index 12e17de..9b2edb6 100755
--- a/test/decode_with_drops.sh
+++ b/test/decode_with_drops.sh

@@ -39,7 +39,8 @@
     return 1
   fi
 
-  eval "${decoder}" "${input_file}" "${output_file}" "${drop_mode}" ${devnull}
+  eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+      "${drop_mode}" ${devnull}
 
   [ -e "${output_file}" ] || return 1
 }

diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 709831e..b30c3e0 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc

@@ -59,7 +59,7 @@
   }
 
   // Encode the frame
-  REGISTER_STATE_CHECK(
+  API_REGISTER_STATE_CHECK(
       res = vpx_codec_encode(&encoder_,
                              video.img(), video.pts(), video.duration(),
                              frame_flags, deadline_));

diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index ec233d3..ab636bf 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc

@@ -79,9 +79,9 @@
         test_input_block[j] = src[j] - dst[j];
       }
 
-      REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
-                                      test_temp_block, pitch_));
-      REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+                                          test_temp_block, pitch_));
+      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
 
       for (int j = 0; j < kNumCoeffs; ++j) {
         const uint32_t diff = dst[j] - src[j];
@@ -114,7 +114,7 @@
         input_block[j] = rnd.Rand8() - rnd.Rand8();
 
       fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
-      REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
 
       // The minimum quant value is 4.
       for (int j = 0; j < kNumCoeffs; ++j)
@@ -145,8 +145,8 @@
       }
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
-      REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
-                                      output_block, pitch_));
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+                                          output_block, pitch_));
 
       // The minimum quant value is 4.
       for (int j = 0; j < kNumCoeffs; ++j) {
@@ -175,7 +175,7 @@
 
       fwd_txfm_ref(in, coeff, pitch_, tx_type_);
 
-      REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
 
       for (int j = 0; j < kNumCoeffs; ++j) {
         const uint32_t diff = dst[j] - src[j];

diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 146aa31..3e14c24 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc

@@ -68,7 +68,7 @@
       // Initialize a test block with input range [-255, 255].
       for (int j = 0; j < 64; ++j)
         test_input_block[j] = rnd.Rand8() - rnd.Rand8();
-      REGISTER_STATE_CHECK(
+      ASM_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_output_block, pitch_));
 
       for (int j = 0; j < 64; ++j) {
@@ -97,7 +97,7 @@
       // Initialize a test block with input range [-15, 15].
       for (int j = 0; j < 64; ++j)
         test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
-      REGISTER_STATE_CHECK(
+      ASM_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_output_block, pitch_));
 
       for (int j = 0; j < 64; ++j) {
@@ -139,7 +139,7 @@
         test_input_block[j] = src[j] - dst[j];
       }
 
-      REGISTER_STATE_CHECK(
+      ASM_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_temp_block, pitch_));
       for (int j = 0; j < 64; ++j) {
           if (test_temp_block[j] > 0) {
@@ -152,7 +152,7 @@
             test_temp_block[j] *= 4;
           }
       }
-      REGISTER_STATE_CHECK(
+      ASM_REGISTER_STATE_CHECK(
           RunInvTxfm(test_temp_block, dst, pitch_));
 
       for (int j = 0; j < 64; ++j) {
@@ -202,11 +202,11 @@
         test_input_block[j] = src[j] - dst[j];
       }
 
-      REGISTER_STATE_CHECK(
+      ASM_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_temp_block, pitch_));
-      REGISTER_STATE_CHECK(
+      ASM_REGISTER_STATE_CHECK(
           fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, tx_type_));
-      REGISTER_STATE_CHECK(
+      ASM_REGISTER_STATE_CHECK(
           RunInvTxfm(test_temp_block, dst, pitch_));
 
       for (int j = 0; j < 64; ++j) {

diff --git a/test/idct_test.cc b/test/idct_test.cc
index 1bbf80a..c7f609d 100644
--- a/test/idct_test.cc
+++ b/test/idct_test.cc

@@ -52,7 +52,7 @@
 TEST_P(IDCTTest, TestAllZeros) {
   int i;
 
-  REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
   for (i = 0; i < 256; i++)
     if ((i & 0xF) < 4 && i < 64)
@@ -65,7 +65,7 @@
   int i;
 
   input[0] = 4;
-  REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
   for (i = 0; i < 256; i++)
     if ((i & 0xF) < 4 && i < 64)
@@ -79,7 +79,7 @@
 
   for (i = 0; i < 256; i++) predict[i] = i;
   input[0] = 4;
-  REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
+  ASM_REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
 
   for (i = 0; i < 256; i++)
     if ((i & 0xF) < 4 && i < 64)
@@ -93,7 +93,7 @@
 
   for (i = 0; i < 16; i++) input[i] = i;
 
-  REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
   for (i = 0; i < 256; i++)
     if ((i & 0xF) > 3 || i > 63)

diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index cefe192..d93251d 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc

@@ -261,10 +261,10 @@
 
   virtual void Predict(MB_PREDICTION_MODE mode) {
     mbptr_->mode_info_context->mbmi.mode = mode;
-    REGISTER_STATE_CHECK(pred_fn_(mbptr_,
-                                  data_ptr_[0] - kStride,
-                                  data_ptr_[0] - 1, kStride,
-                                  data_ptr_[0], kStride));
+    ASM_REGISTER_STATE_CHECK(pred_fn_(mbptr_,
+                                      data_ptr_[0] - kStride,
+                                      data_ptr_[0] - 1, kStride,
+                                      data_ptr_[0], kStride));
   }
 
   intra_pred_y_fn_t pred_fn_;

diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 8c83034..63c7941 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc

@@ -55,7 +55,8 @@
 
     // Check results match.
     EXPECT_EQ(expected_res_dec, res_dec)
-        << "Results don't match: frame number = " << video.frame_number();
+        << "Results don't match: frame number = " << video.frame_number()
+        << ". (" << decoder->DecodeError() << ")";
 
     return !HasFailure();
   }

diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index f2171b2..fd24c75 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc

@@ -118,8 +118,8 @@
                          = (output_ref_block[j] / 1828) * 1828;
     }
 
-    REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
-    REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
+    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
 
     for (int j = 0; j < block_size; ++j) {
       const int diff = dst1[j] - dst2[j];
@@ -182,8 +182,8 @@
     memcpy(test_coef_block2, test_coef_block1,
            sizeof(*test_coef_block2) * block_size);
 
-    REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
-    REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
+    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
 
     for (int j = 0; j < block_size; ++j) {
       const int diff = dst1[j] - dst2[j];

diff --git a/test/postproc.sh b/test/postproc.sh
index c9c4e58..939a3e7 100755
--- a/test/postproc.sh
+++ b/test/postproc.sh

@@ -37,7 +37,8 @@
     return 1
   fi
 
-  eval "${decoder}" "${input_file}" "${output_file}" ${devnull}
+  eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+      ${devnull}
 
   [ -e "${output_file}" ] || return 1
 }

diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 86c2b0e..1144083 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc

@@ -80,8 +80,9 @@
   // Initialize pixels in the output to 99.
   (void)vpx_memset(dst_image, 99, output_size);
 
-  REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr, input_stride,
-                                  output_stride, block_width, flimits, 16));
+  ASM_REGISTER_STATE_CHECK(
+      GetParam()(src_image_ptr, dst_image_ptr, input_stride,
+                 output_stride, block_width, flimits, 16));
 
   static const uint8_t expected_data[block_height] = {
     4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4

diff --git a/test/register_state_check.h b/test/register_state_check.h
index 1ee149b..8d4beea 100644
--- a/test/register_state_check.h
+++ b/test/register_state_check.h

@@ -13,6 +13,20 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+// ASM_REGISTER_STATE_CHECK(asm_function)
+//   Minimally validates the environment pre & post function execution. This
+//   variant should be used with assembly functions which are not expected to
+//   fully restore the system state. See platform implementations of
+//   RegisterStateCheck for details.
+//
+// API_REGISTER_STATE_CHECK(api_function)
+//   Performs all the checks done by ASM_REGISTER_STATE_CHECK() and any
+//   additional checks to ensure the environment is in a consistent state pre &
+//   post function execution. This variant should be used with API functions.
+//   See platform implementations of RegisterStateCheckXXX for details.
+//
 
 #if defined(_WIN64)
 
@@ -35,11 +49,6 @@
 // Compares the state of xmm[6-15] at construction with their state at
 // destruction. These registers should be preserved by the callee on
 // Windows x64.
-// Usage:
-// {
-//   RegisterStateCheck reg_check;
-//   FunctionToVerify();
-// }
 class RegisterStateCheck {
  public:
   RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); }
@@ -75,9 +84,9 @@
   CONTEXT pre_context_;
 };
 
-#define REGISTER_STATE_CHECK(statement) do { \
-  libvpx_test::RegisterStateCheck reg_check; \
-  statement;                               \
+#define ASM_REGISTER_STATE_CHECK(statement) do {  \
+  libvpx_test::RegisterStateCheck reg_check;      \
+  statement;                                      \
 } while (false)
 
 }  // namespace libvpx_test
@@ -85,8 +94,6 @@
 #elif defined(CONFIG_SHARED) && defined(HAVE_NEON_ASM) && defined(CONFIG_VP9) \
       && !CONFIG_SHARED && HAVE_NEON_ASM && CONFIG_VP9
 
-#include "vpx/vpx_integer.h"
-
 extern "C" {
 // Save the d8-d15 registers into store.
 void vp9_push_neon(int64_t *store);
@@ -97,11 +104,6 @@
 // Compares the state of d8-d15 at construction with their state at
 // destruction. These registers should be preserved by the callee on
 // arm platform.
-// Usage:
-// {
-//   RegisterStateCheck reg_check;
-//   FunctionToVerify();
-// }
 class RegisterStateCheck {
  public:
   RegisterStateCheck() { initialized_ = StoreRegisters(pre_store_); }
@@ -129,9 +131,9 @@
   int64_t pre_store_[8];
 };
 
-#define REGISTER_STATE_CHECK(statement) do { \
-  libvpx_test::RegisterStateCheck reg_check; \
-  statement;                               \
+#define ASM_REGISTER_STATE_CHECK(statement) do {  \
+  libvpx_test::RegisterStateCheck reg_check;      \
+  statement;                                      \
 } while (false)
 
 }  // namespace libvpx_test
@@ -141,10 +143,54 @@
 namespace libvpx_test {
 
 class RegisterStateCheck {};
-#define REGISTER_STATE_CHECK(statement) statement
+#define ASM_REGISTER_STATE_CHECK(statement) statement
 
 }  // namespace libvpx_test
 
 #endif  // _WIN64
 
+#if ARCH_X86 || ARCH_X86_64
+#if defined(__GNUC__)
+
+namespace libvpx_test {
+
+// Checks the FPU tag word pre/post execution to ensure emms has been called.
+class RegisterStateCheckMMX {
+ public:
+  RegisterStateCheckMMX() {
+    __asm__ volatile("fstenv %0" : "=rm"(pre_fpu_env_));
+  }
+  ~RegisterStateCheckMMX() { EXPECT_TRUE(Check()); }
+
+ private:
+  // Checks the FPU tag word pre/post execution, returning false if not cleared
+  // to 0xffff.
+  bool Check() const {
+    EXPECT_EQ(0xffff, pre_fpu_env_[4])
+        << "FPU was in an inconsistent state prior to call";
+
+    uint16_t post_fpu_env[14];
+    __asm__ volatile("fstenv %0" : "=rm"(post_fpu_env));
+    EXPECT_EQ(0xffff, post_fpu_env[4])
+        << "FPU was left in an inconsistent state after call";
+    return !testing::Test::HasNonfatalFailure();
+  }
+
+  uint16_t pre_fpu_env_[14];
+};
+
+#define API_REGISTER_STATE_CHECK(statement) do {  \
+  libvpx_test::RegisterStateCheckMMX reg_check;   \
+  ASM_REGISTER_STATE_CHECK(statement);            \
+} while (false)
+
+}  // namespace libvpx_test
+
+#endif  // __GNUC__
+#endif  // ARCH_X86 || ARCH_X86_64
+
+#ifndef API_REGISTER_STATE_CHECK
+#define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK
+#endif
+
 #endif  // TEST_REGISTER_STATE_CHECK_H_

diff --git a/test/resize_util.sh b/test/resize_util.sh
index ab3dfd1..5e47271 100755
--- a/test/resize_util.sh
+++ b/test/resize_util.sh

@@ -38,7 +38,7 @@
       return 1
     fi
 
-    eval "${resizer}" "${YUV_RAW_INPUT}" \
+    eval "${VPX_TEST_PREFIX}" "${resizer}" "${YUV_RAW_INPUT}" \
         "${YUV_RAW_INPUT_WIDTH}x${YUV_RAW_INPUT_HEIGHT}" \
         "${target_dimensions}" "${output_file}" ${frames_to_resize} \
         ${devnull}

diff --git a/test/sad_test.cc b/test/sad_test.cc
index 89d8c41..7e5bcef 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc

@@ -149,9 +149,9 @@
     const uint8_t* refs[] = {GetReference(0), GetReference(1),
                              GetReference(2), GetReference(3)};
 
-    REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
-                                      refs, reference_stride_,
-                                      results));
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
+                                          refs, reference_stride_,
+                                          results));
   }
 
   void CheckSADs() {
@@ -178,9 +178,9 @@
     unsigned int ret;
     const uint8_t* const reference = GetReference(block_idx);
 
-    REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
-                                            reference, reference_stride_,
-                                            max_sad));
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_,
+                                                max_sad));
     return ret;
   }
 
@@ -210,8 +210,8 @@
     unsigned int ret;
     const uint8_t* const reference = GetReference(block_idx);
 
-    REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
-                                            reference, reference_stride_));
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_));
     return ret;
   }
 

diff --git a/test/simple_decoder.sh b/test/simple_decoder.sh
index 0be48e6..7eeaf71 100755
--- a/test/simple_decoder.sh
+++ b/test/simple_decoder.sh

@@ -37,7 +37,8 @@
     return 1
   fi
 
-  eval "${decoder}" "${input_file}" "${output_file}" ${devnull}
+  eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+      ${devnull}
 
   [ -e "${output_file}" ] || return 1
 }

diff --git a/test/simple_encoder.sh b/test/simple_encoder.sh
index a0b0e13..c4a6280 100755
--- a/test/simple_encoder.sh
+++ b/test/simple_encoder.sh

@@ -34,7 +34,7 @@
     return 1
   fi
 
-  eval "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 \
       ${devnull}
 

diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
index 0c600f4..ac7aa99 100644
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc

@@ -143,8 +143,9 @@
 
   uint8_t *src = const_cast<uint8_t*>(test_data);
 
-  REGISTER_STATE_CHECK(sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
-                                       2, 2, dst_, kDstStride));
+  ASM_REGISTER_STATE_CHECK(
+      sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
+                      2, 2, dst_, kDstStride));
 
   for (int i = 0; i < height_; ++i)
     for (int j = 0; j < width_; ++j)
@@ -169,7 +170,7 @@
                                 xoffset, yoffset, dst_c_, kDstStride);
 
       // Run test.
-      REGISTER_STATE_CHECK(
+      ASM_REGISTER_STATE_CHECK(
           sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
                           xoffset, yoffset, dst_, kDstStride));
 

diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 63e999d..2db3dd7 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc

@@ -82,7 +82,7 @@
       predictor += kDiffPredStride;
     }
 
-    REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));
+    ASM_REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));
 
     base_src = *be.base_src;
     src_diff = be.src_diff;

diff --git a/test/test-data.sha1 b/test/test-data.sha1
index f9c09c6..8940027 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1

@@ -653,6 +653,8 @@
 e3ab35d4316c5e81325c50f5236ceca4bc0d35df  vp90-2-15-segkey.webm.md5
 9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d  vp90-2-15-segkey_adpq.webm
 8f46ba5f785d0c2170591a153e0d0d146a7c8090  vp90-2-15-segkey_adpq.webm.md5
+698a6910a97486b833073ef0c0b18d75dce57ee8  vp90-2-16-intra-only.webm
+5661b0168752969f055eec37b05fa9fa947dc7eb  vp90-2-16-intra-only.webm.md5
 0321d507ce62dedc8a51b4e9011f7a19aed9c3dc  vp91-2-04-yuv444.webm
 367e423dd41fdb49aa028574a2cfec5c2f325c5c  vp91-2-04-yuv444.webm.md5
 76024eb753cdac6a5e5703aaea189d35c3c30ac7  invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf

diff --git a/test/test.mk b/test/test.mk
index 85212d9..ef81ab1 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -771,6 +771,8 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
 

diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 41c9e26..4955887 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc

@@ -180,6 +180,7 @@
   "vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
   "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
   "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
+  "vp90-2-16-intra-only.webm",
   "vp91-2-04-yuv444.webm",
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);

diff --git a/test/tools_common.sh b/test/tools_common.sh
index 7f32905..3e69c36 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh

@@ -16,6 +16,7 @@
 
 set -e
 devnull='> /dev/null 2>&1'
+VPX_TEST_PREFIX=""
 
 elog() {
   echo "$@" 1>&2
@@ -204,9 +205,12 @@
   local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
 
   if [ -z "${pipe_input}" ]; then
-    eval "${decoder}" "$input" --summary --noblit "$@" ${devnull}
+    eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" --summary --noblit "$@" \
+        ${devnull}
   else
-    cat "${input}" | eval "${decoder}" - --summary --noblit "$@" ${devnull}
+    cat "${input}" \
+        | eval "${VPX_TEST_PREFIX}" "${decoder}" - --summary --noblit "$@" \
+            ${devnull}
   fi
 }
 
@@ -252,16 +256,14 @@
   fi
 
   if [ -z "${pipe_input}" ]; then
-    eval "${encoder}" --codec=${codec} --width=${width} --height=${height} \
-        --limit=${frames} ${use_ivf} ${extra_flags} --output="${output}" \
-        "${input}" \
-        ${devnull}
+    eval "${VPX_TEST_PREFIX}" "${encoder}" --codec=${codec} --width=${width} \
+        --height=${height} --limit=${frames} ${use_ivf} ${extra_flags} \
+        --output="${output}" "${input}" ${devnull}
   else
     cat "${input}" \
-        | eval "${encoder}" --codec=${codec} --width=${width} \
-              --height=${height} --limit=${frames} ${use_ivf} ${extra_flags} \
-              --output="${output}" - \
-              ${devnull}
+        | eval "${VPX_TEST_PREFIX}" "${encoder}" --codec=${codec} \
+            --width=${width} --height=${height} --limit=${frames} ${use_ivf} \
+            ${extra_flags} --output="${output}" - ${devnull}
   fi
 
   if [ ! -e "${output}" ]; then
@@ -351,6 +353,9 @@
     --help: Display this message and exit.
     --test-data-path <path to libvpx test data directory>
     --show-program-output: Shows output from all programs being tested.
+    --prefix: Allows for a user specified prefix to be inserted before all test
+              programs. Grants the ability, for example, to run test programs
+              within valgrind.
     --verbose: Verbose output.
 
     When the --bin-path option is not specified the script attempts to use
@@ -400,6 +405,10 @@
       LIBVPX_TEST_DATA_PATH="$2"
       shift
       ;;
+    --prefix)
+      VPX_TEST_PREFIX="$2"
+      shift
+      ;;
     --verbose)
       VPX_TEST_VERBOSE_OUTPUT=yes
       ;;
@@ -466,6 +475,7 @@
   VPX_TEST_EXE_SUFFIX=${VPX_TEST_EXE_SUFFIX}
   VPX_TEST_FILTER=${VPX_TEST_FILTER}
   VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}
+  VPX_TEST_PREFIX=${VPX_TEST_PREFIX}
   VPX_TEST_RAND=${VPX_TEST_RAND}
   VPX_TEST_RUN_DISABLED_TESTS=${VPX_TEST_RUN_DISABLED_TESTS}
   VPX_TEST_SHOW_PROGRAM_OUTPUT=${VPX_TEST_SHOW_PROGRAM_OUTPUT}

diff --git a/test/twopass_encoder.sh b/test/twopass_encoder.sh
index 95d49d6..1189e51 100755
--- a/test/twopass_encoder.sh
+++ b/test/twopass_encoder.sh

@@ -34,7 +34,7 @@
     return 1
   fi
 
-  eval "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
       ${devnull}
 

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 9985695..5469770 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc

@@ -124,7 +124,8 @@
       memset(ref_, j, block_size_);
       unsigned int sse;
       unsigned int var;
-      REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
+      ASM_REGISTER_STATE_CHECK(
+          var = variance_(src_, width_, ref_, width_, &sse));
       EXPECT_EQ(0u, var) << "src values: " << i << "ref values: " << j;
     }
   }
@@ -139,7 +140,8 @@
     }
     unsigned int sse1, sse2;
     unsigned int var1;
-    REGISTER_STATE_CHECK(var1 = variance_(src_, width_, ref_, width_, &sse1));
+    ASM_REGISTER_STATE_CHECK(
+        var1 = variance_(src_, width_, ref_, width_, &sse1));
     const unsigned int var2 = variance_ref(src_, ref_, log2width_,
                                            log2height_, &sse2);
     EXPECT_EQ(sse1, sse2);
@@ -155,7 +157,7 @@
   memset(ref_ + half, 0, half);
   unsigned int sse;
   unsigned int var;
-  REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
+  ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
   const unsigned int expected = block_size_ * 255 * 255 / 4;
   EXPECT_EQ(expected, var);
 }
@@ -246,8 +248,8 @@
       }
       unsigned int sse1, sse2;
       unsigned int var1;
-      REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
-                                                   src_, width_, &sse1));
+      ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
+                                                       src_, width_, &sse1));
       const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_,
                                                     log2height_, x, y, &sse2);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
@@ -269,8 +271,9 @@
       }
       unsigned int sse1, sse2;
       unsigned int var1;
-      REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
-                                                   src_, width_, &sse1, sec_));
+      ASM_REGISTER_STATE_CHECK(
+          var1 = subpel_variance_(ref_, width_ + 1, x, y,
+                                  src_, width_, &sse1, sec_));
       const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_,
                                                         log2width_, log2height_,
                                                         x, y, &sse2);

diff --git a/test/vp8cx_set_ref.sh b/test/vp8cx_set_ref.sh
index ee10056..5d760bc 100755
--- a/test/vp8cx_set_ref.sh
+++ b/test/vp8cx_set_ref.sh

@@ -39,9 +39,9 @@
     return 1
   fi
 
-  eval "${encoder}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
-      "${YUV_RAW_INPUT}" "${output_file}" "${ref_frame_num}" \
-      ${devnull}
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT_WIDTH}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      "${ref_frame_num}" ${devnull}
 
   [ -e "${output_file}" ] || return 1
 }

diff --git a/test/vp9_spatial_svc_encoder.sh b/test/vp9_spatial_svc_encoder.sh
index 8c9d130..a5728f6 100755
--- a/test/vp9_spatial_svc_encoder.sh
+++ b/test/vp9_spatial_svc_encoder.sh

@@ -39,10 +39,9 @@
     return 1
   fi
 
-  eval "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" -h "${YUV_RAW_INPUT_HEIGHT}" \
-      -k "${max_kf}" -f "${frames_to_encode}" "$@" "${YUV_RAW_INPUT}" \
-      "${output_file}" \
-      ${devnull}
+  eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \
+      -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \
+      "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull}
 
   [ -e "${output_file}" ] || return 1
 }

diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index 72719a6..d7fc4ee 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc

@@ -18,7 +18,7 @@
 #if CONFIG_WEBM_IO
 #include "test/webm_video_source.h"
 #endif
-#include "vp9/decoder/vp9_thread.h"
+#include "vp9/common/vp9_thread.h"
 
 namespace {
 
@@ -35,6 +35,15 @@
     vp9_get_worker_interface()->end(&worker_);
   }
 
+  void Run(VP9Worker* worker) {
+    const bool synchronous = GetParam();
+    if (synchronous) {
+      vp9_get_worker_interface()->execute(worker);
+    } else {
+      vp9_get_worker_interface()->launch(worker);
+    }
+  }
+
   VP9Worker worker_;
 };
 
@@ -57,12 +66,7 @@
     worker_.data1 = &hook_data;
     worker_.data2 = &return_value;
 
-    const bool synchronous = GetParam();
-    if (synchronous) {
-      vp9_get_worker_interface()->execute(&worker_);
-    } else {
-      vp9_get_worker_interface()->launch(&worker_);
-    }
+    Run(&worker_);
     EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
     EXPECT_FALSE(worker_.had_error);
     EXPECT_EQ(5, hook_data);
@@ -81,12 +85,7 @@
   worker_.data1 = &hook_data;
   worker_.data2 = &return_value;
 
-  const bool synchronous = GetParam();
-  if (synchronous) {
-    vp9_get_worker_interface()->execute(&worker_);
-  } else {
-    vp9_get_worker_interface()->launch(&worker_);
-  }
+  Run(&worker_);
   EXPECT_FALSE(vp9_get_worker_interface()->sync(&worker_));
   EXPECT_EQ(1, worker_.had_error);
 
@@ -99,6 +98,39 @@
   EXPECT_FALSE(worker_.had_error);
 }
 
+TEST_P(VP9WorkerThreadTest, EndWithoutSync) {
+  // Create a large number of threads to increase the chances of detecting a
+  // race. Doing more work in the hook is no guarantee as any race would occur
+  // post hook execution in the main thread loop driver.
+  static const int kNumWorkers = 64;
+  VP9Worker workers[kNumWorkers];
+  int hook_data[kNumWorkers];
+  int return_value[kNumWorkers];
+
+  for (int n = 0; n < kNumWorkers; ++n) {
+    vp9_get_worker_interface()->init(&workers[n]);
+    return_value[n] = 1;  // return successfully from the hook
+    workers[n].hook = ThreadHook;
+    workers[n].data1 = &hook_data[n];
+    workers[n].data2 = &return_value[n];
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int n = 0; n < kNumWorkers; ++n) {
+      EXPECT_NE(vp9_get_worker_interface()->reset(&workers[n]), 0);
+      hook_data[n] = 0;
+    }
+
+    for (int n = 0; n < kNumWorkers; ++n) {
+      Run(&workers[n]);
+    }
+
+    for (int n = kNumWorkers - 1; n >= 0; --n) {
+      vp9_get_worker_interface()->end(&workers[n]);
+    }
+  }
+}
+
 TEST(VP9WorkerThreadTest, TestInterfaceAPI) {
   EXPECT_EQ(0, vp9_set_worker_interface(NULL));
   EXPECT_TRUE(vp9_get_worker_interface() != NULL);

diff --git a/test/vpx_temporal_svc_encoder.sh b/test/vpx_temporal_svc_encoder.sh
index b2e968f..fcc8cb4 100755
--- a/test/vpx_temporal_svc_encoder.sh
+++ b/test/vpx_temporal_svc_encoder.sh

@@ -21,6 +21,10 @@
     echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
     return 1
   fi
+  if [ "$(vpx_config_option_enabled CONFIG_TEMPORAL_DENOISING)" != "yes" ]; then
+    elog "Warning: Temporal denoising is disabled! Spatial denoising will be " \
+      "used instead, which is probably not what you want for this test."
+  fi
 }
 
 # Runs vpx_temporal_svc_encoder using the codec specified by $1 and output file
@@ -44,8 +48,8 @@
     return 1
   fi
 
-  eval "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" "${codec}" \
-      "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \
+      "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
       "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \
       "$@" \
       ${devnull}

diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 29fea61..1d763b6 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c

@@ -386,6 +386,7 @@
 
 decode_exit:
     pbi->common.error.setjmp = 0;
+    vp8_clear_system_state();
     return retcode;
 }
 int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 09854a5..373dbeb 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c

@@ -4886,6 +4886,7 @@
     if (setjmp(cpi->common.error.jmp))
     {
         cpi->common.error.setjmp = 0;
+        vp8_clear_system_state();
         return VPX_CODEC_CORRUPT_FRAME;
     }
 

diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c
index b84795c..3a4cf7e 100644
--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c

@@ -30,9 +30,9 @@
                                           _mm_srli_si128(hg_fe_dc_ba, 8));
   const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
                                          _mm_srli_si128(hgfe_dcba, 4));
-  unsigned int sum_diff = _mm_cvtsi128_si32(hgfedcba);
+  unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba));
 
-  return abs(sum_diff);
+  return sum_diff;
 }
 
 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,

diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 56394fb..b695ddc 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c

@@ -409,6 +409,7 @@
                 if (setjmp(pbi->common.error.jmp))
                 {
                     pbi->common.error.setjmp = 0;
+                    vp8_clear_system_state();
                     /* same return value as used in vp8dx_receive_compressed_data */
                     return -1;
                 }

diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index e1753a1..afe831a 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h

@@ -257,10 +257,14 @@
   xd->mi_stride = cm->mi_stride;
 }
 
+static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
+  return cm->frame_type == KEY_FRAME || cm->intra_only;
+}
+
 static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm,
                                                   int ctx) {
-  return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx]
-                                     : cm->fc.partition_prob[ctx];
+  return frame_is_intra_only(cm) ? vp9_kf_partition_probs[ctx]
+                                 : cm->fc.partition_prob[ctx];
 }
 
 static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
@@ -299,10 +303,6 @@
                   cm->prev_mip + cm->mi_stride + 1 : NULL;
 }
 
-static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
-  return cm->frame_type == KEY_FRAME || cm->intra_only;
-}
-
 static INLINE void update_partition_context(MACROBLOCKD *xd,
                                             int mi_row, int mi_col,
                                             BLOCK_SIZE subsize,

diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index d4fcb62..86ae648 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c

@@ -151,7 +151,7 @@
   return clamped_mv;
 }
 
-static MV average_split_mvs(const struct macroblockd_plane *pd, int plane,
+static MV average_split_mvs(const struct macroblockd_plane *pd,
                             const MODE_INFO *mi, int ref, int block) {
   const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
   MV res = {0, 0};
@@ -190,7 +190,7 @@
     struct buf_2d *const dst_buf = &pd->dst;
     uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
     const MV mv = mi->mbmi.sb_type < BLOCK_8X8
-               ? average_split_mvs(pd, plane, mi, ref, block)
+               ? average_split_mvs(pd, mi, ref, block)
                : mi->mbmi.mv[ref].as_mv;
 
     // TODO(jkoleszar): This clamping is done in the incorrect place for the
@@ -288,7 +288,7 @@
     struct buf_2d *const dst_buf = &pd->dst;
     uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
     const MV mv = mi->mbmi.sb_type < BLOCK_8X8
-               ? average_split_mvs(pd, plane, mi, ref, block)
+               ? average_split_mvs(pd, mi, ref, block)
                : mi->mbmi.mv[ref].as_mv;
 
 
@@ -389,7 +389,7 @@
       }
 
       // Skip border extension if block is inside the frame.
-      if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width ||
+      if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
           y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
         uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
         // Extend the border.

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index f52dccb..99f5a89 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -717,6 +717,9 @@
 add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
 
+add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+
 add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
 
@@ -783,12 +786,10 @@
 $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
 
 add_proto qw/int vp9_refining_search_sad/, "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_refining_search_sad sse3/;
-$vp9_refining_search_sad_sse3=vp9_refining_search_sadx4;
+specialize qw/vp9_refining_search_sad/;
 
 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad sse3/;
-$vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4;
+specialize qw/vp9_diamond_search_sad/;
 
 add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
 specialize qw/vp9_full_range_search/;

diff --git a/vp9/decoder/vp9_thread.c b/vp9/common/vp9_thread.c
similarity index 95%
rename from vp9/decoder/vp9_thread.c
rename to vp9/common/vp9_thread.c
index 348bdf6..1c6aec0 100644
--- a/vp9/decoder/vp9_thread.c
+++ b/vp9/common/vp9_thread.c

@@ -11,7 +11,7 @@
 //
 // Original source:
 //  http://git.chromium.org/webm/libwebp.git
-//  100644 blob 08ad4e1fecba302bf1247645e84a7d2779956bc3  src/utils/thread.c
+//  100644 blob 264210ba2807e4da47eb5d18c04cf869d89b9784  src/utils/thread.c
 
 #include <assert.h>
 #include <string.h>   // for memset()
@@ -144,18 +144,19 @@
 }
 
 static void end(VP9Worker *const worker) {
-  if (worker->status_ >= OK) {
 #if CONFIG_MULTITHREAD
+  if (worker->impl_ != NULL) {
     change_state(worker, NOT_OK);
     pthread_join(worker->impl_->thread_, NULL);
     pthread_mutex_destroy(&worker->impl_->mutex_);
     pthread_cond_destroy(&worker->impl_->condition_);
-#else
-    worker->status_ = NOT_OK;
-#endif
+    vpx_free(worker->impl_);
+    worker->impl_ = NULL;
   }
-  vpx_free(worker->impl_);
-  worker->impl_ = NULL;
+#else
+  worker->status_ = NOT_OK;
+  assert(worker->impl_ == NULL);
+#endif
   assert(worker->status_ == NOT_OK);
 }
 

diff --git a/vp9/decoder/vp9_thread.h b/vp9/common/vp9_thread.h
similarity index 100%
rename from vp9/decoder/vp9_thread.h
rename to vp9/common/vp9_thread.h


diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 8b96abb..55d5b4f 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c

@@ -28,6 +28,7 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_thread.h"
 #include "vp9/common/vp9_tile_common.h"
 
 #include "vp9/decoder/vp9_decodeframe.h"
@@ -38,7 +39,6 @@
 #include "vp9/decoder/vp9_dthread.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 #include "vp9/decoder/vp9_reader.h"
-#include "vp9/decoder/vp9_thread.h"
 
 #define MAX_VP9_HEADER_SIZE 80
 
@@ -605,8 +605,8 @@
                              : literal_to_filter[vp9_rb_read_literal(rb, 2)];
 }
 
-static void read_frame_size(struct vp9_read_bit_buffer *rb,
-                            int *width, int *height) {
+void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
+                         int *width, int *height) {
   const int w = vp9_rb_read_literal(rb, 16) + 1;
   const int h = vp9_rb_read_literal(rb, 16) + 1;
   *width = w;
@@ -617,7 +617,7 @@
   cm->display_width = cm->width;
   cm->display_height = cm->height;
   if (vp9_rb_read_bit(rb))
-    read_frame_size(rb, &cm->display_width, &cm->display_height);
+    vp9_read_frame_size(rb, &cm->display_width, &cm->display_height);
 }
 
 static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
@@ -649,7 +649,7 @@
 
 static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
   int width, height;
-  read_frame_size(rb, &width, &height);
+  vp9_read_frame_size(rb, &width, &height);
   apply_frame_size(cm, width, height);
   setup_display_size(cm, rb);
 }
@@ -669,7 +669,7 @@
   }
 
   if (!found)
-    read_frame_size(rb, &width, &height);
+    vp9_read_frame_size(rb, &width, &height);
 
   // Check that each of the frames that this frame references has valid
   // dimensions.
@@ -1053,20 +1053,17 @@
   return bit_reader_end;
 }
 
-static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
-  if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 ||
-      vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 ||
-      vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_2) {
-    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
-                       "Invalid frame sync code");
-  }
-}
-
 static void error_handler(void *data) {
   VP9_COMMON *const cm = (VP9_COMMON *)data;
   vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
 
+int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) {
+  return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
+         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
+         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
+}
+
 static BITSTREAM_PROFILE read_profile(struct vp9_read_bit_buffer *rb) {
   int profile = vp9_rb_read_bit(rb);
   profile |= vp9_rb_read_bit(rb) << 1;
@@ -1112,7 +1109,9 @@
   cm->error_resilient_mode = vp9_rb_read_bit(rb);
 
   if (cm->frame_type == KEY_FRAME) {
-    check_sync_code(cm, rb);
+    if (!vp9_read_sync_code(rb))
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid frame sync code");
     if (cm->profile > PROFILE_1)
       cm->bit_depth = vp9_rb_read_bit(rb) ? BITS_12 : BITS_10;
     cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
@@ -1150,9 +1149,18 @@
         0 : vp9_rb_read_literal(rb, 2);
 
     if (cm->intra_only) {
-      check_sync_code(cm, rb);
+      if (!vp9_read_sync_code(rb))
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Invalid frame sync code");
 
       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
+
+      // NOTE: The intra-only frame header does not include the specification of
+      // either the color format or color sub-sampling. VP9 specifies that the
+      // default color space should be YUV 4:2:0 in this case (normative).
+      cm->color_space = BT_601;
+      cm->subsampling_y = cm->subsampling_x = 1;
+
       setup_frame_size(cm, rb);
     } else {
       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);

diff --git a/vp9/decoder/vp9_decodeframe.h b/vp9/decoder/vp9_decodeframe.h
index fb15645..e5d9d62 100644
--- a/vp9/decoder/vp9_decodeframe.h
+++ b/vp9/decoder/vp9_decodeframe.h

@@ -18,6 +18,7 @@
 
 struct VP9Common;
 struct VP9Decoder;
+struct vp9_read_bit_buffer;
 
 void vp9_init_dequantizer(struct VP9Common *cm);
 
@@ -25,6 +26,10 @@
                       const uint8_t *data, const uint8_t *data_end,
                       const uint8_t **p_data_end);
 
+int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb);
+void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
+                         int *width, int *height);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index d154e9d..e32637b 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c

@@ -85,7 +85,6 @@
   VP9_COMMON *const cm = &pbi->common;
   int i;
 
-  vp9_remove_common(cm);
   vp9_get_worker_interface()->end(&pbi->lf_worker);
   vpx_free(pbi->lf_worker.data1);
   vpx_free(pbi->tile_data);
@@ -103,6 +102,7 @@
     vp9_loop_filter_dealloc(&pbi->lf_row_sync, sb_rows);
   }
 
+  vp9_remove_common(cm);
   vpx_free(pbi);
 }
 
@@ -246,6 +246,7 @@
 
   if (setjmp(cm->error.jmp)) {
     cm->error.setjmp = 0;
+    vp9_clear_system_state();
 
     // We do not know if the missing frame(s) was supposed to update
     // any of the reference buffers, but we act conservative and
@@ -267,7 +268,10 @@
 
   vp9_decode_frame(pbi, source, source + size, psource);
 
-  swap_frame_buffers(pbi);
+  if (!cm->show_existing_frame)
+    swap_frame_buffers(pbi);
+  else
+    cm->frame_to_show = get_frame_new_buffer(cm);
 
   vp9_clear_system_state();
 
@@ -291,6 +295,7 @@
 
 int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
                       vp9_ppflags_t *flags) {
+  VP9_COMMON *const cm = &pbi->common;
   int ret = -1;
 #if !CONFIG_VP9_POSTPROC
   (void)*flags;
@@ -300,15 +305,20 @@
     return ret;
 
   /* no raw frame to show!!! */
-  if (pbi->common.show_frame == 0)
+  if (!cm->show_frame)
     return ret;
 
   pbi->ready_for_new_data = 1;
 
 #if CONFIG_VP9_POSTPROC
-  ret = vp9_post_proc_frame(&pbi->common, sd, flags);
+  if (!cm->show_existing_frame) {
+    ret = vp9_post_proc_frame(cm, sd, flags);
+  } else {
+    *sd = *cm->frame_to_show;
+    ret = 0;
+  }
 #else
-  *sd = *pbi->common.frame_to_show;
+  *sd = *cm->frame_to_show;
   ret = 0;
 #endif /*!CONFIG_POSTPROC*/
   vp9_clear_system_state();

diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index ab4f9a2..8e16e1c 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h

@@ -18,10 +18,9 @@
 
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
+#include "vp9/common/vp9_thread.h"
 
-#include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_thread.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index a727e2a..423bd88 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h

@@ -12,8 +12,8 @@
 #define VP9_DECODER_VP9_DTHREAD_H_
 
 #include "./vpx_config.h"
+#include "vp9/common/vp9_thread.h"
 #include "vp9/decoder/vp9_reader.h"
-#include "vp9/decoder/vp9_thread.h"
 
 struct VP9Common;
 struct VP9Decoder;

diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index f6393e0..dbf8cd7 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c

@@ -190,12 +190,19 @@
   MACROBLOCKD *filter_mbd = &mb->e_mbd;
   MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi;
 
+  MB_MODE_INFO saved_mbmi;
+  int i, j;
+  struct buf_2d saved_dst[MAX_MB_PLANE];
+  struct buf_2d saved_pre[MAX_MB_PLANE][2];  // 2 pre buffers
+
   // We will restore these after motion compensation.
-  MB_MODE_INFO saved_mbmi = *mbmi;
-  struct buf_2d saved_dst = filter_mbd->plane[0].dst;
-  struct buf_2d saved_pre[2];
-  saved_pre[0] = filter_mbd->plane[0].pre[0];
-  saved_pre[1] = filter_mbd->plane[0].pre[1];
+  saved_mbmi = *mbmi;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (j = 0; j < 2; ++j) {
+      saved_pre[i][j] = filter_mbd->plane[i].pre[j];
+    }
+    saved_dst[i] = filter_mbd->plane[i].dst;
+  }
 
   mv_col = denoiser->best_sse_mv.as_mv.col;
   mv_row = denoiser->best_sse_mv.as_mv.row;
@@ -224,67 +231,52 @@
 
   // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
   // struct.
-  filter_mbd->plane[0].pre[0].buf =
-      block_start(denoiser->running_avg_y[frame].y_buffer,
-                  denoiser->running_avg_y[frame].y_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[0].pre[0].stride = denoiser->running_avg_y[frame].y_stride;
-
-  filter_mbd->plane[1].pre[0].buf =
-      block_start(denoiser->running_avg_y[frame].u_buffer,
-                  denoiser->running_avg_y[frame].uv_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[1].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
-
-  filter_mbd->plane[2].pre[0].buf =
-      block_start(denoiser->running_avg_y[frame].v_buffer,
-                  denoiser->running_avg_y[frame].uv_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
-
-  filter_mbd->plane[0].pre[1].buf =
-      block_start(denoiser->running_avg_y[frame].y_buffer,
-                  denoiser->running_avg_y[frame].y_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[0].pre[1].stride = denoiser->running_avg_y[frame].y_stride;
-
-  filter_mbd->plane[1].pre[1].buf =
-      block_start(denoiser->running_avg_y[frame].u_buffer,
-                  denoiser->running_avg_y[frame].uv_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[1].pre[1].stride = denoiser->running_avg_y[frame].uv_stride;
-
-  filter_mbd->plane[2].pre[1].buf =
-      block_start(denoiser->running_avg_y[frame].v_buffer,
-                  denoiser->running_avg_y[frame].uv_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[2].pre[1].stride = denoiser->running_avg_y[frame].uv_stride;
-
+  for (j = 0; j < 2; ++j) {
+    filter_mbd->plane[0].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].y_buffer,
+                    denoiser->running_avg_y[frame].y_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[0].pre[j].stride =
+        denoiser->running_avg_y[frame].y_stride;
+    filter_mbd->plane[1].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].u_buffer,
+                    denoiser->running_avg_y[frame].uv_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[1].pre[j].stride =
+        denoiser->running_avg_y[frame].uv_stride;
+    filter_mbd->plane[2].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].v_buffer,
+                    denoiser->running_avg_y[frame].uv_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[2].pre[j].stride =
+        denoiser->running_avg_y[frame].uv_stride;
+  }
   filter_mbd->plane[0].dst.buf =
       block_start(denoiser->mc_running_avg_y.y_buffer,
                   denoiser->mc_running_avg_y.y_stride,
                   mi_row, mi_col);
   filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
-
   filter_mbd->plane[1].dst.buf =
       block_start(denoiser->mc_running_avg_y.u_buffer,
                   denoiser->mc_running_avg_y.uv_stride,
                   mi_row, mi_col);
-  filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.y_stride;
-
+  filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride;
   filter_mbd->plane[2].dst.buf =
       block_start(denoiser->mc_running_avg_y.v_buffer,
                   denoiser->mc_running_avg_y.uv_stride,
                   mi_row, mi_col);
-  filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.y_stride;
+  filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
 
   vp9_build_inter_predictors_sby(filter_mbd, mv_row, mv_col, bs);
 
   // Restore everything to its original state
-  filter_mbd->plane[0].pre[0] = saved_pre[0];
-  filter_mbd->plane[0].pre[1] = saved_pre[1];
-  filter_mbd->plane[0].dst = saved_dst;
   *mbmi = saved_mbmi;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (j = 0; j < 2; ++j) {
+      filter_mbd->plane[i].pre[j] = saved_pre[i][j];
+    }
+    filter_mbd->plane[i].dst = saved_dst[i];
+  }
 
   mv_row = denoiser->best_sse_mv.as_mv.row;
   mv_col = denoiser->best_sse_mv.as_mv.col;

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index eb9624d..cd0191e 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c

@@ -320,10 +320,10 @@
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, p->zbin_extra, eob, scan_order->scan,
-                           scan_order->iscan);
+      vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                            scan_order->iscan);
       break;
     case TX_16X16:
       vp9_fdct16x16(src_diff, coeff, diff_stride);

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 1c514ba..beb2117 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c

@@ -862,9 +862,7 @@
 
 #if CONFIG_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    yuv_denoised_file = fopen("denoised.yuv", "ab");
-  }
+  yuv_denoised_file = fopen("denoised.yuv", "ab");
 #endif
 #endif
 #ifdef OUTPUT_YUV_SRC
@@ -1122,9 +1120,7 @@
 
 #if CONFIG_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    fclose(yuv_denoised_file);
-  }
+  fclose(yuv_denoised_file);
 #endif
 #endif
 #ifdef OUTPUT_YUV_SRC

diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index c66e003..9f8b37f 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h

@@ -338,7 +338,6 @@
   CYCLIC_REFRESH *cyclic_refresh;
 
   fractional_mv_step_fp *find_fractional_mv_step;
-  fractional_mv_step_comp_fp *find_fractional_mv_step_comp;
   vp9_full_search_fn_t full_search_sad;
   vp9_refining_search_fn_t refining_search_sad;
   vp9_diamond_search_fn_t diamond_search_sad;

diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 9eb2fbc..6e04e2a 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c

@@ -56,7 +56,7 @@
     cpi->find_fractional_mv_step(
         x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
         &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, NULL, NULL, &distortion,
-        &sse);
+        &sse, NULL, 0, 0);
   }
 
   xd->mi[0]->mbmi.mode = NEWMV;

diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index c0edf45..4a3f895 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c

@@ -172,15 +172,15 @@
   return &buf[(r >> 3) * stride + (c >> 3)];
 }
 
-/* returns subpixel variance error function */
-#define DIST(r, c) \
-    vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
-             src_stride, &sse)
-
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
   if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
-    thismse = (DIST(r, c));                                            \
+    if (second_pred == NULL)                                           \
+      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+                             src_stride, &sse);                        \
+    else                                                               \
+      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+                              z, src_stride, &sse, second_pred);       \
     if ((v = MVC(r, c) + thismse) < besterr) {                         \
       besterr = v;                                                     \
       br = r;                                                          \
@@ -266,105 +266,9 @@
                                  int iters_per_step,
                                  int *mvjcost, int *mvcost[2],
                                  int *distortion,
-                                 unsigned int *sse1) {
-  const uint8_t *const z = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int whichdir;
-  int thismse;
-  unsigned int halfiters = iters_per_step;
-  unsigned int quarteriters = iters_per_step;
-  unsigned int eighthiters = iters_per_step;
-
-  const int y_stride = xd->plane[0].pre[0].stride;
-  const int offset = bestmv->row * y_stride + bestmv->col;
-  const uint8_t *const y = xd->plane[0].pre[0].buf;
-
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
-  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
-  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
-  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
-
-  int tr = br;
-  int tc = bc;
-
-  // central mv
-  bestmv->row *= 8;
-  bestmv->col *= 8;
-
-  // calculate central point error
-  besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
-  // 1/2 pel
-  FIRST_LEVEL_CHECKS;
-  if (halfiters > 1) {
-    SECOND_LEVEL_CHECKS;
-  }
-  tr = br;
-  tc = bc;
-
-  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
-  if (forced_stop != 2) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (quarteriters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-    tr = br;
-    tc = bc;
-  }
-
-  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (eighthiters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-    tr = br;
-    tc = bc;
-  }
-  // These lines insure static analysis doesn't warn that
-  // tr and tc aren't used after the above point.
-  (void) tr;
-  (void) tc;
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
-    return INT_MAX;
-
-  return besterr;
-}
-
-#undef DIST
-/* returns subpixel variance error function */
-#define DIST(r, c) \
-    vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
-              z, src_stride, &sse, second_pred)
-
-int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x,
-                                      MV *bestmv, const MV *ref_mv,
-                                      int allow_hp,
-                                      int error_per_bit,
-                                      const vp9_variance_fn_ptr_t *vfp,
-                                      int forced_stop,
-                                      int iters_per_step,
-                                      int *mvjcost, int *mvcost[2],
-                                      int *distortion,
-                                      unsigned int *sse1,
-                                      const uint8_t *second_pred,
-                                      int w, int h) {
+                                 unsigned int *sse1,
+                                 const uint8_t *second_pred,
+                                 int w, int h) {
   const uint8_t *const z = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -376,7 +280,6 @@
   const unsigned int quarteriters = iters_per_step;
   const unsigned int eighthiters = iters_per_step;
 
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
   const int y_stride = xd->plane[0].pre[0].stride;
   const int offset = bestmv->row * y_stride + bestmv->col;
   const uint8_t *const y = xd->plane[0].pre[0].buf;
@@ -401,8 +304,13 @@
   // calculate central point error
   // TODO(yunqingwang): central pointer error was already calculated in full-
   // pixel search, and can be passed in this function.
-  vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-  besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  if (second_pred != NULL) {
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  } else {
+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
+  }
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
 
@@ -456,7 +364,6 @@
 
 #undef MVC
 #undef PRE
-#undef DIST
 #undef CHECK_BETTER
 
 static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
@@ -944,94 +851,6 @@
 
 int vp9_diamond_search_sad_c(const MACROBLOCK *x,
                              const search_site_config *cfg,
-                             MV *ref_mv, MV *best_mv,
-                             int search_param, int sad_per_bit, int *num00,
-                             const vp9_variance_fn_ptr_t *fn_ptr,
-                             const MV *center_mv) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  // search_param determines the length of the initial step and hence the number
-  // of iterations
-  // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
-  // (MAX_FIRST_STEP/4) pel... etc.
-  const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
-  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
-  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-  const uint8_t *best_address, *in_what_ref;
-  int best_sad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
-  int i, j, step;
-
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  in_what_ref = get_buf_from_mv(in_what, ref_mv);
-  best_address = in_what_ref;
-  *num00 = 0;
-  *best_mv = *ref_mv;
-
-  // Check the starting position
-  best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         best_address, in_what->stride) +
-      mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
-
-  i = 1;
-
-  for (step = 0; step < tot_steps; step++) {
-    for (j = 0; j < cfg->searches_per_step; j++) {
-      const MV mv = {best_mv->row + ss[i].mv.row,
-                     best_mv->col + ss[i].mv.col};
-      if (is_mv_in(x, &mv)) {
-       int sad = fn_ptr->sdf(what->buf, what->stride,
-                             best_address + ss[i].offset, in_what->stride);
-        if (sad < best_sad) {
-          sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-          if (sad < best_sad) {
-            best_sad = sad;
-            best_site = i;
-          }
-        }
-      }
-
-      i++;
-    }
-
-    if (best_site != last_site) {
-      best_mv->row += ss[best_site].mv.row;
-      best_mv->col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
-      last_site = best_site;
-#if defined(NEW_DIAMOND_SEARCH)
-      while (1) {
-        const MV this_mv = {best_mv->row + ss[best_site].mv.row,
-                            best_mv->col + ss[best_site].mv.col};
-        if (is_mv_in(x, &this_mv)) {
-          int sad = fn_ptr->sdf(what->buf, what->stride,
-                                best_address + ss[best_site].offset,
-                                in_what->stride);
-          if (sad < best_sad) {
-            sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              best_mv->row += ss[best_site].mv.row;
-              best_mv->col += ss[best_site].mv.col;
-              best_address += ss[best_site].offset;
-              continue;
-            }
-          }
-        }
-        break;
-      };
-#endif
-    } else if (best_address == in_what_ref) {
-      (*num00)++;
-    }
-  }
-  return best_sad;
-}
-
-int vp9_diamond_search_sadx4(const MACROBLOCK *x,
-                             const search_site_config *cfg,
                              MV *ref_mv, MV *best_mv, int search_param,
                              int sad_per_bit, int *num00,
                              const vp9_variance_fn_ptr_t *fn_ptr,
@@ -1152,8 +971,7 @@
           unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
                                              in_what_stride);
           if (thissad < bestsad) {
-            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                      mvjsadcost, mvsadcost, sad_per_bit);
+            thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
             if (thissad < bestsad) {
               bestsad = thissad;
               best_mv->row += ss[best_site].mv.row;
@@ -1427,51 +1245,6 @@
                               int search_range,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
-  const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
-                                     get_buf_from_mv(in_what, ref_mv),
-                                     in_what->stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
-  int i, j;
-
-  for (i = 0; i < search_range; i++) {
-    int best_site = -1;
-
-    for (j = 0; j < 4; j++) {
-      const MV mv = {ref_mv->row + neighbors[j].row,
-                     ref_mv->col + neighbors[j].col};
-      if (is_mv_in(x, &mv)) {
-        unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-            get_buf_from_mv(in_what, &mv), in_what->stride);
-        if (sad < best_sad) {
-          sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
-          if (sad < best_sad) {
-            best_sad = sad;
-            best_site = j;
-          }
-        }
-      }
-    }
-
-    if (best_site == -1) {
-      break;
-    } else {
-      ref_mv->row += neighbors[best_site].row;
-      ref_mv->col += neighbors[best_site].col;
-    }
-  }
-  return best_sad;
-}
-
-int vp9_refining_search_sadx4(const MACROBLOCK *x,
-                              MV *ref_mv, int error_per_bit,
-                              int search_range,
-                              const vp9_variance_fn_ptr_t *fn_ptr,
-                              const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
   const struct buf_2d *const what = &x->plane[0].src;

diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 07e410d..366f9af 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h

@@ -98,27 +98,12 @@
     const vp9_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
     int iters_per_step,
-    int *mvjcost,
-    int *mvcost[2],
-    int *distortion,
-    unsigned int *sse);
-
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
-
-typedef int (fractional_mv_step_comp_fp) (
-    const MACROBLOCK *x,
-    MV *bestmv, const MV *ref_mv,
-    int allow_hp,
-    int error_per_bit,
-    const vp9_variance_fn_ptr_t *vfp,
-    int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
-    int iters_per_step,
     int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1,
     const uint8_t *second_pred,
     int w, int h);
 
-extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
 
 typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     const MV *ref_mv, int sad_per_bit,

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 7515f44..b2ab714 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -106,26 +106,31 @@
   return const_motion;
 }
 
-static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                    int_mv *tmp_mv, int *rate_mv) {
+static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                  int_mv *tmp_mv, int *rate_mv,
+                                  int64_t best_rd_sofar) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
-  int step_param;
-  int sadpb = x->sadperbit16;
+  const int step_param = cpi->sf.mv.fullpel_search_step_param;
+  const int sadpb = x->sadperbit16;
   MV mvp_full;
-  int ref = mbmi->ref_frame[0];
+  const int ref = mbmi->ref_frame[0];
   const MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
-  int i;
-
-  int tmp_col_min = x->mv_col_min;
-  int tmp_col_max = x->mv_col_max;
-  int tmp_row_min = x->mv_row_min;
-  int tmp_row_max = x->mv_row_max;
-
+  int dis;
+  int rate_mode;
+  const int tmp_col_min = x->mv_col_min;
+  const int tmp_col_max = x->mv_col_max;
+  const int tmp_row_min = x->mv_row_min;
+  const int tmp_row_max = x->mv_row_max;
+  int rv = 0;
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
+  if (cpi->common.show_frame &&
+      (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME])
+    return rv;
+
   if (scaled_ref_frame) {
     int i;
     // Swap out the reference frame for a version that's been scaled to
@@ -133,28 +138,10 @@
     // motion search code to be used without additional modifications.
     for (i = 0; i < MAX_MB_PLANE; i++)
       backup_yv12[i] = xd->plane[i].pre[0];
-
     vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   }
-
   vp9_set_mv_search_range(x, &ref_mv);
 
-  // TODO(jingning) exploiting adaptive motion search control in non-RD
-  // mode decision too.
-  step_param = cpi->sf.mv.fullpel_search_step_param;
-
-  for (i = LAST_FRAME; i <= LAST_FRAME && cpi->common.show_frame; ++i) {
-    if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
-      tmp_mv->as_int = INVALID_MV;
-
-      if (scaled_ref_frame) {
-        int i;
-        for (i = 0; i < MAX_MB_PLANE; i++)
-          xd->plane[i].pre[0] = backup_yv12[i];
-      }
-      return;
-    }
-  }
   assert(x->mv_best_ref_index[ref] <= 2);
   if (x->mv_best_ref_index[ref] < 2)
     mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
@@ -172,60 +159,39 @@
   x->mv_row_min = tmp_row_min;
   x->mv_row_max = tmp_row_max;
 
-  if (scaled_ref_frame) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      xd->plane[i].pre[0] = backup_yv12[i];
-  }
-
   // calculate the bit cost on motion vector
   mvp_full.row = tmp_mv->as_mv.row * 8;
   mvp_full.col = tmp_mv->as_mv.col * 8;
+
   *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-}
 
-static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                    MV *tmp_mv) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
-  int ref = mbmi->ref_frame[0];
-  MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
-  int dis;
+  rate_mode = cpi->inter_mode_cost[mbmi->mode_context[ref]]
+                                  [INTER_OFFSET(NEWMV)];
+  rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) >
+         best_rd_sofar);
 
-  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
-                                                                        ref);
-  if (scaled_ref_frame) {
-    int i;
-    // Swap out the reference frame for a version that's been scaled to
-    // match the resolution of the current frame, allowing the existing
-    // motion search code to be used without additional modifications.
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      backup_yv12[i] = xd->plane[i].pre[0];
-
-    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  if (rv) {
+    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+                                 cpi->common.allow_high_precision_mv,
+                                 x->errorperbit,
+                                 &cpi->fn_ptr[bsize],
+                                 cpi->sf.mv.subpel_force_stop,
+                                 cpi->sf.mv.subpel_iters_per_step,
+                                 x->nmvjointcost, x->mvcost,
+                                 &dis, &x->pred_sse[ref], NULL, 0, 0);
+    x->pred_mv[ref] = tmp_mv->as_mv;
   }
 
-  cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv,
-                               cpi->common.allow_high_precision_mv,
-                               x->errorperbit,
-                               &cpi->fn_ptr[bsize],
-                               cpi->sf.mv.subpel_force_stop,
-                               cpi->sf.mv.subpel_iters_per_step,
-                               x->nmvjointcost, x->mvcost,
-                               &dis, &x->pred_sse[ref]);
-
   if (scaled_ref_frame) {
     int i;
     for (i = 0; i < MAX_MB_PLANE; i++)
       xd->plane[i].pre[0] = backup_yv12[i];
   }
-
-  x->pred_mv[ref] = *tmp_mv;
+  return rv;
 }
 
+
 static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
                               MACROBLOCK *x, MACROBLOCKD *xd,
                               int *out_rate_sum, int64_t *out_dist_sum,
@@ -544,28 +510,17 @@
         continue;
 
       if (this_mode == NEWMV) {
-        int rate_mode = 0;
         if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
-
-        full_pixel_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                 &frame_mv[NEWMV][ref_frame], &rate_mv);
-
-        if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
+        if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                    &frame_mv[NEWMV][ref_frame],
+                                    &rate_mv, best_rd))
           continue;
-
-        rate_mode = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                        [INTER_OFFSET(this_mode)];
-        if (RDCOST(x->rdmult, x->rddiv, rate_mv + rate_mode, 0) > best_rd)
-          continue;
-
-        sub_pixel_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                &frame_mv[NEWMV][ref_frame].as_mv);
       }
 
-      if (this_mode != NEARESTMV)
-        if (frame_mv[this_mode][ref_frame].as_int ==
-            frame_mv[NEARESTMV][ref_frame].as_int)
+      if (this_mode != NEARESTMV &&
+          frame_mv[this_mode][ref_frame].as_int ==
+              frame_mv[NEARESTMV][ref_frame].as_int)
           continue;
 
       mbmi->mode = this_mode;

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 4964e0f..370e1ce 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -104,6 +104,49 @@
   *eob_ptr = eob + 1;
 }
 
+// TODO(jingning) Refactor this file and combine functions with similar
+// operations.
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr,
+                             int zbin_oq_value, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)zbin_oq_value;
+  (void)iscan;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+
+  if (!skip_block) {
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int tmp = 0;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+        tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
+        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      }
+
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
                       int skip_block,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index a8daa21..998fb3c 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -1422,7 +1422,8 @@
                                          cpi->sf.mv.subpel_iters_per_step,
                                          x->nmvjointcost, x->mvcost,
                                          &distortion,
-                                         &x->pred_sse[mbmi->ref_frame[0]]);
+                                         &x->pred_sse[mbmi->ref_frame[0]],
+                                         NULL, 0, 0);
 
             // save motion search result for use in compound prediction
             seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
@@ -1838,7 +1839,7 @@
                                  cpi->sf.mv.subpel_force_stop,
                                  cpi->sf.mv.subpel_iters_per_step,
                                  x->nmvjointcost, x->mvcost,
-                                 &dis, &x->pred_sse[ref]);
+                                 &dis, &x->pred_sse[ref], NULL, 0, 0);
   }
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
@@ -1954,7 +1955,7 @@
     if (bestsme < INT_MAX) {
       int dis; /* TODO: use dis in distortion calculation later. */
       unsigned int sse;
-      bestsme = cpi->find_fractional_mv_step_comp(
+      bestsme = cpi->find_fractional_mv_step(
           x, &tmp_mv,
           &ref_mv[id].as_mv,
           cpi->common.allow_high_precision_mv,

diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 98d6825..f271182 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c

@@ -253,6 +253,7 @@
   }
 
   if (speed >= 5) {
+    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
     sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?
         RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
     sf->max_partition_size = BLOCK_32X32;
@@ -265,7 +266,6 @@
     sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15;
     sf->partition_search_type = REFERENCE_PARTITION;
     sf->use_nonrd_pick_mode = 1;
-    sf->mv.search_method = FAST_DIAMOND;
     sf->allow_skip_recode = 0;
   }
 
@@ -287,7 +287,7 @@
     sf->mv.reduce_first_step_size = 1;
   }
   if (speed >= 7) {
-    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
+    sf->mv.search_method = FAST_DIAMOND;
     sf->mv.fullpel_search_step_param = 10;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
     sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
@@ -396,7 +396,6 @@
 
   if (sf->mv.subpel_search_method == SUBPEL_TREE) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
-    cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree;
   }
 
   cpi->mb.optimize = sf->optimize_coefficients == 1 && cpi->pass != 1;

diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index f4b4e04..caa831c 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c

@@ -178,7 +178,7 @@
                                          &cpi->fn_ptr[BLOCK_16X16],
                                          0, mv_sf->subpel_iters_per_step,
                                          NULL, NULL,
-                                         &distortion, &sse);
+                                         &distortion, &sse, NULL, 0, 0);
 
   // Restore input state
   x->plane[0].src = src;

diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 2d9f2b0..508e1d4 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

@@ -234,21 +234,18 @@
   movifnidn                   quantq, quantmp
   mova                            m1, [roundq]             ; m1 = round
   mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, b_32x32
-; TODO(jingning) to be continued with 32x32 quantization process
+%ifidn %1, fp_32x32
   pcmpeqw                         m5, m5
   psrlw                           m5, 15
-  paddw                           m0, m5
   paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
   psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
 %endif
   mova                            m3, [r2q]                ; m3 = dequant
   mov                             r3, qcoeffmp
   mov                             r4, dqcoeffmp
   mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
+%ifidn %1, fp_32x32
+  psllw                           m2, 1
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
@@ -275,18 +272,19 @@
   psignw                         m13, m10                  ; m13 = reinsert sign
   mova        [qcoeffq+ncoeffq*2+ 0], m8
   mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
   pabsw                           m8, m8
   pabsw                          m13, m13
 %endif
   pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
   punpckhqdq                      m3, m3
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
   psrlw                           m8, 1
   psrlw                          m13, 1
   psignw                          m8, m9
   psignw                         m13, m10
+  psrlw                           m0, m3, 2
 %endif
   mova       [dqcoeffq+ncoeffq*2+ 0], m8
   mova       [dqcoeffq+ncoeffq*2+16], m13
@@ -307,13 +305,17 @@
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpeqw                         m7, m7
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
+  pcmpgtw                         m7, m6,  m0
+  pcmpgtw                        m12, m11, m0
   pmovmskb                        r6, m7
-  pmovmskb                        r2, m7
+  pmovmskb                        r2, m12
+
   or                              r6, r2
   jz .skip_iter
 %endif
+  pcmpeqw                         m7, m7
+
   paddsw                          m6, m1                   ; m6 += round
   paddsw                         m11, m1                   ; m11 += round
   pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
@@ -322,13 +324,13 @@
   psignw                         m13, m10                  ; m13 = reinsert sign
   mova        [qcoeffq+ncoeffq*2+ 0], m14
   mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
   pabsw                          m14, m14
   pabsw                          m13, m13
 %endif
   pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
   psrlw                          m14, 1
   psrlw                          m13, 1
   psignw                         m14, m9
@@ -349,7 +351,7 @@
   add                        ncoeffq, mmsize
   jl .ac_only_loop
 
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
   jmp .accumulate_eob
 .skip_iter:
   mova        [qcoeffq+ncoeffq*2+ 0], m5
@@ -397,3 +399,4 @@
 
 INIT_XMM ssse3
 QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7

diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 8c1f345..81fe6a6 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk

@@ -50,6 +50,8 @@
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 VP9_COMMON_SRCS-yes += common/vp9_textblit.h
+VP9_COMMON_SRCS-yes += common/vp9_thread.h
+VP9_COMMON_SRCS-yes += common/vp9_thread.c
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index b150161..24dcbfa 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c

@@ -353,7 +353,7 @@
 
   oxcf->key_freq               = cfg->kf_max_dist;
 
-  oxcf->speed                  =  clamp(abs(extra_cfg->cpu_used), 0, 7);
+  oxcf->speed                  =  abs(extra_cfg->cpu_used);
   oxcf->encode_breakout        =  extra_cfg->static_thresh;
   oxcf->play_alternate         =  extra_cfg->enable_auto_alt_ref;
   oxcf->noise_sensitivity      =  extra_cfg->noise_sensitivity;

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index c3ca7ee..2591852 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c

@@ -20,6 +20,7 @@
 #include "vp9/common/vp9_frame_buffers.h"
 
 #include "vp9/decoder/vp9_decoder.h"
+#include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 
 #include "vp9/vp9_iface_common.h"
@@ -98,8 +99,10 @@
 static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
                                                 unsigned int data_sz,
                                                 vpx_codec_stream_info_t *si,
+                                                int *is_intra_only,
                                                 vpx_decrypt_cb decrypt_cb,
                                                 void *decrypt_state) {
+  int intra_only_flag = 0;
   uint8_t clear_buffer[9];
 
   if (data + data_sz <= data)
@@ -115,6 +118,8 @@
   }
 
   {
+    int show_frame;
+    int error_resilient;
     struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
     const int frame_marker = vp9_rb_read_literal(&rb, 2);
     const int version = vp9_rb_read_bit(&rb);
@@ -126,6 +131,7 @@
     if (version > 1) return VPX_CODEC_UNSUP_BITSTREAM;
 
     if (vp9_rb_read_bit(&rb)) {  // show an existing frame
+      vp9_rb_read_literal(&rb, 3);  // Frame buffer to show.
       return VPX_CODEC_OK;
     }
 
@@ -133,18 +139,15 @@
       return VPX_CODEC_UNSUP_BITSTREAM;
 
     si->is_kf = !vp9_rb_read_bit(&rb);
+    show_frame = vp9_rb_read_bit(&rb);
+    error_resilient = vp9_rb_read_bit(&rb);
+
     if (si->is_kf) {
       const int sRGB = 7;
       int colorspace;
 
-      rb.bit_offset += 1;  // show frame
-      rb.bit_offset += 1;  // error resilient
-
-      if (vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_0 ||
-          vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_1 ||
-          vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_2) {
+      if (!vp9_read_sync_code(&rb))
         return VPX_CODEC_UNSUP_BITSTREAM;
-      }
 
       colorspace = vp9_rb_read_literal(&rb, 3);
       if (colorspace != sRGB) {
@@ -161,20 +164,28 @@
           return VPX_CODEC_UNSUP_BITSTREAM;
         }
       }
+      vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
+    } else {
+      intra_only_flag = show_frame ? 0 : vp9_rb_read_bit(&rb);
+      rb.bit_offset += error_resilient ? 0 : 2;  // reset_frame_context
 
-      // TODO(jzern): these are available on non-keyframes in intra only mode.
-      si->w = vp9_rb_read_literal(&rb, 16) + 1;
-      si->h = vp9_rb_read_literal(&rb, 16) + 1;
+      if (intra_only_flag) {
+        if (!vp9_read_sync_code(&rb))
+          return VPX_CODEC_UNSUP_BITSTREAM;
+        rb.bit_offset += REF_FRAMES;  // refresh_frame_flags
+        vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
+      }
     }
   }
-
+  if (is_intra_only != NULL)
+    *is_intra_only = intra_only_flag;
   return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t decoder_peek_si(const uint8_t *data,
                                        unsigned int data_sz,
                                        vpx_codec_stream_info_t *si) {
-  return decoder_peek_si_internal(data, data_sz, si, NULL, NULL);
+  return decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL);
 }
 
 static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx,
@@ -266,13 +277,14 @@
   // validate that we have a buffer that does not wrap around the top
   // of the heap.
   if (!ctx->si.h) {
+    int is_intra_only = 0;
     const vpx_codec_err_t res =
-        decoder_peek_si_internal(*data, data_sz, &ctx->si, ctx->decrypt_cb,
-                                 ctx->decrypt_state);
+        decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only,
+                                 ctx->decrypt_cb, ctx->decrypt_state);
     if (res != VPX_CODEC_OK)
       return res;
 
-    if (!ctx->si.is_kf)
+    if (!ctx->si.is_kf && !is_intra_only)
       return VPX_CODEC_ERROR;
   }
 

diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 92ec6fd..1fcb36f 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk

@@ -31,8 +31,6 @@
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
 VP9_DX_SRCS-yes += decoder/vp9_decoder.c
 VP9_DX_SRCS-yes += decoder/vp9_decoder.h
-VP9_DX_SRCS-yes += decoder/vp9_thread.c
-VP9_DX_SRCS-yes += decoder/vp9_thread.h
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
commit	af4f390fff466f27e2b9f9f9e3d524e0f041e00b	[log] [tgz]
author	Adrian Grange <agrange@google.com>	Fri Jul 11 10:52:09 2014 -0700
committer	Gerrit Code Review <gerrit@gerrit.golo.chromium.org>	Fri Jul 11 10:52:09 2014 -0700
tree	077d7d5deb0035b02ac26f8e4e43870961cab18c
parent	a75d55df1b362cb608f1ea6b70154cf166c32a3f [diff]
parent	e3e6e06155e3bdefac9775b6b50d84aaf1aca06c [diff]