Merge "Speed up idct8x8 by rearrange instructions. Speed improve from 264% ~ 270% to 280% ~ 300% base on assembly-perf."
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index e05d482..9cfa386 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -13,15 +13,17 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
 
 extern "C" {
+#include "./vpx_config.h"
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
-  void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
-  void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
 }
 
-#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
@@ -36,29 +38,9 @@
 }
 #endif
 
-static const double kPi = 3.141592653589793238462643383279502884;
-static void reference2_32x32_idct_2d(double *input, double *output) {
-  double x;
-  for (int l = 0; l < 32; ++l) {
-    for (int k = 0; k < 32; ++k) {
-      double s = 0;
-      for (int i = 0; i < 32; ++i) {
-        for (int j = 0; j < 32; ++j) {
-          x = cos(kPi * j * (l + 0.5) / 32.0) *
-              cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
-          if (i != 0)
-            x *= sqrt(2.0);
-          if (j != 0)
-            x *= sqrt(2.0);
-          s += x;
-        }
-      }
-      output[k * 32 + l] = s / 4;
-    }
-  }
-}
-
-static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
+const int kNumCoeffs = 1024;
+const double kPi = 3.141592653589793238462643383279502884;
+void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
   const double kInvSqrt2 = 0.707106781186547524400844362104;
   for (int k = 0; k < 32; k++) {
     out[k] = 0.0;
@@ -69,7 +51,8 @@
   }
 }
 
-static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
+void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
+                            double output[kNumCoeffs]) {
   // First transform columns
   for (int i = 0; i < 32; ++i) {
     double temp_in[32], temp_out[32];
@@ -91,27 +74,165 @@
   }
 }
 
-TEST(VP9Idct32x32Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t in[1024], coeff[1024];
-    uint8_t dst[1024], src[1024];
-    double out_r[1024];
+typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
+typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);
 
-    for (int j = 0; j < 1024; ++j) {
+class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
+ public:
+  virtual ~Trans32x32Test() {}
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    version_  = GET_PARAM(2);  // 0: high precision forward transform
+                               // 1: low precision version for rd loop
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int version_;
+  fwd_txfm_t fwd_txfm_;
+  inv_txfm_t inv_txfm_;
+};
+
+TEST_P(Trans32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  uint32_t max_error = 0;
+  int64_t total_error = 0;
+  const int count_test_block = 1000;
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < kNumCoeffs; ++j) {
       src[j] = rnd.Rand8();
       dst[j] = rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
     }
+
+    const int pitch = 64;
+    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
+    REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      const uint32_t diff = dst[j] - src[j];
+      const uint32_t error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  if (version_ == 1) {
+    max_error /= 2;
+    total_error /= 45;
+  }
+
+  EXPECT_GE(1u, max_error)
+      << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
+
+  EXPECT_GE(count_test_block, total_error)
+      << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
+}
+
+TEST_P(Trans32x32Test, CoeffCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    for (int j = 0; j < kNumCoeffs; ++j)
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));
+
+    if (version_ == 0) {
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: 32x32 FDCT versions have mismatched coefficients";
+    } else {
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+            << "Error: 32x32 FDCT rd has mismatched coefficients";
+    }
+  }
+}
+
+TEST_P(Trans32x32Test, MemCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 2000;
+
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
     // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j)
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_extreme_block[j] = 255;
+    if (i == 1)
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_extreme_block[j] = -255;
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (version_ == 0) {
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: 32x32 FDCT versions have mismatched coefficients";
+      } else {
+        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+            << "Error: 32x32 FDCT rd has mismatched coefficients";
+      }
+      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
+          << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 32x32 FDCT has coefficient larger than "
+          << "4*DCT_MAX_VALUE";
+    }
+  }
+}
+
+TEST_P(Trans32x32Test, InverseAccuracy) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    double out_r[kNumCoeffs];
+
+    // Initialize a test block with input range [-255, 255]
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
       in[j] = src[j] - dst[j];
+    }
 
     reference_32x32_dct_2d(in, out_r);
-    for (int j = 0; j < 1024; j++)
+    for (int j = 0; j < kNumCoeffs; ++j)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct32x32_add_c(coeff, dst, 32);
-    for (int j = 0; j < 1024; ++j) {
+    REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+    for (int j = 0; j < kNumCoeffs; ++j) {
       const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
@@ -121,72 +242,21 @@
   }
 }
 
-TEST(VP9Fdct32x32Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  unsigned int max_error = 0;
-  int64_t total_error = 0;
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[1024];
-    int16_t test_temp_block[1024];
-    uint8_t dst[1024], src[1024];
+using std::tr1::make_tuple;
 
-    for (int j = 0; j < 1024; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j)
-      test_input_block[j] = src[j] - dst[j];
+INSTANTIATE_TEST_CASE_P(
+    C, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0),
+        make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1)));
 
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
-
-    for (int j = 0; j < 1024; ++j) {
-      const unsigned diff = dst[j] - src[j];
-      const unsigned error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
-    }
-  }
-
-  EXPECT_GE(1u, max_error)
-      << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
-
-  EXPECT_GE(count_test_block, total_error)
-      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block";
-}
-
-TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t input_block[1024], input_extreme_block[1024];
-    int16_t output_block[1024], output_extreme_block[1024];
-
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-    }
-    if (i == 0)
-      for (int j = 0; j < 1024; ++j)
-        input_extreme_block[j] = 255;
-
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(input_block, output_block, pitch);
-    vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
-
-    // The minimum quant value is 4.
-    for (int j = 0; j < 1024; ++j) {
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
-          << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
-          << "Error: 32x32 FDCT extreme has coefficient larger than "
-             "4*DCT_MAX_VALUE";
-    }
-  }
-}
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct32x32_sse2,
+                   &vp9_short_idct32x32_add_sse2, 0),
+        make_tuple(&vp9_short_fdct32x32_rd_sse2,
+                   &vp9_short_idct32x32_add_sse2, 1)));
+#endif
 }  // namespace
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 81d242b..ee6c9f6 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -13,14 +13,16 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
 #include "vpx_ports/mem.h"
 
 extern "C" {
-#include "vp9_rtcd.h"
-void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
+#include "./vp9_rtcd.h"
+void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch);
 }
 
-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
@@ -62,6 +64,7 @@
       inv_txfm = iht8x8_add;
     }
   }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
   void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
@@ -92,8 +95,9 @@
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 64; ++j)
       test_input_block[j] = rnd.Rand8() - rnd.Rand8();
-
-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_output_block,
+                   NULL, pitch, tx_type_));
 
     for (int j = 0; j < 64; ++j) {
       if (test_output_block[j] < 0)
@@ -121,8 +125,9 @@
     // Initialize a test block with input range [-15, 15].
     for (int j = 0; j < 64; ++j)
       test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
-
-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_output_block,
+                   NULL, pitch, tx_type_));
 
     for (int j = 0; j < 64; ++j) {
       if (test_output_block[j] < 0)
@@ -165,9 +170,11 @@
       test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 16;
-    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-    for (int j = 0; j < 64; ++j){
-        if(test_temp_block[j] > 0) {
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));
+    for (int j = 0; j < 64; ++j) {
+        if (test_temp_block[j] > 0) {
           test_temp_block[j] += 2;
           test_temp_block[j] /= 4;
           test_temp_block[j] *= 4;
@@ -177,7 +184,9 @@
           test_temp_block[j] *= 4;
         }
     }
-    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+    REGISTER_STATE_CHECK(
+        RunInvTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));
 
     for (int j = 0; j < 64; ++j) {
       const int diff = dst[j] - src[j];
@@ -216,8 +225,12 @@
       test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 16;
-    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));
+    REGISTER_STATE_CHECK(
+        RunInvTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));
 
     for (int j = 0; j < 64; ++j) {
       const int diff = dst[j] - src[j];
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 579d7e2..370ffc1 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -522,3 +522,5 @@
 94ad19b8b699cea105e2ff18f0df2afd7242bcf7  vp90-2-03-size-226x226.webm.md5
 495256cfd123fe777b2c0406862ed8468a1f4677  vp91-2-04-yv444.webm
 65e3a7ffef61ab340d9140f335ecc49125970c2c  vp91-2-04-yv444.webm.md5
+b6524e4084d15b5d0caaa3d3d1368db30cbee69c  vp90-2-03-deltaq.webm
+65f45ec9a55537aac76104818278e0978f94a678  vp90-2-03-deltaq.webm.md5
diff --git a/test/test.mk b/test/test.mk
index 2042c86..4eb599d 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -629,5 +629,7 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 4cd356d..9bd03b9 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -159,7 +159,7 @@
   "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
   "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
   "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
-  "vp90-2-03-size-226x226.webm",
+  "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
 #if CONFIG_NON420
   "vp91-2-04-yv444.webm"
 #endif
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 699b44a..f138c09 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -341,7 +341,7 @@
                                ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                                const int16_t **scan,
                                const uint8_t **band_translate) {
-  ENTROPY_CONTEXT above_ec, left_ec;
+  ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
 
   switch (tx_size) {
     case TX_4X4:
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index c6eefda..2e973e5 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -149,8 +149,6 @@
 static void inc_mv_component(int v, nmv_component_counts *comp_counts,
                              int incr, int usehp) {
   int s, z, c, o, d, e, f;
-  if (!incr)
-    return;
   assert (v != 0);            /* should not be zero */
   s = v < 0;
   comp_counts->sign[s] += incr;
@@ -177,35 +175,24 @@
   }
 }
 
-static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
-  int v;
-  vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
-  for (v = 1; v <= MV_MAX; v++) {
-    inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
-    inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
-  }
-}
 
 void vp9_inc_mv(const MV *mv,  nmv_context_counts *counts) {
   const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
   ++counts->joints[j];
 
-  if (mv_joint_vertical(j))
-    ++counts->comps[0].mvcount[MV_MAX + mv->row];
+  if (mv_joint_vertical(j)) {
+    inc_mv_component(mv->row, &counts->comps[0], 1, 1);
+  }
 
-  if (mv_joint_horizontal(j))
-    ++counts->comps[1].mvcount[MV_MAX + mv->col];
+  if (mv_joint_horizontal(j)) {
+    inc_mv_component(mv->col, &counts->comps[1], 1, 1);
+  }
 }
 
 static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
   return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
 }
 
-void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
-  counts_to_context(&nmv_count->comps[0], usehp);
-  counts_to_context(&nmv_count->comps[1], usehp);
-}
-
 static unsigned int adapt_probs(unsigned int i,
                                 vp9_tree tree,
                                 vp9_prob this_probs[],
@@ -235,8 +222,6 @@
   nmv_context *pre_ctx = &pre_fc->nmvc;
   nmv_context_counts *cts = &cm->counts.mv;
 
-  vp9_counts_process(cts, allow_hp);
-
   adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
 
   for (i = 0; i < 2; ++i) {
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index cfa61c2..df806ac 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -777,6 +777,7 @@
     }
   }
 }
+#if CONFIG_NON420
 static void filter_block_plane_non420(VP9_COMMON *cm,
                                       struct macroblockd_plane *plane,
                                       const MODE_INFO *mi,
@@ -896,6 +897,7 @@
     dst->buf += 8 * dst->stride;
   }
 }
+#endif
 
 static void filter_block_plane(VP9_COMMON *const cm,
                                struct macroblockd_plane *const plane,
@@ -981,8 +983,10 @@
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
   LOOP_FILTER_MASK lfm;
+#if CONFIG_NON420
   int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
       xd->plane[1].subsampling_x == 1);
+#endif
 
   for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
     MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
@@ -993,16 +997,22 @@
       setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
+#if CONFIG_NON420
       if (use_420)
+#endif
         setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mode_info_stride, &lfm);
 
       for (plane = 0; plane < num_planes; ++plane) {
+#if CONFIG_NON420
         if (use_420)
+#endif
           filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col,
                              &lfm);
+#if CONFIG_NON420
         else
           filter_block_plane_non420(cm, &xd->plane[plane], mi + mi_col,
                                     mi_row, mi_col);
+#endif
       }
     }
   }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index c2777aa..f5eeb2c 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -701,7 +701,7 @@
 specialize vp9_quantize_b $ssse3_x86_64
 
 prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
-specialize vp9_quantize_b_32x32 # $ssse3_x86_64 FIXME(jingning): need a unit test on thisbefore enabled
+specialize vp9_quantize_b_32x32 $ssse3_x86_64
 
 #
 # Structured Similarity (SSIM)
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 8dfb22c..7f23dc1 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -460,6 +460,7 @@
 
   if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
+    assert(bsize >= BLOCK_8X8);
   } else {
     if (bsize >= BLOCK_8X8)
       mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index e41ea54..6cb7c09 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -454,8 +454,7 @@
 
 static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
   const int old = *delta_q;
-  if (vp9_rb_read_bit(rb))
-    *delta_q = vp9_rb_read_signed_literal(rb, 4);
+  *delta_q = vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
   return old != *delta_q;
 }
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index eb83903..45758e7 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -851,13 +851,75 @@
   }
 }
 
-static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, BLOCK_SIZE bsize) {
+// Check to see if the given partition size is allowed for a specified number
+// of 8x8 block rows and columns remaining in the image.
+// If not then return the largest allowed partition size
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
+                                      int rows_left, int cols_left,
+                                      int *bh, int *bw) {
+  if ((rows_left <= 0) || (cols_left <= 0)) {
+    return MIN(bsize, BLOCK_8X8);
+  } else {
+    for (; bsize > 0; --bsize) {
+      *bh = num_8x8_blocks_high_lookup[bsize];
+      *bw = num_8x8_blocks_wide_lookup[bsize];
+      if ((*bh <= rows_left) && (*bw <= cols_left)) {
+        break;
+      }
+    }
+  }
+  return bsize;
+}
+
+// This function attempts to set all mode info entries in a given SB64
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed in which case this code attempts to choose the largest
+// allowable partition.
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
+                             int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
+  BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
   const int mis = cm->mode_info_stride;
+  int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row;
+  int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col;
   int block_row, block_col;
-  for (block_row = 0; block_row < 8; ++block_row) {
-    for (block_col = 0; block_col < 8; ++block_col) {
-      m[block_row * mis + block_col].mbmi.sb_type = bsize;
+
+  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+  // Apply the requested partition size to the SB64 if it is all "in image"
+  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+      (row8x8_remaining >= MI_BLOCK_SIZE)) {
+    for (block_row = 0; block_row < MI_BLOCK_SIZE; ++block_row) {
+      for (block_col = 0; block_col < MI_BLOCK_SIZE; ++block_col) {
+        m[block_row * mis + block_col].mbmi.sb_type = bsize;
+      }
+    }
+  } else {
+    // Else this is a partial SB64.
+    int bh = num_8x8_blocks_high_lookup[bsize];
+    int bw = num_8x8_blocks_wide_lookup[bsize];
+    int sub_block_row;
+    int sub_block_col;
+    int row_index;
+    int col_index;
+
+    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
+      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
+        // Find a partition size that fits
+        bsize = find_partition_size(cpi->sf.always_this_block_size,
+                                    (row8x8_remaining - block_row),
+                                    (col8x8_remaining - block_col), &bh, &bw);
+
+        // Set the mi entries for all 8x8 blocks within the selected size
+        for (sub_block_row = 0; sub_block_row < bh; ++sub_block_row) {
+          for (sub_block_col = 0; sub_block_col < bw; ++sub_block_col) {
+            row_index = block_row + sub_block_row;
+            col_index = block_col + sub_block_col;
+            m[row_index * mis + col_index].mbmi.sb_type = bsize;
+          }
+        }
+      }
     }
   }
 }
@@ -1946,7 +2008,7 @@
       cpi->mb.source_variance = UINT_MAX;
       if (cpi->sf.use_one_partition_size_always) {
         set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
-        set_partitioning(cpi, m, cpi->sf.always_this_block_size);
+        set_partitioning(cpi, m, mi_row, mi_col);
         rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1);
       } else if (cpi->sf.partition_by_variance) {
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 1203c00..9977289 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -155,7 +155,6 @@
     unsigned int (*branch_ct_class0_hp)[2],
     unsigned int (*branch_ct_hp)[2]) {
   int i, j, k;
-  vp9_counts_process(nmv_count, usehp);
   vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
                                    prob->joints,
                                    branch_ct_joint,
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index fb0e470..96abeff 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -135,6 +135,7 @@
 
       if (x >= zbin) {
         x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        x  = clamp(x, INT16_MIN, INT16_MAX);
         y  = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
               quant_shift_ptr[rc != 0]) >> 15;      // quantize (x)
 
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
index 7deb981..ae0d6cd 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -70,9 +70,15 @@
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   punpckhqdq                      m0, m0
   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+  paddsw                          m6, m1
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1
+%else
   paddw                           m6, m1                   ; m6 += round
   punpckhqdq                      m1, m1
   paddw                          m11, m1                   ; m11 += round
+%endif
   pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
   punpckhqdq                      m2, m2
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
@@ -126,9 +132,12 @@
   pmovmskb                        r2, m12
   or                              r6, r2
   jz .skip_iter
-%endif
+  paddsw                          m6, m1
+  paddsw                         m11, m1
+%else
   paddw                           m6, m1                   ; m6 += round
   paddw                          m11, m1                   ; m11 += round
+%endif
   pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   paddw                          m14, m6                   ; m14 += m6