Merge "API extensions and sample app for spacial scalable encoder"

diff --git a/configure b/configure
index a252081..297cec4 100755
--- a/configure
+++ b/configure

@@ -38,6 +38,7 @@
   ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
   ${toggle_mem_tracker}           track memory usage
   ${toggle_postproc}              postprocessing
+  ${toggle_vp9_postproc}          vp9 specific postprocessing
   ${toggle_multithread}           multithreaded encoding and decoding
   ${toggle_spatial_resampling}    spatial sampling (scaling) support
   ${toggle_realtime_only}         enable this option while building for real-time encoding
@@ -279,6 +280,7 @@
     dc_recon
     runtime_cpu_detect
     postproc
+    vp9_postproc
     multithread
     internal_stats
     ${CODECS}
@@ -333,6 +335,7 @@
     dequant_tokens
     dc_recon
     postproc
+    vp9_postproc
     multithread
     internal_stats
     ${CODECS}
@@ -438,7 +441,7 @@
     done
     enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
     enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
-    ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost"
+    ! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost"
     ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
     ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
     DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
@@ -647,6 +650,10 @@
         enable_feature dc_recon
     fi
 
+    if enabled internal_stats; then
+        enable_feature vp9_postproc
+    fi
+
     # Enable the postbuild target if building for visual studio.
     case "$tgt_cc" in
         vs*) enable_feature msvs

diff --git a/test/acm_random.h b/test/acm_random.h
index cd33d12..de94186 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h

@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef LIBVPX_TEST_ACM_RANDOM_H_
-#define LIBVPX_TEST_ACM_RANDOM_H_
+#ifndef TEST_ACM_RANDOM_H_
+#define TEST_ACM_RANDOM_H_
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -59,4 +59,4 @@
 
 }  // namespace libvpx_test
 
-#endif  // LIBVPX_TEST_ACM_RANDOM_H_
+#endif  // TEST_ACM_RANDOM_H_

diff --git a/test/borders_test.cc b/test/borders_test.cc
index 7bfece8..efaedb9 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc

@@ -29,7 +29,7 @@
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if ( video->frame() == 1) {
+    if (video->frame() == 1) {
       encoder->Control(VP8E_SET_CPUUSED, 0);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);

diff --git a/test/clear_system_state.h b/test/clear_system_state.h
index e240981..8f08a4c 100644
--- a/test/clear_system_state.h
+++ b/test/clear_system_state.h

@@ -10,7 +10,7 @@
 #ifndef TEST_CLEAR_SYSTEM_STATE_H_
 #define TEST_CLEAR_SYSTEM_STATE_H_
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 extern "C" {
 #if ARCH_X86 || ARCH_X86_64
 # include "vpx_ports/x86.h"

diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 287e805..f020a99 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc

@@ -75,7 +75,7 @@
     bits_in_buffer_model_ -= frame_size_in_bits;
 
     // Update the running total of bits for end of test datarate checks.
-    bits_total_ += frame_size_in_bits ;
+    bits_total_ += frame_size_in_bits;
 
     // If first drop not set and we have a drop set it to this time.
     if (!first_drop_ && duration > 1)

diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 0743f35..544d188 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc

@@ -17,11 +17,11 @@
 
 extern "C" {
 #include "vp9/common/vp9_entropy.h"
-#include "vp9_rtcd.h"
-void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
+#include "./vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch);
 }
 
-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
@@ -31,9 +31,9 @@
 #ifdef _MSC_VER
 static int round(double x) {
   if (x < 0)
-    return (int)ceil(x - 0.5);
+    return static_cast<int>(ceil(x - 0.5));
   else
-    return (int)floor(x + 0.5);
+    return static_cast<int>(floor(x + 0.5));
 }
 #endif
 
@@ -45,7 +45,9 @@
       double s = 0;
       for (int i = 0; i < 16; ++i) {
         for (int j = 0; j < 16; ++j) {
-          x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/256;
+          x = cos(PI * j * (l + 0.5) / 16.0) *
+              cos(PI * i * (k + 0.5) / 16.0) *
+              input[i * 16 + j] / 256;
           if (i != 0)
             x *= sqrt(2.0);
           if (j != 0)

diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index e05d482..f331886 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc

@@ -13,15 +13,17 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
 
 extern "C" {
+#include "./vpx_config.h"
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
-  void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
-  void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
 }
 
-#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
@@ -30,35 +32,15 @@
 #ifdef _MSC_VER
 static int round(double x) {
   if (x < 0)
-    return (int)ceil(x - 0.5);
+    return static_cast<int>(ceil(x - 0.5));
   else
-    return (int)floor(x + 0.5);
+    return static_cast<int>(floor(x + 0.5));
 }
 #endif
 
-static const double kPi = 3.141592653589793238462643383279502884;
-static void reference2_32x32_idct_2d(double *input, double *output) {
-  double x;
-  for (int l = 0; l < 32; ++l) {
-    for (int k = 0; k < 32; ++k) {
-      double s = 0;
-      for (int i = 0; i < 32; ++i) {
-        for (int j = 0; j < 32; ++j) {
-          x = cos(kPi * j * (l + 0.5) / 32.0) *
-              cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
-          if (i != 0)
-            x *= sqrt(2.0);
-          if (j != 0)
-            x *= sqrt(2.0);
-          s += x;
-        }
-      }
-      output[k * 32 + l] = s / 4;
-    }
-  }
-}
-
-static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
+const int kNumCoeffs = 1024;
+const double kPi = 3.141592653589793238462643383279502884;
+void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
   const double kInvSqrt2 = 0.707106781186547524400844362104;
   for (int k = 0; k < 32; k++) {
     out[k] = 0.0;
@@ -69,7 +51,8 @@
   }
 }
 
-static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
+void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
+                            double output[kNumCoeffs]) {
   // First transform columns
   for (int i = 0; i < 32; ++i) {
     double temp_in[32], temp_out[32];
@@ -91,27 +74,165 @@
   }
 }
 
-TEST(VP9Idct32x32Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t in[1024], coeff[1024];
-    uint8_t dst[1024], src[1024];
-    double out_r[1024];
+typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
+typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);
 
-    for (int j = 0; j < 1024; ++j) {
+class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
+ public:
+  virtual ~Trans32x32Test() {}
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    version_  = GET_PARAM(2);  // 0: high precision forward transform
+                               // 1: low precision version for rd loop
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int version_;
+  fwd_txfm_t fwd_txfm_;
+  inv_txfm_t inv_txfm_;
+};
+
+TEST_P(Trans32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  uint32_t max_error = 0;
+  int64_t total_error = 0;
+  const int count_test_block = 1000;
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < kNumCoeffs; ++j) {
       src[j] = rnd.Rand8();
       dst[j] = rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
     }
+
+    const int pitch = 64;
+    REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
+    REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      const uint32_t diff = dst[j] - src[j];
+      const uint32_t error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  if (version_ == 1) {
+    max_error /= 2;
+    total_error /= 45;
+  }
+
+  EXPECT_GE(1u, max_error)
+      << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
+
+  EXPECT_GE(count_test_block, total_error)
+      << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
+}
+
+TEST_P(Trans32x32Test, CoeffCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    for (int j = 0; j < kNumCoeffs; ++j)
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));
+
+    if (version_ == 0) {
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: 32x32 FDCT versions have mismatched coefficients";
+    } else {
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+            << "Error: 32x32 FDCT rd has mismatched coefficients";
+    }
+  }
+}
+
+TEST_P(Trans32x32Test, MemCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 2000;
+
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
     // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j)
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_extreme_block[j] = 255;
+    if (i == 1)
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_extreme_block[j] = -255;
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
+    REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (version_ == 0) {
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: 32x32 FDCT versions have mismatched coefficients";
+      } else {
+        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+            << "Error: 32x32 FDCT rd has mismatched coefficients";
+      }
+      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
+          << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 32x32 FDCT has coefficient larger than "
+          << "4*DCT_MAX_VALUE";
+    }
+  }
+}
+
+TEST_P(Trans32x32Test, InverseAccuracy) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    double out_r[kNumCoeffs];
+
+    // Initialize a test block with input range [-255, 255]
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
       in[j] = src[j] - dst[j];
+    }
 
     reference_32x32_dct_2d(in, out_r);
-    for (int j = 0; j < 1024; j++)
+    for (int j = 0; j < kNumCoeffs; ++j)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct32x32_add_c(coeff, dst, 32);
-    for (int j = 0; j < 1024; ++j) {
+    REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+    for (int j = 0; j < kNumCoeffs; ++j) {
       const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
@@ -121,72 +242,21 @@
   }
 }
 
-TEST(VP9Fdct32x32Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  unsigned int max_error = 0;
-  int64_t total_error = 0;
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[1024];
-    int16_t test_temp_block[1024];
-    uint8_t dst[1024], src[1024];
+using std::tr1::make_tuple;
 
-    for (int j = 0; j < 1024; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j)
-      test_input_block[j] = src[j] - dst[j];
+INSTANTIATE_TEST_CASE_P(
+    C, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0),
+        make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1)));
 
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
-
-    for (int j = 0; j < 1024; ++j) {
-      const unsigned diff = dst[j] - src[j];
-      const unsigned error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
-    }
-  }
-
-  EXPECT_GE(1u, max_error)
-      << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
-
-  EXPECT_GE(count_test_block, total_error)
-      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block";
-}
-
-TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t input_block[1024], input_extreme_block[1024];
-    int16_t output_block[1024], output_extreme_block[1024];
-
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 1024; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-    }
-    if (i == 0)
-      for (int j = 0; j < 1024; ++j)
-        input_extreme_block[j] = 255;
-
-    const int pitch = 64;
-    vp9_short_fdct32x32_c(input_block, output_block, pitch);
-    vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
-
-    // The minimum quant value is 4.
-    for (int j = 0; j < 1024; ++j) {
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
-          << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
-          << "Error: 32x32 FDCT extreme has coefficient larger than "
-             "4*DCT_MAX_VALUE";
-    }
-  }
-}
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_short_fdct32x32_sse2,
+                   &vp9_short_idct32x32_add_sse2, 0),
+        make_tuple(&vp9_short_fdct32x32_rd_sse2,
+                   &vp9_short_idct32x32_add_sse2, 1)));
+#endif
 }  // namespace

diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 49e7384..055c45e 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h

@@ -12,7 +12,7 @@
 #define TEST_DECODE_TEST_DRIVER_H_
 #include <cstring>
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
 
 namespace libvpx_test {
@@ -36,9 +36,8 @@
 };
 
 // Provides a simplified interface to manage one video decoding.
-//
-// TODO: similar to Encoder class, the exact services should be
-// added as more tests are added.
+// Similar to Encoder class, the exact services should be added
+// as more tests are added.
 class Decoder {
  public:
   Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)

diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index df9e421..709831e 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc

@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/decode_test_driver.h"
@@ -114,19 +114,19 @@
   const unsigned int height_y = img1->d_h;
   unsigned int i;
   for (i = 0; i < height_y; ++i)
-    match = ( memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
-                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
-                     width_y) == 0) && match;
+    match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                    img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                    width_y) == 0) && match;
   const unsigned int width_uv  = (img1->d_w + 1) >> 1;
   const unsigned int height_uv = (img1->d_h + 1) >> 1;
   for (i = 0; i <  height_uv; ++i)
-    match = ( memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
-                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
-                     width_uv) == 0) && match;
+    match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                    img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                    width_uv) == 0) && match;
   for (i = 0; i < height_uv; ++i)
-    match = ( memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
-                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
-                     width_uv) == 0) && match;
+    match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                    img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                    width_uv) == 0) && match;
   return match;
 }
 

diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 39f915c..ea40ca6 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc

@@ -15,10 +15,10 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 }
 
-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 
@@ -156,7 +156,7 @@
     RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
 
     for (int j = 0; j < 16; ++j) {
-        if(test_temp_block[j] > 0) {
+        if (test_temp_block[j] > 0) {
           test_temp_block[j] += 2;
           test_temp_block[j] /= 4;
           test_temp_block[j] *= 4;

diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 81d242b..ee6c9f6 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc

@@ -13,14 +13,16 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
 #include "vpx_ports/mem.h"
 
 extern "C" {
-#include "vp9_rtcd.h"
-void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
+#include "./vp9_rtcd.h"
+void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch);
 }
 
-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
@@ -62,6 +64,7 @@
       inv_txfm = iht8x8_add;
     }
   }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
   void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
@@ -92,8 +95,9 @@
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 64; ++j)
       test_input_block[j] = rnd.Rand8() - rnd.Rand8();
-
-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_output_block,
+                   NULL, pitch, tx_type_));
 
     for (int j = 0; j < 64; ++j) {
       if (test_output_block[j] < 0)
@@ -121,8 +125,9 @@
     // Initialize a test block with input range [-15, 15].
     for (int j = 0; j < 64; ++j)
       test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
-
-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_output_block,
+                   NULL, pitch, tx_type_));
 
     for (int j = 0; j < 64; ++j) {
       if (test_output_block[j] < 0)
@@ -165,9 +170,11 @@
       test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 16;
-    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-    for (int j = 0; j < 64; ++j){
-        if(test_temp_block[j] > 0) {
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));
+    for (int j = 0; j < 64; ++j) {
+        if (test_temp_block[j] > 0) {
           test_temp_block[j] += 2;
           test_temp_block[j] /= 4;
           test_temp_block[j] *= 4;
@@ -177,7 +184,9 @@
           test_temp_block[j] *= 4;
         }
     }
-    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+    REGISTER_STATE_CHECK(
+        RunInvTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));
 
     for (int j = 0; j < 64; ++j) {
       const int diff = dst[j] - src[j];
@@ -216,8 +225,12 @@
       test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 16;
-    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+    REGISTER_STATE_CHECK(
+        RunFwdTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));
+    REGISTER_STATE_CHECK(
+        RunInvTxfm(test_input_block, test_temp_block,
+                   dst, pitch, tx_type_));
 
     for (int j = 0; j < 64; ++j) {
       const int diff = dst[j] - src[j];

diff --git a/test/i420_video_source.h b/test/i420_video_source.h
index bcbe8a7..2bf2a03 100644
--- a/test/i420_video_source.h
+++ b/test/i420_video_source.h

@@ -11,6 +11,7 @@
 #define TEST_I420_VIDEO_SOURCE_H_
 #include <cstdio>
 #include <cstdlib>
+#include <string>
 
 #include "test/video_source.h"
 
@@ -34,7 +35,6 @@
         height_(0),
         framerate_numerator_(rate_numerator),
         framerate_denominator_(rate_denominator) {
-
     // This initializes raw_sz_, width_, height_ and allocates an img.
     SetSize(width, height);
   }

diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index 67db78b..fc8129e 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc

@@ -15,10 +15,10 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 }
 
-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
@@ -27,10 +27,10 @@
 
 #ifdef _MSC_VER
 static int round(double x) {
-  if(x < 0)
-    return (int)ceil(x - 0.5);
+  if (x < 0)
+    return static_cast<int>(ceil(x - 0.5));
   else
-    return (int)floor(x + 0.5);
+    return static_cast<int>(floor(x + 0.5));
 }
 #endif
 

diff --git a/test/idct_test.cc b/test/idct_test.cc
index aa786cb..ea61e0b 100644
--- a/test/idct_test.cc
+++ b/test/idct_test.cc

@@ -16,7 +16,7 @@
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
+typedef void (*idct_fn_t)(int16_t *input, unsigned char *pred_ptr,
                           int pred_stride, unsigned char *dst_ptr,
                           int dst_stride);
 namespace {
@@ -34,7 +34,7 @@
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
   idct_fn_t UUT;
-  short input[16];
+  int16_t input[16];
   unsigned char output[256];
   unsigned char predict[256];
 };

diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index da96741..f5f6d5b 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc

@@ -15,8 +15,8 @@
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vpx_mem/vpx_mem.h"
 }
@@ -106,9 +106,9 @@
           for (int y = 0; y < block_size_; y++)
             sum += data_ptr_[p][y * stride_ - 1];
         expected = (sum + (1 << (shift - 1))) >> shift;
-      } else
+      } else {
         expected = 0x80;
-
+      }
       // check that all subsequent lines are equal to the first
       for (int y = 1; y < block_size_; ++y)
         ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_],

diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h
index 926f801..3fbafbd 100644
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h

@@ -28,7 +28,7 @@
 // so that we can do actual file decodes.
 class IVFVideoSource : public CompressedVideoSource {
  public:
-  IVFVideoSource(const std::string &file_name)
+  explicit IVFVideoSource(const std::string &file_name)
       : file_name_(file_name),
         input_file_(NULL),
         compressed_frame_buf_(NULL),

diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc
index f7572e8..7ee2898 100644
--- a/test/keyframe_test.cc
+++ b/test/keyframe_test.cc

@@ -132,7 +132,6 @@
   // Verify that keyframes match the file keyframes in the file.
   for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
        iter != kf_pts_list_.end(); ++iter) {
-
     if (deadline_ == VPX_DL_REALTIME && *iter > 0)
       EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
         << *iter;

diff --git a/test/md5_helper.h b/test/md5_helper.h
index f34054d..289f608 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h

@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef LIBVPX_TEST_MD5_HELPER_H_
-#define LIBVPX_TEST_MD5_HELPER_H_
+#ifndef TEST_MD5_HELPER_H_
+#define TEST_MD5_HELPER_H_
 
 extern "C" {
 #include "./md5_utils.h"
@@ -67,4 +67,4 @@
 
 }  // namespace libvpx_test
 
-#endif  // LIBVPX_TEST_MD5_HELPER_H_
+#endif  // TEST_MD5_HELPER_H_

diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 79896fe..e5ac9db 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc

@@ -11,8 +11,8 @@
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 }
@@ -63,7 +63,8 @@
   // Pointers to top-left pixel of block in the input and output images.
   uint8_t *const src_image_ptr = src_image + (input_stride << 1);
   uint8_t *const dst_image_ptr = dst_image + 8;
-  uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+  uint8_t *const flimits =
+      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
   (void)vpx_memset(flimits, 255, block_width);
 
   // Initialize pixels in the input:

diff --git a/test/register_state_check.h b/test/register_state_check.h
index fb3f53b..479a42d 100644
--- a/test/register_state_check.h
+++ b/test/register_state_check.h

@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_
-#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+#ifndef TEST_REGISTER_STATE_CHECK_H_
+#define TEST_REGISTER_STATE_CHECK_H_
 
 #ifdef _WIN64
 
@@ -92,4 +92,4 @@
 
 #endif  // _WIN64
 
-#endif  // LIBVPX_TEST_REGISTER_STATE_CHECK_H_
+#endif  // TEST_REGISTER_STATE_CHECK_H_

diff --git a/test/sad_test.cc b/test/sad_test.cc
index 23403e8..453b3a8 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc

@@ -17,7 +17,6 @@
 #include "./vpx_config.h"
 #if CONFIG_VP8_ENCODER
 #include "./vp8_rtcd.h"
-//#include "vp8/common/blockd.h"
 #endif
 #if CONFIG_VP9_ENCODER
 #include "./vp9_rtcd.h"

diff --git a/test/set_roi.cc b/test/set_roi.cc
index 3b6112e..9d2e771 100644
--- a/test/set_roi.cc
+++ b/test/set_roi.cc

@@ -17,15 +17,19 @@
 #include <sys/types.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 extern "C" {
 #include "vp8/encoder/onyx_int.h"
 }
 
+using libvpx_test::ACMRandom;
+
 namespace {
 
 TEST(Vp8RoiMapTest, ParameterCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
   int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
   int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
   unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 };
@@ -121,10 +125,10 @@
     for (int i = 0; i < 1000; ++i) {
       int rand_deltas[4];
       int deltas_valid;
-      rand_deltas[0] = (rand() % 160) - 80;
-      rand_deltas[1] = (rand() % 160) - 80;
-      rand_deltas[2] = (rand() % 160) - 80;
-      rand_deltas[3] = (rand() % 160) - 80;
+      rand_deltas[0] = rnd(160) - 80;
+      rand_deltas[1] = rnd(160) - 80;
+      rand_deltas[2] = rnd(160) - 80;
+      rand_deltas[3] = rnd(160) - 80;
 
       deltas_valid = ((abs(rand_deltas[0]) <= 63) &&
                       (abs(rand_deltas[1]) <= 63) &&

diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 574bfbf..d1f2729 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc

@@ -13,8 +13,8 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 #include "vp8/encoder/block.h"
 #include "vpx_mem/vpx_mem.h"
@@ -51,7 +51,7 @@
   bd.predictor = reinterpret_cast<unsigned char*>(
       vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));
 
-  for(int i = 0; kSrcStride[i] > 0; ++i) {
+  for (int i = 0; kSrcStride[i] > 0; ++i) {
     // start at block0
     be.src = 0;
     be.base_src = &source;

diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 579d7e2..370ffc1 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1

@@ -522,3 +522,5 @@
 94ad19b8b699cea105e2ff18f0df2afd7242bcf7  vp90-2-03-size-226x226.webm.md5
 495256cfd123fe777b2c0406862ed8468a1f4677  vp91-2-04-yv444.webm
 65e3a7ffef61ab340d9140f335ecc49125970c2c  vp91-2-04-yv444.webm.md5
+b6524e4084d15b5d0caaa3d3d1368db30cbee69c  vp90-2-03-deltaq.webm
+65f45ec9a55537aac76104818278e0978f94a678  vp90-2-03-deltaq.webm.md5

diff --git a/test/test.mk b/test/test.mk
index 2042c86..4eb599d 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -629,5 +629,7 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5

diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc
index 5610c26..a4dbca4 100644
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc

@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include <string>
-#include "vpx_config.h"
+#include "./vpx_config.h"
 extern "C" {
 #if ARCH_X86 || ARCH_X86_64
 #include "vpx_ports/x86.h"
@@ -48,7 +48,9 @@
 #endif
 
 #if !CONFIG_SHARED
-  /* Shared library builds don't support whitebox tests that exercise internal symbols. */
+// Shared library builds don't support whitebox tests
+// that exercise internal symbols.
+
 #if CONFIG_VP8
   vp8_rtcd();
 #endif

diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 4cd356d..9bd03b9 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc

@@ -159,7 +159,7 @@
   "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
   "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
   "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
-  "vp90-2-03-size-226x226.webm",
+  "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
 #if CONFIG_NON420
   "vp91-2-04-yv444.webm"
 #endif

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 4a3bf5c..ca53ffb 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc

@@ -16,16 +16,16 @@
 #include "test/register_state_check.h"
 
 #include "vpx/vpx_integer.h"
-#include "vpx_config.h"
+#include "./vpx_config.h"
 extern "C" {
 #include "vpx_mem/vpx_mem.h"
 #if CONFIG_VP8_ENCODER
 # include "vp8/common/variance.h"
-# include "vp8_rtcd.h"
+# include "./vp8_rtcd.h"
 #endif
 #if CONFIG_VP9_ENCODER
 # include "vp9/encoder/vp9_variance.h"
-# include "vp9_rtcd.h"
+# include "./vp9_rtcd.h"
 #endif
 }
 #include "test/acm_random.h"
@@ -107,8 +107,8 @@
 }
 
 template<typename VarianceFunctionType>
-class VarianceTest :
-    public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
+class VarianceTest
+    : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
  public:
   virtual void SetUp() {
     const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
@@ -191,9 +191,9 @@
 }
 
 template<typename SubpelVarianceFunctionType>
-class SubpelVarianceTest :
-    public ::testing::TestWithParam<tuple<int, int,
-                                          SubpelVarianceFunctionType> > {
+class SubpelVarianceTest
+    : public ::testing::TestWithParam<tuple<int, int,
+                                            SubpelVarianceFunctionType> > {
  public:
   virtual void SetUp() {
     const tuple<int, int, SubpelVarianceFunctionType>& params =

diff --git a/test/vp8_boolcoder_test.cc b/test/vp8_boolcoder_test.cc
index c3a8d12..0383af2 100644
--- a/test/vp8_boolcoder_test.cc
+++ b/test/vp8_boolcoder_test.cc

@@ -8,10 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-extern "C" {
-#include "vp8/encoder/boolhuff.h"
-#include "vp8/decoder/dboolhuff.h"
-}
 
 #include <math.h>
 #include <stddef.h>
@@ -24,6 +20,11 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_integer.h"
 
+extern "C" {
+#include "vp8/encoder/boolhuff.h"
+#include "vp8/decoder/dboolhuff.h"
+}
+
 namespace {
 const int num_tests = 10;
 
@@ -44,7 +45,7 @@
 
 void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                            uint8_t *output, int count) {
-  int offset = input - (uint8_t *)decrypt_state;
+  int offset = input - reinterpret_cast<uint8_t *>(decrypt_state);
   for (int i = 0; i < count; i++) {
     output[i] = input[i] ^ secret_key[(offset + i) & 15];
   }
@@ -58,10 +59,10 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   for (int n = 0; n < num_tests; ++n) {
     for (int method = 0; method <= 7; ++method) {   // we generate various proba
-      const int bits_to_test = 1000;
-      uint8_t probas[bits_to_test];
+      const int kBitsToTest = 1000;
+      uint8_t probas[kBitsToTest];
 
-      for (int i = 0; i < bits_to_test; ++i) {
+      for (int i = 0; i < kBitsToTest; ++i) {
         const int parity = i & 1;
         probas[i] =
             (method == 0) ? 0 : (method == 1) ? 255 :
@@ -76,14 +77,14 @@
       }
       for (int bit_method = 0; bit_method <= 3; ++bit_method) {
         const int random_seed = 6432;
-        const int buffer_size = 10000;
+        const int kBufferSize = 10000;
         ACMRandom bit_rnd(random_seed);
         BOOL_CODER bw;
-        uint8_t bw_buffer[buffer_size];
-        vp8_start_encode(&bw, bw_buffer, bw_buffer + buffer_size);
+        uint8_t bw_buffer[kBufferSize];
+        vp8_start_encode(&bw, bw_buffer, bw_buffer + kBufferSize);
 
         int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
           if (bit_method == 2) {
             bit = (i & 1);
           } else if (bit_method == 3) {
@@ -98,19 +99,20 @@
 #if CONFIG_DECRYPT
         encrypt_buffer(bw_buffer, buffer_size);
         vp8dx_start_decode(&br, bw_buffer, buffer_size,
-                           test_decrypt_cb, (void *)bw_buffer);
+                           test_decrypt_cb,
+                           reinterpret_cast<void *>(bw_buffer));
 #else
-        vp8dx_start_decode(&br, bw_buffer, buffer_size, NULL, NULL);
+        vp8dx_start_decode(&br, bw_buffer, kBufferSize, NULL, NULL);
 #endif
         bit_rnd.Reset(random_seed);
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
           if (bit_method == 2) {
             bit = (i & 1);
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
           GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit)
-              << "pos: "<< i << " / " << bits_to_test
+              << "pos: "<< i << " / " << kBitsToTest
               << " bit_method: " << bit_method
               << " method: " << method;
         }

diff --git a/test/vp8_decrypt_test.cc b/test/vp8_decrypt_test.cc
index d850f00..b092509 100644
--- a/test/vp8_decrypt_test.cc
+++ b/test/vp8_decrypt_test.cc

@@ -26,7 +26,8 @@
   0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
 };
 
-void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0) {
+void encrypt_buffer(const uint8_t *src, uint8_t *dst,
+                    int size, int offset = 0) {
   for (int i = 0; i < size; ++i) {
     dst[i] = src[i] ^ test_key[(offset + i) & 15];
   }
@@ -34,10 +35,11 @@
 
 void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                      uint8_t *output, int count) {
-  encrypt_buffer(input, output, count, input - (uint8_t *)decrypt_state);
+  encrypt_buffer(input, output, count,
+                 input - reinterpret_cast<uint8_t *>(decrypt_state));
 }
 
-} // namespace
+}  // namespace
 
 namespace libvpx_test {
 

diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc
index 3c60011..c823436 100644
--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc

@@ -18,7 +18,7 @@
 
 
 extern "C" {
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
 }
 
 #include "test/acm_random.h"

diff --git a/test/vp9_boolcoder_test.cc b/test/vp9_boolcoder_test.cc
index 42b2229..5edde90 100644
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc

@@ -19,7 +19,7 @@
 #include "vp9/decoder/vp9_dboolhuff.h"
 }
 
-#include "acm_random.h"
+#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
@@ -32,10 +32,10 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   for (int n = 0; n < num_tests; ++n) {
     for (int method = 0; method <= 7; ++method) {   // we generate various proba
-      const int bits_to_test = 1000;
-      uint8_t probas[bits_to_test];
+      const int kBitsToTest = 1000;
+      uint8_t probas[kBitsToTest];
 
-      for (int i = 0; i < bits_to_test; ++i) {
+      for (int i = 0; i < kBitsToTest; ++i) {
         const int parity = i & 1;
         probas[i] =
           (method == 0) ? 0 : (method == 1) ? 255 :
@@ -50,14 +50,14 @@
       }
       for (int bit_method = 0; bit_method <= 3; ++bit_method) {
         const int random_seed = 6432;
-        const int buffer_size = 10000;
+        const int kBufferSize = 10000;
         ACMRandom bit_rnd(random_seed);
         vp9_writer bw;
-        uint8_t bw_buffer[buffer_size];
+        uint8_t bw_buffer[kBufferSize];
         vp9_start_encode(&bw, bw_buffer);
 
         int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
           if (bit_method == 2) {
             bit = (i & 1);
           } else if (bit_method == 3) {
@@ -72,16 +72,16 @@
         GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);
 
         vp9_reader br;
-        vp9_reader_init(&br, bw_buffer, buffer_size);
+        vp9_reader_init(&br, bw_buffer, kBufferSize);
         bit_rnd.Reset(random_seed);
-        for (int i = 0; i < bits_to_test; ++i) {
+        for (int i = 0; i < kBitsToTest; ++i) {
           if (bit_method == 2) {
             bit = (i & 1);
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
           GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
-              << "pos: " << i << " / " << bits_to_test
+              << "pos: " << i << " / " << kBitsToTest
               << " bit_method: " << bit_method
               << " method: " << method;
         }

diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 766b4ea..30c4cbb 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h

@@ -41,7 +41,8 @@
     {
         USAGE_STREAM_FROM_SERVER    = 0x0,
         USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
-        USAGE_CONSTRAINED_QUALITY   = 0x2
+        USAGE_CONSTRAINED_QUALITY   = 0x2,
+        USAGE_CONSTANT_QUALITY      = 0x3
     } END_USAGE;
 
 

diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 95baf9b..19e9d270 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c

@@ -153,7 +153,7 @@
 #else
     RANGE_CHECK_HI(cfg, g_lag_in_frames,    25);
 #endif
-    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
+    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_Q);
     RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
     RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
     RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
@@ -204,7 +204,7 @@
     RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
     RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
     RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
-    if(finalize && cfg->rc_end_usage == VPX_CQ)
+    if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q))
         RANGE_CHECK(vp8_cfg, cq_level,
                     cfg->rc_min_quantizer, cfg->rc_max_quantizer);
 
@@ -327,17 +327,14 @@
     oxcf->resample_up_water_mark   = cfg.rc_resize_up_thresh;
     oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh;
 
-    if (cfg.rc_end_usage == VPX_VBR)
-    {
-        oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
-    }
-    else if (cfg.rc_end_usage == VPX_CBR)
-    {
-        oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
-    }
-    else if (cfg.rc_end_usage == VPX_CQ)
-    {
-        oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+    if (cfg.rc_end_usage == VPX_VBR) {
+      oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
+    } else if (cfg.rc_end_usage == VPX_CBR) {
+      oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+    } else if (cfg.rc_end_usage == VPX_CQ) {
+      oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+    } else if (cfg.rc_end_usage == VPX_Q) {
+      oxcf->end_usage = USAGE_CONSTANT_QUALITY;
     }
 
     oxcf->target_bandwidth         = cfg.rc_target_bitrate;

diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
index 02978ea..a744f59 100644
--- a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm

@@ -25,151 +25,149 @@
     ; stage 1
     vdup.16         d0, r3                    ; duplicate cospi_28_64
     vdup.16         d1, r4                    ; duplicate cospi_4_64
+    vdup.16         d2, r5                    ; duplicate cospi_12_64
+    vdup.16         d3, r6                    ; duplicate cospi_20_64
 
     ; input[1] * cospi_28_64
     vmull.s16       q2, d18, d0
     vmull.s16       q3, d19, d0
 
+    ; input[5] * cospi_12_64
+    vmull.s16       q5, d26, d2
+    vmull.s16       q6, d27, d2
+
     ; input[1]*cospi_28_64-input[7]*cospi_4_64
     vmlsl.s16       q2, d30, d1
     vmlsl.s16       q3, d31, d1
 
+    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+    vmlsl.s16       q5, d22, d3
+    vmlsl.s16       q6, d23, d3
+
     ; dct_const_round_shift(input_dc * cospi_16_64)
     vqrshrn.s32     d8, q2, #14               ; >> 14
     vqrshrn.s32     d9, q3, #14               ; >> 14
 
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14
+
     ; input[1] * cospi_4_64
     vmull.s16       q2, d18, d1
     vmull.s16       q3, d19, d1
 
+    ; input[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q13, d27, d3
+
     ; input[1]*cospi_4_64+input[7]*cospi_28_64
     vmlal.s16       q2, d30, d0
     vmlal.s16       q3, d31, d0
 
+    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q13, d23, d2
+
     ; dct_const_round_shift(input_dc * cospi_16_64)
     vqrshrn.s32     d14, q2, #14              ; >> 14
     vqrshrn.s32     d15, q3, #14              ; >> 14
 
-    vdup.16         d0, r5                    ; duplicate cospi_12_64
-    vdup.16         d1, r6                    ; duplicate cospi_20_64
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q2, d26, d0
-    vmull.s16       q3, d27, d0
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q2, d22, d1
-    vmlsl.s16       q3, d23, d1
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q2, #14              ; >> 14
-    vqrshrn.s32     d11, q3, #14              ; >> 14
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q2, d26, d1
-    vmull.s16       q3, d27, d1
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vmlal.s16       q2, d22, d0
-    vmlal.s16       q3, d23, d0
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q2, #14              ; >> 14
-    vqrshrn.s32     d13, q3, #14              ; >> 14
-
     ; stage 2 & stage 3 - even half
     vdup.16         d0, r7                    ; duplicate cospi_16_64
 
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q13, #14              ; >> 14
+
     ; input[0] * cospi_16_64
     vmull.s16       q2, d16, d0
     vmull.s16       q3, d17, d0
 
+    ; input[0] * cospi_16_64
+    vmull.s16       q13, d16, d0
+    vmull.s16       q15, d17, d0
+
     ; (input[0] + input[2]) * cospi_16_64
     vmlal.s16       q2,  d24, d0
     vmlal.s16       q3, d25, d0
 
+    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16       q13, d24, d0
+    vmlsl.s16       q15, d25, d0
+
+    vdup.16         d0, r8                    ; duplicate cospi_24_64
+    vdup.16         d1, r9                    ; duplicate cospi_8_64
+
     ; dct_const_round_shift(input_dc * cospi_16_64)
     vqrshrn.s32     d18, q2, #14              ; >> 14
     vqrshrn.s32     d19, q3, #14              ; >> 14
 
-    ; input[0] * cospi_16_64
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16       q2, d24, d0
-    vmlsl.s16       q3, d25, d0
-
     ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d22, q2, #14              ; >> 14
-    vqrshrn.s32     d23, q3, #14              ; >> 14
+    vqrshrn.s32     d22, q13, #14              ; >> 14
+    vqrshrn.s32     d23, q15, #14              ; >> 14
 
     ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vdup.16         d0, r8                    ; duplicate cospi_24_64
-    vdup.16         d1, r9                    ; duplicate cospi_8_64
-
     ; input[1] * cospi_24_64
     vmull.s16       q2, d20, d0
     vmull.s16       q3, d21, d0
 
+    ; input[1] * cospi_8_64
+    vmull.s16       q8, d20, d1
+    vmull.s16       q12, d21, d1
+
     ; input[1] * cospi_24_64 - input[3] * cospi_8_64
     vmlsl.s16       q2, d28, d1
     vmlsl.s16       q3, d29, d1
 
+    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+    vmlal.s16       q8, d28, d0
+    vmlal.s16       q12, d29, d0
+
     ; dct_const_round_shift(input_dc * cospi_16_64)
     vqrshrn.s32     d26, q2, #14              ; >> 14
     vqrshrn.s32     d27, q3, #14              ; >> 14
 
-    ; input[1] * cospi_8_64
-    vmull.s16       q2, d20, d1
-    vmull.s16       q3, d21, d1
-
-    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q2, d28, d0
-    vmlal.s16       q3, d29, d0
-
     ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d30, q2, #14              ; >> 14
-    vqrshrn.s32     d31, q3, #14              ; >> 14
-
+    vqrshrn.s32     d30, q8, #14              ; >> 14
+    vqrshrn.s32     d31, q12, #14              ; >> 14
 
     vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
     vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
     vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
     vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
 
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
     ; stage 2 - odd half
     vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
     vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
     vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
     vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
 
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
     ; step2[6] * cospi_16_64
     vmull.s16       q9, d28, d16
     vmull.s16       q10, d29, d16
 
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
     ; (step2[6] - step2[5]) * cospi_16_64
     vmlsl.s16       q9, d26, d16
     vmlsl.s16       q10, d27, d16
 
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
     ; dct_const_round_shift(input_dc * cospi_16_64)
     vqrshrn.s32     d10, q9, #14              ; >> 14
     vqrshrn.s32     d11, q10, #14             ; >> 14
 
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q9, d26, d16
-    vmlal.s16       q10, d27, d16
-
     ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q10, #14             ; >> 14
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
 
     ; stage 4
     vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
@@ -424,25 +422,25 @@
     vmull.s16       q9, d28, d16
     vmull.s16       q10, d29, d16
 
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
     ; (step2[6] - step2[5]) * cospi_16_64
     vmlsl.s16       q9, d26, d16
     vmlsl.s16       q10, d27, d16
 
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
     ; dct_const_round_shift(input_dc * cospi_16_64)
     vqrshrn.s32     d10, q9, #14              ; >> 14
     vqrshrn.s32     d11, q10, #14             ; >> 14
 
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q9, d26, d16
-    vmlal.s16       q10, d27, d16
-
     ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q10, #14             ; >> 14
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
 
     ; stage 4
     vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];

diff --git a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
new file mode 100644
index 0000000..963ef35
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm

@@ -0,0 +1,237 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_short_iht4x4_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
+    ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
+    ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
+    ; into d16-d19 registers. This macro will touch q10- q15 registers and use
+    ; them as buffer during calculation.
+    MACRO
+    IDCT4x4_1D
+    ; stage 1
+    vadd.s16    d23, d16, d18   ; (input[0] + input[2])
+    vsub.s16    d24, d16, d18   ; (input[0] - input[2])
+
+    vmull.s16   q15, d17, d2    ; input[1] * cospi_24_64
+    vmull.s16   q10, d17, d0    ; input[1] * cospi_8_64
+    vmull.s16   q13, d23, d1    ; (input[0] + input[2]) * cospi_16_64
+    vmull.s16   q14, d24, d1    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16   q15, d19, d0    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlal.s16   q10, d19, d2    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q10, #14
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16    q8,  q13, q14
+    vsub.s16    q9,  q13, q14
+    vswp        d18, d19
+    MEND
+
+    ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
+    ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
+    ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
+    ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
+    ; q14,q15 registers and use them as buffer during calculation.
+    MACRO
+    IADST4x4_1D
+    vmull.s16   q10, d3, d16    ; s0 = sinpi_1_9 * x0
+    vmull.s16   q11, d4, d16    ; s1 = sinpi_2_9 * x0
+    vmull.s16   q12, d6, d17    ; s2 = sinpi_3_9 * x1
+    vmull.s16   q13, d5, d18    ; s3 = sinpi_4_9 * x2
+    vmull.s16   q14, d3, d18    ; s4 = sinpi_1_9 * x2
+    vmovl.s16   q15, d16        ; expand x0 from 16 bit to 32 bit
+    vaddw.s16   q15, q15, d19   ; x0 + x3
+    vmull.s16   q8, d4, d19     ; s5 = sinpi_2_9 * x3
+    vsubw.s16   q15, q15, d18   ; s7 = x0 + x3 - x2
+    vmull.s16   q9, d5, d19     ; s6 = sinpi_4_9 * x3
+
+    vadd.s32    q10, q10, q13   ; x0 = s0 + s3 + s5
+    vadd.s32    q10, q10, q8
+    vsub.s32    q11, q11, q14   ; x1 = s1 - s4 - s6
+    vdup.32     q8, r0          ; duplicate sinpi_3_9
+    vsub.s32    q11, q11, q9
+    vmul.s32    q15, q15, q8    ; x2 = sinpi_3_9 * s7
+
+    vadd.s32    q13, q10, q12   ; s0 = x0 + x3
+    vadd.s32    q10, q10, q11   ; x0 + x1
+    vadd.s32    q14, q11, q12   ; s1 = x1 + x3
+    vsub.s32    q10, q10, q12   ; s3 = x0 + x1 - x3
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d16, q13, #14
+    vqrshrn.s32 d17, q14, #14
+    vqrshrn.s32 d18, q15, #14
+    vqrshrn.s32 d19, q10, #14
+    MEND
+
+    ; Generate cosine constants in d6 - d8 for the IDCT
+    MACRO
+    GENERATE_COSINE_CONSTANTS
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov         r0, #0x3b00
+    add         r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov         r3, #0x2d00
+    add         r3, #0x41
+    ; cospi_24_64 = 6270 = 0x187e
+    mov         r12, #0x1800
+    add         r12, #0x7e
+
+    ; generate constant vectors
+    vdup.16     d0, r0          ; duplicate cospi_8_64
+    vdup.16     d1, r3          ; duplicate cospi_16_64
+    vdup.16     d2, r12         ; duplicate cospi_24_64
+    MEND
+
+    ; Generate sine constants in d1 - d4 for the IADST.
+    MACRO
+    GENERATE_SINE_CONSTANTS
+    ; sinpi_1_9 = 5283 = 0x14A3
+    mov         r0, #0x1400
+    add         r0, #0xa3
+    ; sinpi_2_9 = 9929 = 0x26C9
+    mov         r3, #0x2600
+    add         r3, #0xc9
+    ; sinpi_4_9 = 15212 = 0x3B6C
+    mov         r12, #0x3b00
+    add         r12, #0x6c
+
+    ; generate constant vectors
+    vdup.16     d3, r0          ; duplicate sinpi_1_9
+
+    ; sinpi_3_9 = 13377 = 0x3441
+    mov         r0, #0x3400
+    add         r0, #0x41
+
+    vdup.16     d4, r3          ; duplicate sinpi_2_9
+    vdup.16     d5, r12         ; duplicate sinpi_4_9
+    vdup.16     q3, r0          ; duplicate sinpi_3_9
+    MEND
+
+    ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
+    MACRO
+    TRANSPOSE4X4
+    vtrn.16     d16, d17
+    vtrn.16     d18, d19
+    vtrn.32     q8, q9
+    MEND
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
+;                               int dest_stride, int tx_type)
+;
+; r0  int16_t input
+; r1  uint8_t *dest
+; r2  int dest_stride
+; r3  int tx_type)
+; This function will only handle tx_type of 1,2,3.
+|vp9_short_iht4x4_add_neon| PROC
+
+    ; load the inputs into d16-d19
+    vld1.s16    {q8,q9}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE4X4
+
+    ; decide the type of transform
+    cmp         r3, #2
+    beq         idct_iadst
+    cmp         r3, #3
+    beq         iadst_iadst
+
+iadst_idct
+    ; generate constants
+    GENERATE_COSINE_CONSTANTS
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IDCT4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IADST4x4_1D
+
+    b end_vp9_short_iht4x4_add_neon
+
+idct_iadst
+    ; generate constants
+    GENERATE_COSINE_CONSTANTS
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IADST4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IDCT4x4_1D
+
+    b end_vp9_short_iht4x4_add_neon
+
+iadst_iadst
+    ; generate constants
+    GENERATE_SINE_CONSTANTS
+
+    ; first transform rows
+    IADST4x4_1D
+
+    ; transpose the matrix
+    TRANSPOSE4X4
+
+    ; then transform columns
+    IADST4x4_1D
+
+end_vp9_short_iht4x4_add_neon
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+    vrshr.s16   q8, q8, #4
+    vrshr.s16   q9, q9, #4
+
+    vld1.32     {d26[0]}, [r1], r2
+    vld1.32     {d26[1]}, [r1], r2
+    vld1.32     {d27[0]}, [r1], r2
+    vld1.32     {d27[1]}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+    vaddw.u8    q8, q8, d26
+    vaddw.u8    q9, q9, d27
+
+    ; clip_pixel
+    vqmovun.s16 d26, q8
+    vqmovun.s16 d27, q9
+
+    ; do the stores in reverse order with negative post-increment, by changing
+    ; the sign of the stride
+    rsb         r2, r2, #0
+    vst1.32     {d27[1]}, [r1], r2
+    vst1.32     {d27[0]}, [r1], r2
+    vst1.32     {d26[1]}, [r1], r2
+    vst1.32     {d26[0]}, [r1]  ; no post-increment
+    bx          lr
+    ENDP  ; |vp9_short_iht4x4_add_neon|
+
+    END

diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 699b44a..f138c09 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h

@@ -341,7 +341,7 @@
                                ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                                const int16_t **scan,
                                const uint8_t **band_translate) {
-  ENTROPY_CONTEXT above_ec, left_ec;
+  ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
 
   switch (tx_size) {
     case TX_4X4:

diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 6cfc346..2e973e5 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c

@@ -79,20 +79,59 @@
 
 #define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
 
+static const uint8_t log_in_base_2[] = {
+  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
+};
+
 MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
   MV_CLASS_TYPE c = MV_CLASS_0;
-  if      (z < CLASS0_SIZE * 8)    c = MV_CLASS_0;
-  else if (z < CLASS0_SIZE * 16)   c = MV_CLASS_1;
-  else if (z < CLASS0_SIZE * 32)   c = MV_CLASS_2;
-  else if (z < CLASS0_SIZE * 64)   c = MV_CLASS_3;
-  else if (z < CLASS0_SIZE * 128)  c = MV_CLASS_4;
-  else if (z < CLASS0_SIZE * 256)  c = MV_CLASS_5;
-  else if (z < CLASS0_SIZE * 512)  c = MV_CLASS_6;
-  else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
-  else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8;
-  else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9;
-  else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10;
-  else assert(0);
+  if (z >= CLASS0_SIZE * 4096)
+    c = MV_CLASS_10;
+  else
+    c = log_in_base_2[z >> 3];
+
   if (offset)
     *offset = z - mv_class_base(c);
   return c;
@@ -110,8 +149,6 @@
 static void inc_mv_component(int v, nmv_component_counts *comp_counts,
                              int incr, int usehp) {
   int s, z, c, o, d, e, f;
-  if (!incr)
-    return;
   assert (v != 0);            /* should not be zero */
   s = v < 0;
   comp_counts->sign[s] += incr;
@@ -123,61 +160,39 @@
   d = (o >> 3);               /* int mv data */
   f = (o >> 1) & 3;           /* fractional pel mv data */
   e = (o & 1);                /* high precision mv data */
+
   if (c == MV_CLASS_0) {
     comp_counts->class0[d] += incr;
+    comp_counts->class0_fp[d][f] += incr;
+    comp_counts->class0_hp[e] += usehp * incr;
   } else {
     int i;
     int b = c + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < b; ++i)
       comp_counts->bits[i][((d >> i) & 1)] += incr;
-  }
-
-  /* Code the fractional pel bits */
-  if (c == MV_CLASS_0) {
-    comp_counts->class0_fp[d][f] += incr;
-  } else {
     comp_counts->fp[f] += incr;
-  }
-
-  /* Code the high precision bit */
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      comp_counts->class0_hp[e] += incr;
-    } else {
-      comp_counts->hp[e] += incr;
-    }
+    comp_counts->hp[e] += usehp * incr;
   }
 }
 
-static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
-  int v;
-  vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
-  for (v = 1; v <= MV_MAX; v++) {
-    inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
-    inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
-  }
-}
 
 void vp9_inc_mv(const MV *mv,  nmv_context_counts *counts) {
   const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
   ++counts->joints[j];
 
-  if (mv_joint_vertical(j))
-    ++counts->comps[0].mvcount[MV_MAX + mv->row];
+  if (mv_joint_vertical(j)) {
+    inc_mv_component(mv->row, &counts->comps[0], 1, 1);
+  }
 
-  if (mv_joint_horizontal(j))
-    ++counts->comps[1].mvcount[MV_MAX + mv->col];
+  if (mv_joint_horizontal(j)) {
+    inc_mv_component(mv->col, &counts->comps[1], 1, 1);
+  }
 }
 
 static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
   return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
 }
 
-void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
-  counts_to_context(&nmv_count->comps[0], usehp);
-  counts_to_context(&nmv_count->comps[1], usehp);
-}
-
 static unsigned int adapt_probs(unsigned int i,
                                 vp9_tree tree,
                                 vp9_prob this_probs[],
@@ -207,8 +222,6 @@
   nmv_context *pre_ctx = &pre_fc->nmvc;
   nmv_context_counts *cts = &cm->counts.mv;
 
-  vp9_counts_process(cts, allow_hp);
-
   adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
 
   for (i = 0; i < 2; ++i) {

diff --git a/vp9/common/vp9_extend.c b/vp9/common/vp9_extend.c
index d8496c4..07c68c8 100644
--- a/vp9/common/vp9_extend.c
+++ b/vp9/common/vp9_extend.c

@@ -57,15 +57,23 @@
 
 void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                YV12_BUFFER_CONFIG *dst) {
-  const int et_y = dst->border;
-  const int el_y = dst->border;
-  const int eb_y = dst->border + dst->y_height - src->y_height;
-  const int er_y = dst->border + dst->y_width - src->y_width;
-
-  const int et_uv = dst->border >> (dst->uv_height != dst->y_height);
-  const int el_uv = dst->border >> (dst->uv_width != dst->y_width);
-  const int eb_uv = et_uv + dst->uv_height - src->uv_height;
-  const int er_uv = el_uv + dst->uv_width - src->uv_width;
+  // Extend src frame in buffer
+  // Altref filtering assumes 16 pixel extension
+  const int et_y = 16;
+  const int el_y = 16;
+  // Motion estimation may use src block variance with the block size up
+  // to 64x64, so the right and bottom need to be extended to 64 mulitple
+  // or up to 16, whichever is greater.
+  const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width,
+                       16);
+  const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height,
+                       16);
+  const int uv_width_subsampling = (src->uv_width != src->y_width);
+  const int uv_height_subsampling = (src->uv_height != src->y_height);
+  const int et_uv = et_y >> uv_height_subsampling;
+  const int el_uv = el_y >> uv_width_subsampling;
+  const int eb_uv = eb_y >> uv_height_subsampling;
+  const int er_uv = er_y >> uv_width_subsampling;
 
 #if CONFIG_ALPHA
   const int et_a = dst->border >> (dst->alpha_height != dst->y_height);

diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index cfa61c2..0d883ab 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c

@@ -603,11 +603,15 @@
     case BLOCK_64X32:
       build_masks(lfi_n, mip, 0, 0, lfm);
       mip2 = mip + mode_info_stride * 4;
+      if (4 >= max_rows)
+        break;
       build_masks(lfi_n, mip2 , 32, 8, lfm);
       break;
     case BLOCK_32X64:
       build_masks(lfi_n, mip, 0, 0, lfm);
       mip2 = mip + 4;
+      if (4 >= max_cols)
+        break;
       build_masks(lfi_n, mip2, 4, 2, lfm);
       break;
     default:
@@ -624,11 +628,15 @@
             break;
           case BLOCK_32X16:
             build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+            if (mi_32_row_offset + 2 >= max_rows)
+              continue;
             mip2 = mip + mode_info_stride * 2;
             build_masks(lfi_n, mip2, shift_y + 16, shift_uv + 4, lfm);
             break;
           case BLOCK_16X32:
             build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+            if (mi_32_col_offset + 2 >= max_cols)
+              continue;
             mip2 = mip + 2;
             build_masks(lfi_n, mip2, shift_y + 2, shift_uv + 1, lfm);
             break;
@@ -650,11 +658,15 @@
                   break;
                 case BLOCK_16X8:
                   build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+                  if (mi_16_row_offset + 1 >= max_rows)
+                    continue;
                   mip2 = mip + mode_info_stride;
                   build_y_mask(lfi_n, mip2, shift_y+8, lfm);
                   break;
                 case BLOCK_8X16:
                   build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+                  if (mi_16_col_offset +1 >= max_cols)
+                    continue;
                   mip2 = mip + 1;
                   build_y_mask(lfi_n, mip2, shift_y+1, lfm);
                   break;
@@ -777,6 +789,7 @@
     }
   }
 }
+#if CONFIG_NON420
 static void filter_block_plane_non420(VP9_COMMON *cm,
                                       struct macroblockd_plane *plane,
                                       const MODE_INFO *mi,
@@ -896,6 +909,7 @@
     dst->buf += 8 * dst->stride;
   }
 }
+#endif
 
 static void filter_block_plane(VP9_COMMON *const cm,
                                struct macroblockd_plane *const plane,
@@ -981,8 +995,10 @@
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
   LOOP_FILTER_MASK lfm;
+#if CONFIG_NON420
   int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
       xd->plane[1].subsampling_x == 1);
+#endif
 
   for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
     MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
@@ -993,16 +1009,22 @@
       setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
+#if CONFIG_NON420
       if (use_420)
+#endif
         setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mode_info_stride, &lfm);
 
       for (plane = 0; plane < num_planes; ++plane) {
+#if CONFIG_NON420
         if (use_420)
+#endif
           filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col,
                              &lfm);
+#if CONFIG_NON420
         else
           filter_block_plane_non420(cm, &xd->plane[plane], mi + mi_col,
                                     mi_row, mi_col);
+#endif
       }
     }
   }

diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index b237f9e..f424e6a 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h

@@ -46,7 +46,8 @@
   typedef enum {
     USAGE_STREAM_FROM_SERVER    = 0x0,
     USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
-    USAGE_CONSTRAINED_QUALITY   = 0x2
+    USAGE_CONSTRAINED_QUALITY   = 0x2,
+    USAGE_CONSTANT_QUALITY      = 0x3,
   } END_USAGE;
 
 

diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index a669cc5..f0bc063 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h

@@ -20,7 +20,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
 #include "vp9/common/vp9_postproc.h"
 #endif
 
@@ -201,7 +201,7 @@
   unsigned int current_video_frame;
   int version;
 
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
   struct postproc_state  postproc_state;
 #endif
 

diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 48d3d2d..955e676 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c

@@ -1011,7 +1011,8 @@
   /* handle problem with extending borders */
   dest->y_width = cm->width;
   dest->y_height = cm->height;
-  dest->uv_height = dest->y_height / 2;
+  dest->uv_width = dest->y_width >> cm->subsampling_x;
+  dest->uv_height = dest->y_height >> cm->subsampling_y;
 
   return 0;
 }

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 104db6a..f5eeb2c 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -237,7 +237,7 @@
 #
 # post proc
 #
-if [ "$CONFIG_POSTPROC" = "yes" ]; then
+if [ "$CONFIG_VP9_POSTPROC" = "yes" ]; then
 prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit"
 specialize vp9_mbpost_proc_down mmx sse2
 vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm
@@ -325,7 +325,7 @@
 specialize vp9_short_idct1_32x32
 
 prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht4x4_add sse2
+specialize vp9_short_iht4x4_add sse2 neon
 
 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
 specialize vp9_short_iht8x8_add sse2
@@ -701,7 +701,7 @@
 specialize vp9_quantize_b $ssse3_x86_64
 
 prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
-specialize vp9_quantize_b_32x32 # $ssse3_x86_64 FIXME(jingning): need a unit test on thisbefore enabled
+specialize vp9_quantize_b_32x32 $ssse3_x86_64
 
 #
 # Structured Similarity (SSIM)

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 8dfb22c..7f23dc1 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -460,6 +460,7 @@
 
   if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
+    assert(bsize >= BLOCK_8X8);
   } else {
     if (bsize >= BLOCK_8X8)
       mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);

diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 41e406d..6cb7c09 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c

@@ -77,14 +77,11 @@
         vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
 }
 
-static void init_dequantizer(VP9_COMMON *cm, MACROBLOCKD *xd) {
+static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
   int i;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  xd->q_index = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-
-  xd->plane[0].dequant = cm->y_dequant[xd->q_index];
+  xd->plane[0].dequant = cm->y_dequant[q_index];
   for (i = 1; i < MAX_MB_PLANE; i++)
-    xd->plane[i].dequant = cm->uv_dequant[xd->q_index];
+    xd->plane[i].dequant = cm->uv_dequant[q_index];
 }
 
 static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -149,14 +146,17 @@
 }
 
 static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE bsize, vp9_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
-  if (xd->mode_info_context->mbmi.skip_coeff) {
-      reset_skip_context(xd, bsize);
+  if (mbmi->skip_coeff) {
+    reset_skip_context(xd, bsize);
     return -1;
   } else {
-    if (pbi->common.seg.enabled)
-      init_dequantizer(&pbi->common, xd);
+    if (cm->seg.enabled)
+      setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
+                                                  cm->base_qindex));
 
     // TODO(dkovalev) if (!vp9_reader_has_error(r))
     return vp9_decode_tokens(pbi, r, bsize);
@@ -173,6 +173,7 @@
 
   xd->mode_info_context = cm->mi + offset;
   xd->mode_info_context->mbmi.sb_type = bsize;
+  xd->mode_info_stride = cm->mode_info_stride;
   // Special case: if prev_mi is NULL, the previous mode info context
   // cannot be used.
   xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + offset : NULL;
@@ -453,8 +454,7 @@
 
 static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
   const int old = *delta_q;
-  if (vp9_rb_read_bit(rb))
-    *delta_q = vp9_rb_read_signed_literal(rb, 4);
+  *delta_q = vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
   return old != *delta_q;
 }
 
@@ -958,11 +958,7 @@
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt header length");
 
-  xd->mode_info_context = cm->mi;
-  xd->prev_mode_info_context = cm->prev_mi;
-  xd->mode_info_stride = cm->mode_info_stride;
-
-  init_dequantizer(cm, &pbi->mb);
+  setup_plane_dequants(cm, &pbi->mb, cm->base_qindex);
 
   cm->fc = cm->frame_contexts[cm->frame_context_idx];
 

diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index b609f9f..505e9dc 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c

@@ -13,7 +13,7 @@
 #include <stdio.h>
 
 #include "vp9/common/vp9_onyxc_int.h"
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vp9/decoder/vp9_onyxd.h"
@@ -421,7 +421,7 @@
   *time_stamp = pbi->last_time_stamp;
   *time_end_stamp = 0;
 
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
   ret = vp9_post_proc_frame(&pbi->common, sd, flags);
 #else
 
@@ -429,7 +429,9 @@
     *sd = *pbi->common.frame_to_show;
     sd->y_width = pbi->common.width;
     sd->y_height = pbi->common.height;
-    sd->uv_height = pbi->common.height / 2;
+    sd->uv_width = sd->y_width >> pbi->common.subsampling_x;
+    sd->uv_height = sd->y_height >> pbi->common.subsampling_y;
+
     ret = 0;
   } else {
     ret = -1;

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index eb83903..45758e7 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -851,13 +851,75 @@
   }
 }
 
-static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, BLOCK_SIZE bsize) {
+// Check to see if the given partition size is allowed for a specified number
+// of 8x8 block rows and columns remaining in the image.
+// If not then return the largest allowed partition size
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
+                                      int rows_left, int cols_left,
+                                      int *bh, int *bw) {
+  if ((rows_left <= 0) || (cols_left <= 0)) {
+    return MIN(bsize, BLOCK_8X8);
+  } else {
+    for (; bsize > 0; --bsize) {
+      *bh = num_8x8_blocks_high_lookup[bsize];
+      *bw = num_8x8_blocks_wide_lookup[bsize];
+      if ((*bh <= rows_left) && (*bw <= cols_left)) {
+        break;
+      }
+    }
+  }
+  return bsize;
+}
+
+// This function attempts to set all mode info entries in a given SB64
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed in which case this code attempts to choose the largest
+// allowable partition.
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
+                             int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
+  BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
   const int mis = cm->mode_info_stride;
+  int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row;
+  int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col;
   int block_row, block_col;
-  for (block_row = 0; block_row < 8; ++block_row) {
-    for (block_col = 0; block_col < 8; ++block_col) {
-      m[block_row * mis + block_col].mbmi.sb_type = bsize;
+
+  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+  // Apply the requested partition size to the SB64 if it is all "in image"
+  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+      (row8x8_remaining >= MI_BLOCK_SIZE)) {
+    for (block_row = 0; block_row < MI_BLOCK_SIZE; ++block_row) {
+      for (block_col = 0; block_col < MI_BLOCK_SIZE; ++block_col) {
+        m[block_row * mis + block_col].mbmi.sb_type = bsize;
+      }
+    }
+  } else {
+    // Else this is a partial SB64.
+    int bh = num_8x8_blocks_high_lookup[bsize];
+    int bw = num_8x8_blocks_wide_lookup[bsize];
+    int sub_block_row;
+    int sub_block_col;
+    int row_index;
+    int col_index;
+
+    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
+      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
+        // Find a partition size that fits
+        bsize = find_partition_size(cpi->sf.always_this_block_size,
+                                    (row8x8_remaining - block_row),
+                                    (col8x8_remaining - block_col), &bh, &bw);
+
+        // Set the mi entries for all 8x8 blocks within the selected size
+        for (sub_block_row = 0; sub_block_row < bh; ++sub_block_row) {
+          for (sub_block_col = 0; sub_block_col < bw; ++sub_block_col) {
+            row_index = block_row + sub_block_row;
+            col_index = block_col + sub_block_col;
+            m[row_index * mis + col_index].mbmi.sb_type = bsize;
+          }
+        }
+      }
     }
   }
 }
@@ -1946,7 +2008,7 @@
       cpi->mb.source_variance = UINT_MAX;
       if (cpi->sf.use_one_partition_size_always) {
         set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
-        set_partitioning(cpi, m, cpi->sf.always_this_block_size);
+        set_partitioning(cpi, m, mi_row, mi_col);
         rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1);
       } else if (cpi->sf.partition_by_variance) {

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index da9a3bd..86be85d 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c

@@ -635,8 +635,8 @@
         vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
       else
         x->fwd_txm16x16(src_diff, coeff, bw * 8);
-      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff,
+      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
       if (!x->skip_encode && *eob) {
         if (tx_type == DCT_DCT)

diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 1203c00..9977289 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c

@@ -155,7 +155,6 @@
     unsigned int (*branch_ct_class0_hp)[2],
     unsigned int (*branch_ct_hp)[2]) {
   int i, j, k;
-  vp9_counts_process(nmv_count, usehp);
   vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
                                    prob->joints,
                                    branch_ct_joint,

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 92485f9..0cbe3ab 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -1092,7 +1092,6 @@
   return q;
 }
 
-
 extern void vp9_new_framerate(VP9_COMP *cpi, double framerate);
 
 void vp9_init_second_pass(VP9_COMP *cpi) {
@@ -2079,63 +2078,71 @@
 
   vp9_clear_system_state();
 
-  // Special case code for first frame.
-  if (cpi->common.current_video_frame == 0) {
-    cpi->twopass.est_max_qcorrection_factor = 1.0;
+  if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+    cpi->active_worst_quality = cpi->oxcf.cq_level;
+  } else {
+    // Special case code for first frame.
+    if (cpi->common.current_video_frame == 0) {
+      int section_target_bandwidth =
+          (int)(cpi->twopass.bits_left / frames_left);
+      cpi->twopass.est_max_qcorrection_factor = 1.0;
 
-    // Set a cq_level in constrained quality mode.
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
-                               (int)(cpi->twopass.bits_left / frames_left));
+      // Set a cq_level in constrained quality mode.
+      if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+        int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
+                                 section_target_bandwidth);
 
-      cpi->cq_target_quality = cpi->oxcf.cq_level;
-      if (est_cq > cpi->cq_target_quality)
-        cpi->cq_target_quality = est_cq;
+        cpi->cq_target_quality = cpi->oxcf.cq_level;
+        if (est_cq > cpi->cq_target_quality)
+          cpi->cq_target_quality = est_cq;
+      }
+
+      // guess at maxq needed in 2nd pass
+      cpi->twopass.maxq_max_limit = cpi->worst_quality;
+      cpi->twopass.maxq_min_limit = cpi->best_quality;
+
+      tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
+                             section_target_bandwidth);
+
+      cpi->active_worst_quality = tmp_q;
+      cpi->ni_av_qi = tmp_q;
+      cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
+
+#ifndef ONE_SHOT_Q_ESTIMATE
+      // Limit the maxq value returned subsequently.
+      // This increases the risk of overspend or underspend if the initial
+      // estimate for the clip is bad, but helps prevent excessive
+      // variation in Q, especially near the end of a clip
+      // where for example a small overspend may cause Q to crash
+      adjust_maxq_qrange(cpi);
+#endif
     }
 
-    // guess at maxq needed in 2nd pass
-    cpi->twopass.maxq_max_limit = cpi->worst_quality;
-    cpi->twopass.maxq_min_limit = cpi->best_quality;
-
-    tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
-                           (int)(cpi->twopass.bits_left / frames_left));
-
-    cpi->active_worst_quality = tmp_q;
-    cpi->ni_av_qi = tmp_q;
-    cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
-
 #ifndef ONE_SHOT_Q_ESTIMATE
-    // Limit the maxq value returned subsequently.
-    // This increases the risk of overspend or underspend if the initial
-    // estimate for the clip is bad, but helps prevent excessive
-    // variation in Q, especially near the end of a clip
-    // where for example a small overspend may cause Q to crash
-    adjust_maxq_qrange(cpi);
+    // The last few frames of a clip almost always have to few or too many
+    // bits and for the sake of over exact rate control we dont want to make
+    // radical adjustments to the allowed quantizer range just to use up a
+    // few surplus bits or get beneath the target rate.
+    else if ((cpi->common.current_video_frame <
+              (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
+             ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+              (unsigned int)cpi->twopass.total_stats.count)) {
+      int section_target_bandwidth =
+          (int)(cpi->twopass.bits_left / frames_left);
+      if (frames_left < 1)
+        frames_left = 1;
+
+      tmp_q = estimate_max_q(
+          cpi,
+          &cpi->twopass.total_left_stats,
+          section_target_bandwidth);
+
+      // Make a damped adjustment to active max Q
+      cpi->active_worst_quality =
+          adjust_active_maxq(cpi->active_worst_quality, tmp_q);
+    }
 #endif
   }
-
-#ifndef ONE_SHOT_Q_ESTIMATE
-  // The last few frames of a clip almost always have to few or too many
-  // bits and for the sake of over exact rate control we dont want to make
-  // radical adjustments to the allowed quantizer range just to use up a
-  // few surplus bits or get beneath the target rate.
-  else if ((cpi->common.current_video_frame <
-            (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
-           ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-            (unsigned int)cpi->twopass.total_stats.count)) {
-    if (frames_left < 1)
-      frames_left = 1;
-
-    tmp_q = estimate_max_q(
-              cpi,
-              &cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left));
-
-    // Make a damped adjustment to active max Q
-    cpi->active_worst_quality =
-      adjust_active_maxq(cpi->active_worst_quality, tmp_q);
-  }
-#endif
   vp9_zero(this_frame);
   if (EOF == input_stats(cpi, &this_frame))
     return;

diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 5f2f2ba..8b0af31 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c

@@ -28,7 +28,7 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
@@ -898,6 +898,8 @@
         sf->subpel_iters_per_step = 1;
         sf->disable_split_var_thresh = 64;
         sf->disable_filter_search_var_thresh = 96;
+        sf->intra_y_mode_mask = INTRA_DC_ONLY;
+        sf->intra_uv_mode_mask = INTRA_DC_ONLY;
         sf->use_fast_coef_updates = 2;
       }
       /*
@@ -2729,9 +2731,9 @@
 
   if (cm->frame_type == KEY_FRAME) {
 #if !CONFIG_MULTIPLE_ARF
-    // Special case for key frames forced because we have reached
-    // the maximum key frame interval. Here force the Q to a range
-    // based on the ambient Q to reduce the risk of popping
+      // Special case for key frames forced because we have reached
+      // the maximum key frame interval. Here force the Q to a range
+      // based on the ambient Q to reduce the risk of popping
     if (cpi->this_key_frame_forced) {
       int delta_qindex;
       int qindex = cpi->last_boosted_qindex;
@@ -2740,7 +2742,8 @@
       delta_qindex = compute_qdelta(cpi, last_boosted_q,
                                     (last_boosted_q * 0.75));
 
-      cpi->active_best_quality = MAX(qindex + delta_qindex, cpi->best_quality);
+      cpi->active_best_quality = MAX(qindex + delta_qindex,
+                                     cpi->best_quality);
     } else {
       int high = 5000;
       int low = 400;
@@ -2761,7 +2764,6 @@
         cpi->active_best_quality = kf_low_motion_minq[q] + adjustment;
       }
 
-
       // Allow somewhat lower kf minq with small image formats.
       if ((cm->width * cm->height) <= (352 * 288)) {
         q_adj_factor -= 0.25;
@@ -2770,14 +2772,14 @@
       // Make a further adjustment based on the kf zero motion measure.
       q_adj_factor += 0.05 - (0.001 * (double)cpi->kf_zeromotion_pct);
 
-      // Convert the adjustment factor to a qindex delta on active_best_quality.
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
       q_val = vp9_convert_qindex_to_q(cpi->active_best_quality);
       cpi->active_best_quality +=
-        compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
+          compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
     }
 #else
     double current_q;
-
     // Force the KF quantizer to be 30% of the active_worst_quality.
     current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
     cpi->active_best_quality = cpi->active_worst_quality
@@ -2794,13 +2796,11 @@
         cpi->avg_frame_qindex < cpi->active_worst_quality) {
       q = cpi->avg_frame_qindex;
     }
-
     // For constrained quality dont allow Q less than the cq level
     if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
         q < cpi->cq_target_quality) {
       q = cpi->cq_target_quality;
     }
-
     if (cpi->gfu_boost > high) {
       cpi->active_best_quality = gf_low_motion_minq[q];
     } else if (cpi->gfu_boost < low) {
@@ -2817,29 +2817,71 @@
     // Constrained quality use slightly lower active best.
     if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
       cpi->active_best_quality = cpi->active_best_quality * 15 / 16;
+
+    // TODO(debargha): Refine the logic below
+    if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+      if (!cpi->refresh_alt_ref_frame) {
+        if (cpi->gfu_boost > high) {
+          cpi->active_best_quality = cpi->cq_target_quality * 14 / 16;
+        } else if (cpi->gfu_boost < low) {
+          cpi->active_best_quality = cpi->cq_target_quality;
+        } else {
+          const int gap = high - low;
+          const int offset = high - cpi->gfu_boost;
+          const int qdiff = cpi->cq_target_quality * 2 / 16;
+          const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+          cpi->active_best_quality = cpi->cq_target_quality * 14 / 16
+              + adjustment;
+        }
+      } else {
+        if (cpi->frames_since_key > 1) {
+          if (cpi->gfu_boost > high) {
+            cpi->active_best_quality = cpi->cq_target_quality * 6 / 16;
+          } else if (cpi->gfu_boost < low) {
+            cpi->active_best_quality = cpi->cq_target_quality * 10 / 16;
+          } else {
+            const int gap = high - low;
+            const int offset = high - cpi->gfu_boost;
+            const int qdiff = cpi->cq_target_quality * 4 / 16;
+            const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+            cpi->active_best_quality = cpi->cq_target_quality * 6 / 16
+                + adjustment;
+          }
+        }
+      }
+    }
   } else {
+    if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+      cpi->active_best_quality = cpi->cq_target_quality;
+    } else {
 #ifdef ONE_SHOT_Q_ESTIMATE
 #ifdef STRICT_ONE_SHOT_Q
-    cpi->active_best_quality = q;
+      cpi->active_best_quality = q;
 #else
-    cpi->active_best_quality = inter_minq[q];
+      cpi->active_best_quality = inter_minq[q];
 #endif
 #else
-    cpi->active_best_quality = inter_minq[q];
+      cpi->active_best_quality = inter_minq[q];
 #endif
 
-    // For the constant/constrained quality mode we don't want
-    // q to fall below the cq level.
-    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-        (cpi->active_best_quality < cpi->cq_target_quality)) {
-      // If we are strongly undershooting the target rate in the last
-      // frames then use the user passed in cq value not the auto
-      // cq value.
-      if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
-        cpi->active_best_quality = cpi->oxcf.cq_level;
-      else
-        cpi->active_best_quality = cpi->cq_target_quality;
+      // For the constant/constrained quality mode we don't want
+      // q to fall below the cq level.
+      if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+          (cpi->active_best_quality < cpi->cq_target_quality)) {
+        // If we are strongly undershooting the target rate in the last
+        // frames then use the user passed in cq value not the auto
+        // cq value.
+        if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
+          cpi->active_best_quality = cpi->oxcf.cq_level;
+        else
+          cpi->active_best_quality = cpi->cq_target_quality;
+      }
     }
+    /*
+    if (cm->current_video_frame == 1)
+      printf("q/active_best/worst_quality = %d %d %d\n",
+             q, cpi->active_best_quality, cpi->active_worst_quality);
+             */
   }
 
   // Clip the active best and worst quality values to limits
@@ -2856,7 +2898,9 @@
     cpi->active_worst_quality = cpi->active_best_quality;
 
   // Special case code to try and match quality with forced key frames
-  if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+  if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+    q = cpi->active_best_quality;
+  } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
     q = cpi->last_boosted_qindex;
   } else {
     // Determine initial Q to try
@@ -2868,7 +2912,8 @@
 
 #if CONFIG_MULTIPLE_ARF
   // Force the quantizer determined by the coding order pattern.
-  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME)) {
+  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
+      cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) {
     double new_q;
     double current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
     int level = cpi->this_frame_weight;
@@ -2910,7 +2955,7 @@
     set_mvcost(&cpi->mb);
   }
 
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
 
   if (cpi->oxcf.noise_sensitivity > 0) {
     int l = 0;
@@ -3003,124 +3048,130 @@
     active_worst_qchanged = 0;
 
     // Special case handling for forced key frames
-    if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-      int last_q = q;
-      int kf_err = vp9_calc_ss_err(cpi->Source,
-                                   &cm->yv12_fb[cm->new_fb_idx]);
-
-      int high_err_target = cpi->ambient_err;
-      int low_err_target = cpi->ambient_err >> 1;
-
-      // Prevent possible divide by zero error below for perfect KF
-      kf_err += !kf_err;
-
-      // The key frame is not good enough or we can afford
-      // to make it better without undue risk of popping.
-      if ((kf_err > high_err_target &&
-           cpi->projected_frame_size <= frame_over_shoot_limit) ||
-          (kf_err > low_err_target &&
-           cpi->projected_frame_size <= frame_under_shoot_limit)) {
-        // Lower q_high
-        q_high = q > q_low ? q - 1 : q_low;
-
-        // Adjust Q
-        q = (q * high_err_target) / kf_err;
-        q = MIN(q, (q_high + q_low) >> 1);
-      } else if (kf_err < low_err_target &&
-                cpi->projected_frame_size >= frame_under_shoot_limit) {
-        // The key frame is much better than the previous frame
-        // Raise q_low
-        q_low = q < q_high ? q + 1 : q_high;
-
-        // Adjust Q
-        q = (q * low_err_target) / kf_err;
-        q = MIN(q, (q_high + q_low + 1) >> 1);
-      }
-
-      // Clamp Q to upper and lower limits:
-      q = clamp(q, q_low, q_high);
-
-      loop = q != last_q;
-    }
-
-    // Is the projected frame size out of range and are we allowed to attempt to recode.
-    else if (recode_loop_test(cpi,
-                              frame_over_shoot_limit, frame_under_shoot_limit,
-                              q, top_index, bottom_index)) {
-      int last_q = q;
-      int retries = 0;
-
-      // Frame size out of permitted range:
-      // Update correction factor & compute new Q to try...
-
-      // Frame is too large
-      if (cpi->projected_frame_size > cpi->this_frame_target) {
-        // Raise Qlow as to at least the current value
-        q_low = q < q_high ? q + 1 : q_high;
-
-        if (undershoot_seen || loop_count > 1) {
-          // Update rate_correction_factor unless cpi->active_worst_quality
-          // has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 1);
-
-          q = (q_high + q_low + 1) / 2;
-        } else {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 0);
-
-          q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
-          while (q < q_low && retries < 10) {
-            vp9_update_rate_correction_factors(cpi, 0);
-            q = vp9_regulate_q(cpi, cpi->this_frame_target);
-            retries++;
-          }
-        }
-
-        overshoot_seen = 1;
-      } else {
-        // Frame is too small
-        q_high = q > q_low ? q - 1 : q_low;
-
-        if (overshoot_seen || loop_count > 1) {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 1);
-
-          q = (q_high + q_low) / 2;
-        } else {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 0);
-
-          q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
-          // Special case reset for qlow for constrained quality.
-          // This should only trigger where there is very substantial
-          // undershoot on a frame and the auto cq level is above
-          // the user passsed in value.
-          if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
-            q_low = q;
-          }
-
-          while (q > q_high && retries < 10) {
-            vp9_update_rate_correction_factors(cpi, 0);
-            q = vp9_regulate_q(cpi, cpi->this_frame_target);
-            retries++;
-          }
-        }
-
-        undershoot_seen = 1;
-      }
-
-      // Clamp Q to upper and lower limits:
-      q = clamp(q, q_low, q_high);
-
-      loop = q != last_q;
-    } else {
+    if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
       loop = 0;
+    } else {
+      if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+        int last_q = q;
+        int kf_err = vp9_calc_ss_err(cpi->Source,
+                                     &cm->yv12_fb[cm->new_fb_idx]);
+
+        int high_err_target = cpi->ambient_err;
+        int low_err_target = cpi->ambient_err >> 1;
+
+        // Prevent possible divide by zero error below for perfect KF
+        kf_err += !kf_err;
+
+        // The key frame is not good enough or we can afford
+        // to make it better without undue risk of popping.
+        if ((kf_err > high_err_target &&
+             cpi->projected_frame_size <= frame_over_shoot_limit) ||
+            (kf_err > low_err_target &&
+             cpi->projected_frame_size <= frame_under_shoot_limit)) {
+          // Lower q_high
+          q_high = q > q_low ? q - 1 : q_low;
+
+          // Adjust Q
+          q = (q * high_err_target) / kf_err;
+          q = MIN(q, (q_high + q_low) >> 1);
+        } else if (kf_err < low_err_target &&
+                   cpi->projected_frame_size >= frame_under_shoot_limit) {
+          // The key frame is much better than the previous frame
+          // Raise q_low
+          q_low = q < q_high ? q + 1 : q_high;
+
+          // Adjust Q
+          q = (q * low_err_target) / kf_err;
+          q = MIN(q, (q_high + q_low + 1) >> 1);
+        }
+
+        // Clamp Q to upper and lower limits:
+        q = clamp(q, q_low, q_high);
+
+        loop = q != last_q;
+      } else if (recode_loop_test(
+          cpi, frame_over_shoot_limit, frame_under_shoot_limit,
+          q, top_index, bottom_index)) {
+        // Is the projected frame size out of range and are we allowed
+        // to attempt to recode.
+        int last_q = q;
+        int retries = 0;
+
+        // Frame size out of permitted range:
+        // Update correction factor & compute new Q to try...
+
+        // Frame is too large
+        if (cpi->projected_frame_size > cpi->this_frame_target) {
+          // Raise Qlow as to at least the current value
+          q_low = q < q_high ? q + 1 : q_high;
+
+          if (undershoot_seen || loop_count > 1) {
+            // Update rate_correction_factor unless
+            // cpi->active_worst_quality has changed.
+            if (!active_worst_qchanged)
+              vp9_update_rate_correction_factors(cpi, 1);
+
+            q = (q_high + q_low + 1) / 2;
+          } else {
+            // Update rate_correction_factor unless
+            // cpi->active_worst_quality has changed.
+            if (!active_worst_qchanged)
+              vp9_update_rate_correction_factors(cpi, 0);
+
+            q = vp9_regulate_q(cpi, cpi->this_frame_target);
+
+            while (q < q_low && retries < 10) {
+              vp9_update_rate_correction_factors(cpi, 0);
+              q = vp9_regulate_q(cpi, cpi->this_frame_target);
+              retries++;
+            }
+          }
+
+          overshoot_seen = 1;
+        } else {
+          // Frame is too small
+          q_high = q > q_low ? q - 1 : q_low;
+
+          if (overshoot_seen || loop_count > 1) {
+            // Update rate_correction_factor unless
+            // cpi->active_worst_quality has changed.
+            if (!active_worst_qchanged)
+              vp9_update_rate_correction_factors(cpi, 1);
+
+            q = (q_high + q_low) / 2;
+          } else {
+            // Update rate_correction_factor unless
+            // cpi->active_worst_quality has changed.
+            if (!active_worst_qchanged)
+              vp9_update_rate_correction_factors(cpi, 0);
+
+            q = vp9_regulate_q(cpi, cpi->this_frame_target);
+
+            // Special case reset for qlow for constrained quality.
+            // This should only trigger where there is very substantial
+            // undershoot on a frame and the auto cq level is above
+            // the user passsed in value.
+            if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
+              q_low = q;
+            }
+
+            while (q > q_high && retries < 10) {
+              vp9_update_rate_correction_factors(cpi, 0);
+              q = vp9_regulate_q(cpi, cpi->this_frame_target);
+              retries++;
+            }
+          }
+
+          undershoot_seen = 1;
+        }
+
+        // Clamp Q to upper and lower limits:
+        q = clamp(q, q_low, q_high);
+
+        loop = q != last_q;
+      } else {
+        loop = 0;
+      }
     }
 
     if (cpi->is_src_frame_alt_ref)
@@ -3129,10 +3180,10 @@
     if (!loop && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
       if (mcomp_filter_index < mcomp_filters) {
         int64_t err = vp9_calc_ss_err(cpi->Source,
-                                    &cm->yv12_fb[cm->new_fb_idx]);
+                                      &cm->yv12_fb[cm->new_fb_idx]);
         int64_t rate = cpi->projected_frame_size << 8;
         mcomp_filter_cost[mcomp_filter_index] =
-          (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err));
+            (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err));
         mcomp_filter_index++;
         if (mcomp_filter_index < mcomp_filters) {
           cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index];
@@ -3154,10 +3205,10 @@
             cm->mcomp_filter_type = mcomp_best_filter;
           }
           /*
-          printf("  best filter = %d, ( ", mcomp_best_filter);
-          for (f=0;f<mcomp_filters; f++) printf("%d ",  mcomp_filter_cost[f]);
-          printf(")\n");
-          */
+             printf("  best filter = %d, ( ", mcomp_best_filter);
+             for (f=0;f<mcomp_filters; f++) printf("%d ",  mcomp_filter_cost[f]);
+             printf(")\n");
+             */
         }
 #if RESET_FOREACH_FILTER
         if (loop) {
@@ -3353,9 +3404,9 @@
   // in this frame.
   // update_base_skip_probs(cpi);
 
-#if 0 && CONFIG_INTERNAL_STATS
+#if 0  // CONFIG_INTERNAL_STATS
   {
-    FILE *f = fopen("tmp.stt", "a");
+    FILE *f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
     int recon_err;
 
     vp9_clear_system_state();  // __asm emms;
@@ -3364,7 +3415,7 @@
                                 &cm->yv12_fb[cm->new_fb_idx]);
 
     if (cpi->twopass.total_left_stats.coded_error != 0.0)
-      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
+      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
               "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
               "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
               "%10.3f %8d %10d %10d %10d\n",
@@ -3374,6 +3425,7 @@
               (int)cpi->total_target_vs_actual,
               (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
               (int)cpi->total_actual_bits,
+              cm->base_qindex,
               vp9_convert_qindex_to_q(cm->base_qindex),
               (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
               vp9_convert_qindex_to_q(cpi->active_best_quality),
@@ -3392,7 +3444,7 @@
               cpi->tot_recode_hits, recon_err, cpi->kf_boost,
               cpi->kf_zeromotion_pct);
     else
-      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
+      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
               "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
               "%5d %5d %5d %8d %8d %8.2f %10d %10.3f"
               "%8d %10d %10d %10d\n",
@@ -3403,6 +3455,7 @@
               (int)cpi->total_target_vs_actual,
               (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
               (int)cpi->total_actual_bits,
+              cm->base_qindex,
               vp9_convert_qindex_to_q(cm->base_qindex),
               (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
               vp9_convert_qindex_to_q(cpi->active_best_quality),
@@ -3973,7 +4026,7 @@
         {
           double frame_psnr2, frame_ssim2 = 0;
           double weight = 0;
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
           vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
                       cm->lf.filter_level * 10 / 6);
 #endif
@@ -4049,7 +4102,7 @@
     return -1;
   else {
     int ret;
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
     ret = vp9_post_proc_frame(&cpi->common, dest, flags);
 #else
 
@@ -4063,7 +4116,7 @@
       ret = -1;
     }
 
-#endif // !CONFIG_POSTPROC
+#endif  // !CONFIG_VP9_POSTPROC
     vp9_clear_system_state();
     return ret;
   }

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index fb0e470..cfaa776 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -69,6 +69,7 @@
 
       if (x >= zbin) {
         x += (round_ptr[rc != 0]);
+        x  = clamp(x, INT16_MIN, INT16_MAX);
         y  = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
               quant_shift_ptr[rc != 0]) >> 16;      // quantize (x)
         x  = (y ^ sz) - sz;                         // get the sign back
@@ -135,6 +136,7 @@
 
       if (x >= zbin) {
         x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        x  = clamp(x, INT16_MIN, INT16_MAX);
         y  = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
               quant_shift_ptr[rc != 0]) >> 15;      // quantize (x)
 

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 6c67d55..41e43fe 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -2917,8 +2917,8 @@
                 (sse_v - var_v < thresh_dc || sse_v == var_v)) {
               x->skip = 1;
 
-              *rate2 = 500;
-              *rate_uv = 0;
+              // The cost of skip bit needs to be added.
+              *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
 
               // Scaling factor for SSE from spatial domain to frequency domain
               // is 16. Adjust distortion accordingly.

diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
index 7deb981..db30660 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm

@@ -70,9 +70,9 @@
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   punpckhqdq                      m0, m0
   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-  paddw                           m6, m1                   ; m6 += round
+  paddsw                          m6, m1                   ; m6 += round
   punpckhqdq                      m1, m1
-  paddw                          m11, m1                   ; m11 += round
+  paddsw                         m11, m1                   ; m11 += round
   pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
   punpckhqdq                      m2, m2
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
@@ -127,8 +127,8 @@
   or                              r6, r2
   jz .skip_iter
 %endif
-  paddw                           m6, m1                   ; m6 += round
-  paddw                          m11, m1                   ; m11 += round
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
   pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   paddw                          m14, m6                   ; m14 += m6

diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index fb302ab..c6daecc 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk

@@ -73,11 +73,11 @@
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
-VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
-VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-ifeq ($(CONFIG_POSTPROC),yes)
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 endif
@@ -102,6 +102,7 @@
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM)

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 3198394..f44cd27 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c

@@ -148,7 +148,7 @@
 
   RANGE_CHECK_HI(cfg, g_threads,          64);
   RANGE_CHECK_HI(cfg, g_lag_in_frames,    MAX_LAG_BUFFERS);
-  RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
+  RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_Q);
   RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
   RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
   RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
@@ -264,13 +264,15 @@
   // VBR only supported for now.
   // CBR code has been deprectated for experimental phase.
   // CQ mode not yet tested
-  oxcf->end_usage          = USAGE_LOCAL_FILE_PLAYBACK;
-  /*if (cfg.rc_end_usage == VPX_CQ)
-      oxcf->end_usage      = USAGE_CONSTRAINED_QUALITY;
-  else
-      oxcf->end_usage      = USAGE_LOCAL_FILE_PLAYBACK;*/
+  oxcf->end_usage        = USAGE_LOCAL_FILE_PLAYBACK;
+  /*
+  if (cfg.rc_end_usage == VPX_CQ)
+    oxcf->end_usage      = USAGE_CONSTRAINED_QUALITY;
+    */
+  if (cfg.rc_end_usage == VPX_Q)
+    oxcf->end_usage      = USAGE_CONSTANT_QUALITY;
 
-  oxcf->target_bandwidth       = cfg.rc_target_bitrate;
+  oxcf->target_bandwidth        = cfg.rc_target_bitrate;
   oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
 
   oxcf->best_allowed_q          = cfg.rc_min_quantizer;
@@ -894,7 +896,7 @@
 static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
   vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
   (void)ctr_id;
 

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index e7362fc..10b3238 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c

@@ -20,7 +20,7 @@
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 #include "vp9/vp9_iface_common.h"
 
-#define VP9_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+#define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
 typedef vpx_codec_stream_info_t  vp9_stream_info_t;
 
 /* Structures for handling memory allocations */
@@ -596,7 +596,7 @@
 static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
                                     int ctr_id,
                                     va_list args) {
-#if CONFIG_POSTPROC
+#if CONFIG_VP9_POSTPROC
   vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
 
   if (data) {

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 89de601..9fbf100 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -64,7 +64,7 @@
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
 VP9_CX_SRCS-yes += encoder/vp9_variance_c.c
-ifeq ($(CONFIG_POSTPROC),yes)
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
 endif

diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 181b92a..56fd2d9 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h

@@ -223,9 +223,10 @@
 
   /*!\brief Rate control mode */
   enum vpx_rc_mode {
-    VPX_VBR, /**< Variable Bit Rate (VBR) mode */
+    VPX_VBR,  /**< Variable Bit Rate (VBR) mode */
     VPX_CBR,  /**< Constant Bit Rate (CBR) mode */
-    VPX_CQ   /**< Constant Quality  (CQ)  mode */
+    VPX_CQ,   /**< Constrained Quality (CQ)  mode */
+    VPX_Q,    /**< Constant Quality (Q) mode */
   };
 
 

diff --git a/vpxenc.c b/vpxenc.c
index 5d29c15..0c742ca 100644
--- a/vpxenc.c
+++ b/vpxenc.c

@@ -1046,6 +1046,7 @@
   {"vbr", VPX_VBR},
   {"cbr", VPX_CBR},
   {"cq",  VPX_CQ},
+  {"q",   VPX_Q},
   {NULL, 0}
 };
 static const arg_def_t end_usage          = ARG_DEF_ENUM(NULL, "end-usage", 1,
@@ -1126,7 +1127,7 @@
 static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1,
                                                 "Material to favor", tuning_enum);
 static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1,
-                                          "Constrained Quality Level");
+                                          "Constant/Constrained Quality level");
 static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1,
                                                     "Max I-frame bitrate (pct)");
 static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");