Merge "Indent build/make/configure.sh"
diff --git a/test/test.mk b/test/test.mk
index ab4ebbf..c665ae2 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -137,6 +137,8 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += vp9_intrapred_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc
new file mode 100644
index 0000000..b59d95e
--- /dev/null
+++ b/test/vp9_error_block_test.cc
@@ -0,0 +1,150 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+#if CONFIG_VP9_HIGHBITDEPTH
+const int kNumIterations = 1000;
+
+typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
+                                  const tran_low_t *dqcoeff,
+                                  intptr_t block_size,
+                                  int64_t *ssz, int bps);
+
+typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
+                        ErrorBlockParam;
+
+class ErrorBlockTest  // Fixture comparing two block-error implementations.
+  : public ::testing::TestWithParam<ErrorBlockParam> {
+ public:
+  virtual ~ErrorBlockTest() {}
+  virtual void SetUp() {  // Unpack the (tested, reference, bit depth) tuple.
+    error_block_op_     = GET_PARAM(0);
+    ref_error_block_op_ = GET_PARAM(1);
+    bit_depth_  = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;  // Forwarded as the functions' last argument.
+  ErrorBlockFunc error_block_op_;  // Implementation under test.
+  ErrorBlockFunc ref_error_block_op_;  // Reference implementation.
+};
+
+TEST_P(ErrorBlockTest, OperationCheck) {  // Random inputs, 9 block sizes.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff,   4096);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
+  int err_count_total = 0;
+  int first_failure = -1;  // Iteration index of the first mismatch, if any.
+  intptr_t block_size;
+  int64_t ssz;
+  int64_t ret;
+  int64_t ref_ssz;
+  int64_t ref_ret;
+  for (int i = 0; i < kNumIterations; ++i) {
+    int err_count = 0;
+    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
+    for (int j = 0; j < block_size; j++) {
+      coeff[j]   = rnd(2 << 20) - (1 << 20);
+      dqcoeff[j] = rnd(2 << 20) - (1 << 20);
+    }
+    ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
+                                  bit_depth_);
+    ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
+                                                   &ssz, bit_depth_));
+    err_count += (ref_ret != ret) | (ref_ssz != ssz);  // Both must match.
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(ErrorBlockTest, ExtremeValues) {  // Saturated and random inputs.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff,   4096);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
+  int err_count_total = 0;
+  int first_failure = -1;  // Iteration index of the first mismatch, if any.
+  intptr_t block_size;
+  int64_t ssz;
+  int64_t ret;
+  int64_t ref_ssz;
+  int64_t ref_ret;
+  int max_val = ((1 << 20) - 1);  // Halved inside the loop as i advances.
+  for (int i = 0; i < kNumIterations; ++i) {
+    int err_count = 0;
+    int k = (i / 9) % 5;  // 0..3 pick a sign pattern; 4 uses random data.
+
+    // Change the maximum coeff value, to test different bit boundaries
+    if ( k == 4 && (i % 9) == 0 ) {
+      max_val >>= 1;
+    }
+    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
+    for (int j = 0; j < block_size; j++) {
+      if (k < 4) {  // Test at maximum values
+        coeff[j]   = k % 2 ? max_val : -max_val;  // Sign from bit 0 of k.
+        dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;  // Sign from bit 1.
+      } else {
+        coeff[j]   = rnd(2 << 14) - (1 << 14);
+        dqcoeff[j] = rnd(2 << 14) - (1 << 14);
+      }
+    }
+    ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
+                                  bit_depth_);
+    ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
+                                                   &ssz, bit_depth_));
+    err_count += (ref_ret != ret) | (ref_ssz != ssz);  // Both must match.
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, ErrorBlockTest,  // SSE2 against the C version, per bit depth.
+    ::testing::Values(
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_10),
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_12),
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_8)));
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
new file mode 100644
index 0000000..c30b827
--- /dev/null
+++ b/test/vp9_quantize_test.cc
@@ -0,0 +1,357 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+#if CONFIG_VP9_HIGHBITDEPTH
+const int number_of_iterations = 100;
+
+typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
+                             int skip_block, const int16_t *zbin,
+                             const int16_t *round, const int16_t *quant,
+                             const int16_t *quant_shift,
+                             tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                             const int16_t *dequant, int zbin_oq_value,
+                             uint16_t *eob, const int16_t *scan,
+                             const int16_t *iscan);
+typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
+    QuantizeParam;
+
+class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  virtual ~VP9QuantizeTest() {}
+  virtual void SetUp() {  // Unpack the (tested, reference, bit depth) tuple.
+    quantize_op_   = GET_PARAM(0);
+    ref_quantize_op_ = GET_PARAM(1);
+    bit_depth_  = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;  // Low bit_depth_ bits set.
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;  // Bit depth this instantiation runs at.
+  int mask_;  // ANDed onto random coefficients and zbin values.
+  QuantizeFunc quantize_op_;  // Implementation under test.
+  QuantizeFunc ref_quantize_op_;  // Reference implementation.
+};
+
+class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  virtual ~VP9Quantize32Test() {}
+  virtual void SetUp() {  // Unpack the (tested, reference, bit depth) tuple.
+    quantize_op_   = GET_PARAM(0);
+    ref_quantize_op_ = GET_PARAM(1);
+    bit_depth_  = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;  // Low bit_depth_ bits set.
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;  // Bit depth this instantiation runs at.
+  int mask_;  // ANDed onto random coefficients and zbin values.
+  QuantizeFunc quantize_op_;  // Implementation under test.
+  QuantizeFunc ref_quantize_op_;  // Reference implementation.
+};
+
+TEST_P(VP9QuantizeTest, OperationCheck) {  // Random blocks, 4x4 to 16x16.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int zbin_oq_value = 0;
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  int err_count_total = 0;
+  int first_failure = -1;  // Iteration index of the first mismatch, if any.
+  for (int i = 0; i < number_of_iterations; ++i) {
+    const int skip_block = i == 0;  // Only the first iteration skips.
+    const TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8 TX_16X16
+    const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    const int count = (4 << sz) * (4 << sz);  // 16, 64, 256
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;  // Both runs start from the same random eob.
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = rnd.Rand16() & mask_;  // Keep the low bit_depth_ bits.
+    }
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16() & mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr,
+                                          zbin_oq_value, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+    for (int j = 0; j < count; ++j) {  // Every coefficient, not just sz.
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(VP9Quantize32Test, OperationCheck) {  // Random 32x32 blocks.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int zbin_oq_value = 0;
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  int err_count_total = 0;
+  int first_failure = -1;  // Iteration index of the first mismatch, if any.
+  for (int i = 0; i < number_of_iterations; ++i) {
+    const int skip_block = i == 0;  // Only the first iteration skips.
+    const TX_SIZE sz = TX_32X32;
+    const TX_TYPE tx_type = (TX_TYPE)(i % 4);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    const int count = (4 << sz) * (4 << sz);  // 1024
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;  // Both runs start from the same random eob.
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = rnd.Rand16() & mask_;  // Keep the low bit_depth_ bits.
+    }
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16() & mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr,
+                                          zbin_oq_value, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+    for (int j = 0; j < count; ++j) {  // Every coefficient, not just sz.
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(VP9QuantizeTest, EOBCheck) {  // Sparse blocks, 4x4 to 16x16.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int zbin_oq_value = 0;
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  int err_count_total = 0;
+  int first_failure = -1;  // Iteration index of the first mismatch, if any.
+  for (int i = 0; i < number_of_iterations; ++i) {
+    int skip_block = i == 0;  // Only the first iteration skips.
+    TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8 TX_16X16
+    TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    int count = (4 << sz) * (4 << sz);  // 16, 64, 256
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;  // Both runs start from the same random eob.
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = 0;
+    }
+    // At most two nonzero entries, at random positions.
+    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
+    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16() & mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr,
+                                          zbin_oq_value, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+
+    for (int j = 0; j < count; ++j) {  // Every coefficient, not just sz.
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(VP9Quantize32Test, EOBCheck) {  // Sparse 32x32 blocks.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int zbin_oq_value = 0;
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
+  int err_count_total = 0;
+  int first_failure = -1;  // Iteration index of the first mismatch, if any.
+  for (int i = 0; i < number_of_iterations; ++i) {
+    int skip_block = i == 0;  // Only the first iteration skips.
+    TX_SIZE sz = TX_32X32;
+    TX_TYPE tx_type = (TX_TYPE)(i % 4);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    int count = (4 << sz) * (4 << sz);  // 1024
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;  // Both runs start from the same random eob.
+    for (int j = 0; j < count; j++) {
+      coeff_ptr[j] = 0;
+    }
+    // At most two nonzero entries, at random positions.
+    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
+    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16() & mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr,
+                                          zbin_oq_value, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+
+    for (int j = 0; j < count; ++j) {  // Every coefficient, not just sz.
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9QuantizeTest,  // SSE2 against the C version, per bit depth.
+    ::testing::Values(
+        make_tuple(&vp9_highbd_quantize_b_sse2,
+                   &vp9_highbd_quantize_b_c, VPX_BITS_8),
+        make_tuple(&vp9_highbd_quantize_b_sse2,
+                   &vp9_highbd_quantize_b_c, VPX_BITS_10),
+        make_tuple(&vp9_highbd_quantize_b_sse2,
+                   &vp9_highbd_quantize_b_c, VPX_BITS_12)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9Quantize32Test,  // Same pairing for the 32x32 quantizer.
+    ::testing::Values(
+        make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
+                   &vp9_highbd_quantize_b_32x32_c, VPX_BITS_8),
+        make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
+                   &vp9_highbd_quantize_b_32x32_c, VPX_BITS_10),
+        make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
+                   &vp9_highbd_quantize_b_32x32_c, VPX_BITS_12)));
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index cc35476..902a6fc 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -207,7 +207,7 @@
 int Sync(VP9Worker *const worker) { return !worker->had_error; }
 
 void Execute(VP9Worker *const worker) {
-  worker->had_error |= worker->hook(worker->data1, worker->data2);
+  worker->had_error |= !worker->hook(worker->data1, worker->data2);
 }
 
 void Launch(VP9Worker *const worker) { Execute(worker); }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 20f1535..510f9d8 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1855,7 +1855,7 @@
   # ENCODEMB INVOKE
 
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/vp9_highbd_block_error/;
+  specialize qw/vp9_highbd_block_error sse2/;
 
   add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
   specialize qw/vp9_highbd_subtract_block/;
@@ -1867,10 +1867,10 @@
   specialize qw/vp9_highbd_quantize_fp_32x32/;
 
   add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_b/;
+  specialize qw/vp9_highbd_quantize_b sse2/;
 
   add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_b_32x32/;
+  specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
 
   #
   # Structured Similarity (SSIM)
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index f7fca0c..3762c53 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -11,8 +11,9 @@
 #include <limits.h>
 #include <math.h>
 
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/common/vp9_seg_common.h"
-
 #include "vp9/encoder/vp9_segmentation.h"
 
 #define AQ_C_SEGMENTS  3
@@ -22,6 +23,7 @@
   {{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}};
 static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
   {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}};
+static const double aq_c_var_thresholds[AQ_C_SEGMENTS] = {100.0, 12.0, 10.0};
 
 static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) {
   // Approximate base quatizer (truncated to int)
@@ -94,7 +96,7 @@
 // An "aq_strength" value determines how many segments are supported,
 // the set of transition points to use and the extent of the quantizer
 // adjustment for each segment (configured in vp9_setup_in_frame_q_adj()).
-void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
+void vp9_select_in_frame_q_segment(VP9_COMP *cpi, BLOCK_SIZE bs,
                                    int mi_row, int mi_col,
                                    int output_enabled, int projected_rate) {
   VP9_COMMON *const cm = &cpi->common;
@@ -118,6 +120,10 @@
                             (bw * bh);
     const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
     const int active_segments = aq_c_active_segments[aq_strength];
+    double logvar;
+
+    vp9_setup_src_planes(&cpi->mb, cpi->Source, mi_row, mi_col);
+    logvar = vp9_log_block_var(cpi, &cpi->mb, bs);
 
     // The number of segments considered and the transition points used to
     // select them is determined by the "aq_strength" value.
@@ -127,8 +133,9 @@
     // with no Q adjustment.
     segment = active_segments - 1;
     while (segment > 0) {
-      if (projected_rate <
-          (target_rate * aq_c_transitions[aq_strength][segment])) {
+      if ((projected_rate <
+          target_rate * aq_c_transitions[aq_strength][segment]) &&
+          (logvar < aq_c_var_thresholds[segment])) {
         break;
       }
       --segment;
diff --git a/vp9/encoder/vp9_aq_complexity.h b/vp9/encoder/vp9_aq_complexity.h
index af031a4..6f82aac 100644
--- a/vp9/encoder/vp9_aq_complexity.h
+++ b/vp9/encoder/vp9_aq_complexity.h
@@ -19,10 +19,10 @@
 struct VP9_COMP;
 
 // Select a segment for the current SB64.
-void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, int mi_row, int mi_col,
+void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, BLOCK_SIZE bs,
+                                   int mi_row, int mi_col,
                                    int output_enabled, int projected_rate);
 
-
 // This function sets up a set of segments with delta Q values around
 // the baseline frame quantizer.
 void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c
index 7ee9662..144936d 100644
--- a/vp9/encoder/vp9_aq_variance.c
+++ b/vp9/encoder/vp9_aq_variance.c
@@ -25,12 +25,10 @@
 #define ENERGY_IN_BOUNDS(energy)\
   assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
 
-static double q_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
-static double rdmult_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
-static int segment_id[MAX_SEGMENTS] = { 5, 3, 1, 0, 2, 4, 6, 7 };
+static const double rate_ratio[MAX_SEGMENTS] =
+  {1.143, 1.0, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0};
+static const int segment_id[ENERGY_SPAN] = {0, 1, 2};
 
-#define Q_RATIO(i) q_ratio[(i) - ENERGY_MIN]
-#define RDMULT_RATIO(i) rdmult_ratio[(i) - ENERGY_MIN]
 #define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
 
 DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};
@@ -40,39 +38,12 @@
 
 unsigned int vp9_vaq_segment_id(int energy) {
   ENERGY_IN_BOUNDS(energy);
-
   return SEGMENT_ID(energy);
 }
 
-double vp9_vaq_rdmult_ratio(int energy) {
-  ENERGY_IN_BOUNDS(energy);
-
-  vp9_clear_system_state();
-
-  return RDMULT_RATIO(energy);
-}
-
-void vp9_vaq_init() {
-  int i;
-  double base_ratio;
-
-  assert(ENERGY_SPAN <= MAX_SEGMENTS);
-
-  vp9_clear_system_state();
-
-  base_ratio = 1.5;
-
-  for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
-    Q_RATIO(i) = pow(base_ratio, i/3.0);
-  }
-}
-
 void vp9_vaq_frame_setup(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   struct segmentation *seg = &cm->seg;
-  const double base_q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
-  const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
-                                              cm->y_dc_delta_q);
   int i;
 
   if (cm->frame_type == KEY_FRAME ||
@@ -83,26 +54,28 @@
 
     seg->abs_delta = SEGMENT_DELTADATA;
 
-  vp9_clear_system_state();
+    vp9_clear_system_state();
 
-    for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
-      int qindex_delta, segment_rdmult;
+    for (i = 0; i < MAX_SEGMENTS; ++i) {
+      int qindex_delta =
+          vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+                                     rate_ratio[i], cm->bit_depth);
 
-      if (Q_RATIO(i) == 1) {
-        // No need to enable SEG_LVL_ALT_Q for this segment
-        RDMULT_RATIO(i) = 1;
+      // We don't allow qindex 0 in a segment if the base value is not 0.
+      // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+      // Q delta is sometimes applied without going back around the rd loop.
+      // This could lead to an illegal combination of partition size and q.
+      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -cm->base_qindex + 1;
+      }
+
+      // No need to enable SEG_LVL_ALT_Q for this segment.
+      if (rate_ratio[i] == 1.0) {
         continue;
       }
 
-      qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i),
-                                        cm->bit_depth);
-      vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
-      vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
-
-      segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
-                                           cm->y_dc_delta_q);
-
-      RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
+      vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+      vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
     }
   }
 }
@@ -159,12 +132,19 @@
   }
 }
 
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  unsigned int var = block_variance(cpi, x, bs);
+  vp9_clear_system_state();  // Called before the floating-point log() below.
+  return log(var + 1.0);  // log(variance + 1): finite even when var == 0.
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
   double energy;
-  unsigned int var = block_variance(cpi, x, bs);
-
+  double energy_midpoint;
   vp9_clear_system_state();
-
-  energy = 0.9 * (log(var + 1.0) - 10.0);
+  energy_midpoint =
+    (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
+  energy = vp9_log_block_var(cpi, x, bs) - energy_midpoint;
   return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
 }
diff --git a/vp9/encoder/vp9_aq_variance.h b/vp9/encoder/vp9_aq_variance.h
index 73ce2eb..a0effa3 100644
--- a/vp9/encoder/vp9_aq_variance.h
+++ b/vp9/encoder/vp9_aq_variance.h
@@ -19,12 +19,10 @@
 #endif
 
 unsigned int vp9_vaq_segment_id(int energy);
-double vp9_vaq_rdmult_ratio(int energy);
-
-void vp9_vaq_init();
 void vp9_vaq_frame_setup(VP9_COMP *cpi);
 
 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 31683da..4927259 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -807,10 +807,8 @@
   struct macroblockd_plane *const pd = xd->plane;
   const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
   int i, orig_rdmult;
-  double rdmult_ratio;
 
   vp9_clear_system_state();
-  rdmult_ratio = 1.0;  // avoid uninitialized warnings
 
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
   x->use_lp32x32fdct = 1;
@@ -851,6 +849,7 @@
   if (aq_mode == VARIANCE_AQ) {
     const int energy = bsize <= BLOCK_16X16 ? x->mb_energy
                                             : vp9_block_energy(cpi, x, bsize);
+    int segment_qindex;
     if (cm->frame_type == KEY_FRAME ||
         cpi->refresh_alt_ref_frame ||
         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
@@ -860,11 +859,11 @@
                                                     : cm->last_frame_seg_map;
       mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
-
-    rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
     vp9_init_plane_quantizers(cpi, x);
     vp9_clear_system_state();
-    x->rdmult = (int)round(x->rdmult * rdmult_ratio);
+    segment_qindex = vp9_get_qindex(&cm->seg, mbmi->segment_id,
+                                    cm->base_qindex);
+    x->rdmult = vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
   } else if (aq_mode == COMPLEXITY_AQ) {
     const int mi_offset = mi_row * cm->mi_cols + mi_col;
     unsigned char complexity = cpi->complexity_map[mi_offset];
@@ -898,12 +897,6 @@
     }
   }
 
-  if (aq_mode == VARIANCE_AQ && rd_cost->rate != INT_MAX) {
-    vp9_clear_system_state();
-    rd_cost->rate = (int)round(rd_cost->rate * rdmult_ratio);
-    rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
-  }
-
   x->rdmult = orig_rdmult;
 
   // TODO(jingning) The rate-distortion optimization flow needs to be
@@ -1696,7 +1689,7 @@
     // and and if necessary apply a Q delta using segmentation to get
     // closer to the target.
     if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
-      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col,
+      vp9_select_in_frame_q_segment(cpi, bsize, mi_row, mi_col,
                                     output_enabled, chosen_rdc.rate);
     }
     encode_sb(cpi, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
@@ -2435,7 +2428,7 @@
     // and and if necessary apply a Q delta using segmentation to get
     // closer to the target.
     if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map)
-      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
+      vp9_select_in_frame_q_segment(cpi, bsize, mi_row, mi_col, output_enabled,
                                     best_rdc.rate);
     encode_sb(cpi, tile_info, tp, mi_row, mi_col, output_enabled,
               bsize, pc_tree);
@@ -2940,7 +2933,7 @@
     // and and if necessary apply a Q delta using segmentation to get
     // closer to the target.
     if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
-      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
+      vp9_select_in_frame_q_segment(cpi, bsize, mi_row, mi_col, output_enabled,
                                     best_rdc.rate);
     }
     encode_sb_rt(cpi, tile_info, tp, mi_row, mi_col, output_enabled,
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index e1022c4..877e2c3 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3720,10 +3720,6 @@
     set_frame_size(cpi);
   }
 
-  if (oxcf->aq_mode == VARIANCE_AQ) {
-    vp9_vaq_init();
-  }
-
   for (i = 0; i < MAX_REF_FRAMES; ++i)
     cpi->scaled_ref_idx[i] = INVALID_REF_BUFFER_IDX;
 
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 2e33776..8f14d4c 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -2457,6 +2457,15 @@
 
   rc->base_frame_target = target_rate;
 
+  {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+    // The multiplication by 256 reverses a scaling factor of (>> 8)
+    // applied when combining MB error values for the frame.
+    twopass->mb_av_energy =
+      log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+  }
+
   // Update the total stats remaining structure.
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 0a8f756..a8e4987 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -96,6 +96,7 @@
   double modified_error_min;
   double modified_error_max;
   double modified_error_left;
+  double mb_av_energy;
 
 #if CONFIG_FP_MB_STATS
   uint8_t *frame_mb_stats_buf;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 23575b0..3d1d1b4 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -460,6 +460,10 @@
   {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
 };
 
+static const PREDICTION_MODE intra_mode_list[] = {
+  DC_PRED, V_PRED, H_PRED, TM_PRED
+};
+
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
 // this needs various further optimizations. to be continued..
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -796,11 +800,11 @@
   // threshold.
   if (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
       bsize <= cpi->sf.max_intra_bsize) {
-    PREDICTION_MODE this_mode;
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
     const TX_SIZE intra_tx_size =
         MIN(max_txsize_lookup[bsize],
             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    int i;
 
     if (reuse_inter_pred && best_pred != NULL) {
       if (best_pred->data == orig_dst.buf) {
@@ -813,17 +817,18 @@
     }
     pd->dst = orig_dst;
 
-    // Change the limit of this loop to add other intra prediction
-    // mode tests.
-    for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
+    for (i = 0; i < 4; ++i) {
       const TX_SIZE saved_tx_size = mbmi->tx_size;
+      const PREDICTION_MODE this_mode = intra_mode_list[i];
+      if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size]))
+        continue;
+      skip_txfm = x->skip_txfm[0];
       args.mode = this_mode;
       args.rate = 0;
       args.dist = 0;
       mbmi->tx_size = intra_tx_size;
       vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                              estimate_block_intra, &args);
-      mbmi->tx_size = saved_tx_size;
       this_rdc.rate = args.rate;
       this_rdc.dist = args.dist;
       this_rdc.rate += cpi->mbmode_cost[this_mode];
@@ -840,6 +845,7 @@
         mbmi->mv[0].as_int = INVALID_MV;
       } else {
         x->skip_txfm[0] = skip_txfm;
+        mbmi->tx_size = saved_tx_size;
       }
     }
   }
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 6b9572f..27c8829 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -304,16 +304,19 @@
     // This feature is only enabled when partition search is disabled.
     sf->reuse_inter_pred_sby = 1;
     sf->partition_search_breakout_rate_thr = 200;
+    if (!is_keyframe) {
+      int i;
+      if (content == VP9E_CONTENT_SCREEN) {
+        for (i = 0; i < TX_SIZES; ++i)
+          sf->intra_y_mode_mask[i] = INTRA_DC_TM_H_V;
+      } else {
+        for (i = 0; i < TX_SIZES; i++)
+          sf->intra_y_mode_mask[i] = INTRA_DC;
+      }
+    }
   }
 
   if (speed >= 6) {
-    if (content == VP9E_CONTENT_SCREEN) {
-      int i;
-      // Allow fancy modes at all sizes since SOURCE_VAR_BASED_PARTITION is used
-      for (i = 0; i < BLOCK_SIZES; ++i)
-        sf->inter_mode_mask[i] = INTER_NEAREST_NEAR_NEW;
-    }
-
     // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
     sf->partition_search_type = VAR_BASED_PARTITION;
     sf->search_type_check_frequency = 50;
diff --git a/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000..c245cca
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@@ -0,0 +1,84 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "vp9/common/vp9_common.h"
+
+// Returns the sum of squared differences between coeff[] and dqcoeff[] over
+// block_size entries and stores the sum of squared coeff[] values in *ssz.
+// Both sums are rounded and shifted right by 2 * (bps - 8) bits.
+//
+// Coefficients are read eight at a time with aligned loads, so both arrays
+// must be 16-byte aligned and block_size must be a multiple of 8.
+int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  int i, j, test;
+  uint32_t temp[4];
+  __m128i cmp0, cmp1, cmp2, cmp3;
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+  // Inclusive limits of the 15-bit signed range accepted by the fast path.
+  // They are loop invariant, so build them once.
+  const __m128i max = _mm_set1_epi32(0x3fff);
+  const __m128i min = _mm_set1_epi32(-0x4000);
+
+  for (i = 0; i < block_size; i += 8) {
+    // Load the data into xmm registers
+    __m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
+    __m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
+    __m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
+    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
+    // Check if any values require more than 15 bit
+    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+            _mm_cmplt_epi32(mm_coeff, min));
+    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+            _mm_cmplt_epi32(mm_coeff2, min));
+    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+            _mm_cmplt_epi32(mm_dqcoeff, min));
+    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+            _mm_cmplt_epi32(mm_dqcoeff2, min));
+    test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
+            _mm_or_si128(cmp2, cmp3)));
+
+    if (!test) {
+      __m128i mm_diff, error_sse2, sqcoeff_sse2;
+      // Every value fits in 15 bits, so packing to 16 bits is lossless and
+      // the 16-bit difference cannot wrap.
+      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+      _mm_storeu_si128((__m128i*)temp, error_sse2);
+      error = error + temp[0] + temp[1] + temp[2] + temp[3];
+      _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
+      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+    } else {
+      // Scalar fallback for groups holding larger values; widen before
+      // subtracting so the difference is formed in 64 bits.
+      for (j = 0; j < 8; j++) {
+        const int64_t diff = (int64_t)coeff[i + j] - dqcoeff[i + j];
+        error += diff * diff;
+        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+      }
+    }
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
new file mode 100644
index 0000000..55c6ed7
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
@@ -0,0 +1,209 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "vp9/common/vp9_common.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// from vp9_idct.h: typedef int32_t tran_low_t;
+
+// Quantizes count coefficients from coeff_ptr into qcoeff_ptr and writes the
+// matching dequantized values to dqcoeff_ptr.
+//
+// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr and dequant_ptr are read
+// at index 0 for coefficient 0 and at index 1 for every other coefficient.
+// zbin_oq_value is added to both zbin entries. *eob_ptr receives one plus
+// the largest iscan[] value of a coefficient that quantized to non-zero, or
+// 0 when there is none (always the case when skip_block is set). scan is
+// unused.
+//
+// Coefficients are read four at a time with aligned loads, so coeff_ptr must
+// be 16-byte aligned; only the first 4 * (count / 4) of them are quantized.
+void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
+                                intptr_t count,
+                                int skip_block,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr,
+                                tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr,
+                                int zbin_oq_value,
+                                uint16_t *eob_ptr,
+                                const int16_t *scan,
+                                const int16_t *iscan) {
+  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+  __m128i zbins[2];
+  __m128i nzbins[2];
+
+  // zbins[0] serves the first register (lane 0 holds coefficient 0);
+  // zbins[1] serves every later register.
+  zbins[0] = _mm_set_epi32((int)(zbin_ptr[1] + zbin_oq_value),
+                           (int)(zbin_ptr[1] + zbin_oq_value),
+                           (int)(zbin_ptr[1] + zbin_oq_value),
+                           (int)(zbin_ptr[0] + zbin_oq_value));
+  zbins[1] = _mm_set1_epi32((int)(zbin_ptr[1] + zbin_oq_value));
+
+  // nzbins = -zbins
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  (void)scan;
+
+  vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: walking back from the end, drop every register whose
+    // four coefficients all lie strictly between -zbin and zbin.
+    for (i = ((int)count / 4) - 1; i >= 0; i--) {
+      __m128i coeffs, cmp1, cmp2;
+      int test;
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+      cmp1 = _mm_and_si128(cmp1, cmp2);
+      test = _mm_movemask_epi8(cmp1);
+      if (test == 0xffff)
+        non_zero_regs--;
+      else
+        break;
+    }
+
+    // Quantization pass:
+    for (i = 0; i < non_zero_regs; i++) {
+      __m128i coeffs, coeffs_sign, tmp1, tmp2;
+      int test;
+      int abs_coeff[4];
+      int coeff_sign[4];
+
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      // coeffs_sign is all ones for negative lanes; xor then subtract
+      // yields the absolute value.
+      coeffs_sign = _mm_srai_epi32(coeffs, 31);
+      coeffs = _mm_sub_epi32(
+            _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+      // Lanes with |coeff| >= zbin are the ones to quantize.
+      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+      tmp1 = _mm_or_si128(tmp1, tmp2);
+      test = _mm_movemask_epi8(tmp1);
+      _mm_storeu_si128((__m128i*)abs_coeff, coeffs);
+      _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);
+
+      for (j = 0; j < 4; j++) {
+        // movemask yields four bits per lane; bit 4 * j is lane j.
+        if (test & (1 << (4 * j))) {
+          int k = 4 * i + j;
+          int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],
+                              INT32_MIN, INT32_MAX);
+          tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *
+                    quant_shift_ptr[k != 0]) >> 16;  // quantization
+          qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];
+          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+          if (tmp)
+            eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+        }
+      }
+    }
+  }
+  *eob_ptr = eob_i + 1;
+}
+
+
+// Variant of vp9_highbd_quantize_b_sse2 that applies ROUND_POWER_OF_TWO(x, 1)
+// to the zbin and round values, uses a final shift of 15 instead of 16 bits
+// and divides the dequantized value by 2.
+//
+// Parameters are as for vp9_highbd_quantize_b_sse2. n_coeffs must not exceed
+// 1024, the capacity of the local index buffer.
+void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
+                                      intptr_t n_coeffs,
+                                      int skip_block,
+                                      const int16_t *zbin_ptr,
+                                      const int16_t *round_ptr,
+                                      const int16_t *quant_ptr,
+                                      const int16_t *quant_shift_ptr,
+                                      tran_low_t *qcoeff_ptr,
+                                      tran_low_t *dqcoeff_ptr,
+                                      const int16_t *dequant_ptr,
+                                      int zbin_oq_value,
+                                      uint16_t *eob_ptr,
+                                      const int16_t *scan,
+                                      const int16_t *iscan) {
+  __m128i zbins[2];
+  __m128i nzbins[2];
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  // zbin_oq_value is folded in here, before the rounding shift; it must not
+  // be added a second time when the comparison vectors are built.
+  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
+  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
+  (void)scan;
+  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+  zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: record the index of every coefficient that is not
+    // strictly between -zbin and zbin.
+    for (i = 0; i < n_coeffs / 4; i++) {
+      __m128i coeffs, cmp1, cmp2;
+      int test;
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+      cmp1 = _mm_and_si128(cmp1, cmp2);
+      test = _mm_movemask_epi8(cmp1);
+      if (!(test & 0xf))
+        idx_arr[idx++] = i * 4;
+      if (!(test & 0xf0))
+        idx_arr[idx++] = i * 4 + 1;
+      if (!(test & 0xf00))
+        idx_arr[idx++] = i * 4 + 2;
+      if (!(test & 0xf000))
+        idx_arr[idx++] = i * 4 + 3;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = idx_arr[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      int64_t tmp = clamp(abs_coeff +
+                          ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                          INT32_MIN, INT32_MAX);
+      tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+               quant_shift_ptr[rc != 0]) >> 15;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+      if (tmp)
+        eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index dad994c..05c1731 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -105,6 +105,8 @@
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)