Merge "ANS: Remove extra buffer size checks causing a false decode error." into nextgenv2
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index a3f0152..48edf1e 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -15,12 +15,16 @@
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "test/util.h"
 #if CONFIG_VP10
 #include "vp10/common/blockd.h"
 #elif CONFIG_VP9
 #include "vp9/common/vp9_blockd.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#define USE_SPEED_TEST (0)  // Set to 1 to enable the speed test below.
 
 typedef void (*SubtractFunc)(int rows, int cols,
                              int16_t *diff_ptr, ptrdiff_t diff_stride,
@@ -108,4 +112,151 @@
 INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest,
                         ::testing::Values(vpx_subtract_block_msa));
 #endif
+
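+// High bitdepth subtract: writes diff = src - pred for a rows x cols block.
+// src and pred are CONVERT_TO_BYTEPTR() wrappers around uint16_t buffers.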
+typedef void (*HBDSubtractFunc)(int rows, int cols,
+                                int16_t *diff_ptr, ptrdiff_t diff_stride,
+                                const uint8_t *src_ptr, ptrdiff_t src_stride,
+                                const uint8_t *pred_ptr, ptrdiff_t pred_stride,
+                                int bd);
+
+using ::std::tr1::get;
+using ::std::tr1::make_tuple;
+using ::std::tr1::tuple;
+
+// <width, height, bit_depth, subtract>
+typedef tuple<int, int, int, HBDSubtractFunc> Params;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class VP10HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
+ public:
+  virtual void SetUp() {
+    block_width_ = GET_PARAM(0);
+    block_height_ = GET_PARAM(1);
+    bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(2));
+    func_ = GET_PARAM(3);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+
+    const size_t max_width = 128;
+    const size_t max_block_size = max_width * max_width;
+    src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, max_block_size * sizeof(uint16_t))));
+    pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, max_block_size * sizeof(uint16_t))));
+    diff_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, max_block_size * sizeof(int16_t)));
+  }
+
+  virtual void TearDown() {
+    vpx_free(CONVERT_TO_SHORTPTR(src_));
+    vpx_free(CONVERT_TO_SHORTPTR(pred_));
+    vpx_free(diff_);
+  }
+
+ protected:
+  void RunForSpeed();
+  void CheckResult();
+
+ private:
+  ACMRandom rnd_;
+  int block_height_;
+  int block_width_;
+  vpx_bit_depth_t bit_depth_;
+  HBDSubtractFunc func_;
+  uint8_t *src_;
+  uint8_t *pred_;
+  int16_t *diff_;
+};
+
+void VP10HBDSubtractBlockTest::RunForSpeed() {
+  const int test_num = 200000;
+  const int max_width = 128;
+  const int max_block_size = max_width * max_width;
+  const int mask = (1 << bit_depth_) - 1;
+  int i, j;
+
+  for (j = 0; j < max_block_size; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  for (i = 0; i < test_num; ++i) {
+    func_(block_height_, block_width_, diff_, block_width_,
+          src_, block_width_, pred_, block_width_, bit_depth_);
+  }
+}
+
+void VP10HBDSubtractBlockTest::CheckResult() {
+  const int test_num = 100;
+  const int max_width = 128;
+  const int max_block_size = max_width * max_width;
+  const int mask = (1 << bit_depth_) - 1;
+  int i, j;
+
+  for (i = 0; i < test_num; ++i) {
+    for (j = 0; j < max_block_size; ++j) {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+      CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+    }
+
+    func_(block_height_, block_width_, diff_, block_width_,
+          src_, block_width_, pred_, block_width_, bit_depth_);
+
+    for (int r = 0; r < block_height_; ++r) {
+      for (int c = 0; c < block_width_; ++c) {
+        EXPECT_EQ(diff_[r * block_width_ + c],
+                  (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+                   CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+            << "r = " << r << ", c = " << c << ", test: " << i;
+      }
+    }
+  }
+}
+
+TEST_P(VP10HBDSubtractBlockTest, CheckResult) {
+  CheckResult();
+}
+
+#if USE_SPEED_TEST
+TEST_P(VP10HBDSubtractBlockTest, CheckSpeed) {
+  RunForSpeed();
+}
+#endif  // USE_SPEED_TEST
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, VP10HBDSubtractBlockTest, ::testing::Values(
+    make_tuple(4, 4, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(4, 4, 12, vpx_highbd_subtract_block_c),
+    make_tuple(4, 8, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(4, 8, 12, vpx_highbd_subtract_block_c),
+    make_tuple(8, 4, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(8, 4, 12, vpx_highbd_subtract_block_c),
+    make_tuple(8, 8, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(8, 8, 12, vpx_highbd_subtract_block_c),
+    make_tuple(8, 16, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(8, 16, 12, vpx_highbd_subtract_block_c),
+    make_tuple(16, 8, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(16, 8, 12, vpx_highbd_subtract_block_c),
+    make_tuple(16, 16, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(16, 16, 12, vpx_highbd_subtract_block_c),
+    make_tuple(16, 32, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(16, 32, 12, vpx_highbd_subtract_block_c),
+    make_tuple(32, 16, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(32, 16, 12, vpx_highbd_subtract_block_c),
+    make_tuple(32, 32, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(32, 32, 12, vpx_highbd_subtract_block_c),
+    make_tuple(32, 64, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(32, 64, 12, vpx_highbd_subtract_block_c),
+    make_tuple(64, 32, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(64, 32, 12, vpx_highbd_subtract_block_c),
+    make_tuple(64, 64, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(64, 64, 12, vpx_highbd_subtract_block_c),
+    make_tuple(64, 128, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(64, 128, 12, vpx_highbd_subtract_block_c),
+    make_tuple(128, 64, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(128, 64, 12, vpx_highbd_subtract_block_c),
+    make_tuple(128, 128, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(128, 128, 12, vpx_highbd_subtract_block_c)));
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index 47a3219..b06a5e9 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -19,6 +19,23 @@
 #include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"
 
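+// Extra down-shift (log2 scale) to apply to dequantized coefficients:
+// one bit for 32x32 transforms, except for 10-bit high-bitdepth content,
+// where no extra shift is needed. tx_type is currently unused.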
+int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
+                 const TX_SIZE tx_size) {
+  (void) tx_type;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (xd->bd == BITDEPTH_10) {
+      return 0;
+    } else {
+      return tx_size == TX_32X32;
+    }
+  }
+#else
+  (void)xd;
+#endif
+  return tx_size == TX_32X32;
+}
+
 #if CONFIG_EXT_TX
 static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
   int i;
diff --git a/vp10/common/idct.h b/vp10/common/idct.h
index 31b26b8..ffdad0c 100644
--- a/vp10/common/idct.h
+++ b/vp10/common/idct.h
@@ -14,6 +14,7 @@
 #include <assert.h>
 
 #include "./vpx_config.h"
+#include "vp10/common/blockd.h"
 #include "vp10/common/common.h"
 #include "vp10/common/enums.h"
 #include "vpx_dsp/inv_txfm.h"
@@ -48,6 +49,10 @@
 } highbd_transform_2d;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
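+// Upper bound on the value returned by get_tx_scale().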
+#define MAX_TX_SCALE 1
+int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
+                 const TX_SIZE tx_size);
+
 void vp10_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob);
 void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 7b20239..1d227dd 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -662,11 +662,11 @@
   add_proto qw/int64_t vp10_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
   specialize qw/vp10_highbd_block_error sse2/;
 
-  add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const int log_scale";
   specialize qw/vp10_highbd_quantize_fp/;
 
-  add_proto qw/void vp10_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_highbd_quantize_fp_32x32/;
+  add_proto qw/void vp10_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const int log_scale";
+  specialize qw/vp10_highbd_quantize_b/;
 
   # fdct functions
   add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index b8d409a..58cd9e6 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c
@@ -15,9 +15,7 @@
 #include "vp10/common/blockd.h"
 #include "vp10/common/common.h"
 #include "vp10/common/entropy.h"
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
 #include "vp10/common/idct.h"
-#endif
 
 #include "vp10/decoder/detokenize.h"
 
@@ -113,15 +111,7 @@
   cat6_prob = vp10_cat6_prob;
 #endif
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH && xd->bd == BITDEPTH_10) {
-    dq_shift = 0;
-  } else {
-    dq_shift = (tx_size == TX_32X32);
-  }
-#else
-  dq_shift = (tx_size == TX_32X32);
-#endif
+  dq_shift = get_tx_scale(xd, 0, tx_size);
 
   while (c < max_eob) {
     int val = -1;
@@ -257,15 +247,7 @@
   const uint8_t *cat5_prob;
   const uint8_t *cat6_prob;
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH && xd->bd == BITDEPTH_10) {
-    dq_shift = 0;
-  } else {
-    dq_shift = (tx_size == TX_32X32);
-  }
-#else
-  dq_shift = (tx_size == TX_32X32);
-#endif
+  dq_shift = get_tx_scale(xd, 0, tx_size);
 
   if (counts) {
     coef_counts = counts->coef[tx_size][type][ref];
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index da1885d..7f3b6a2 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -2313,8 +2313,8 @@
     vp10_copy(eob_counts_copy, cm->counts.eob_branch);
     for (i = 1; i <= cpi->common.coef_probs_update_idx; ++i) {
       for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-        full_to_model_counts(cm->counts.coef[tx_size],
-                             subframe_stats->coef_counts_buf[i][tx_size]);
+        vp10_full_to_model_counts(cm->counts.coef[tx_size],
+                                  subframe_stats->coef_counts_buf[i][tx_size]);
       vp10_copy(cm->counts.eob_branch, subframe_stats->eob_counts_buf[i]);
       vp10_partial_adapt_probs(cm, 0, 0);
       vp10_copy(subframe_stats->coef_probs_buf[i], cm->fc->coef_probs);
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 88e9486..06463c1 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -49,6 +49,12 @@
 #include "vp10/encoder/segmentation.h"
 #include "vp10/encoder/tokenize.h"
 
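+// IF_HBD(...) expands to its arguments only in high-bitdepth builds, so a
+// trailing highbd parameter can be passed conditionally, e.g.
+//   avg_4x4(src, stride IF_HBD(, highbd));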
+#if CONFIG_VP9_HIGHBITDEPTH
+# define IF_HBD(...) __VA_ARGS__
+#else
+# define IF_HBD(...)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void encode_superblock(VP10_COMP *cpi, ThreadData * td,
                               TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -413,234 +419,102 @@
   }
 }
 
-typedef struct {
-  int64_t sum_square_error;
-  int64_t sum_error;
-  int log2_count;
-  int variance;
-} var;
-
-typedef struct {
-  var none;
-  var horz[2];
-  var vert[2];
-} partition_variance;
-
-typedef struct {
-  partition_variance part_variances;
-  var split[4];
-} v4x4;
-
-typedef struct {
-  partition_variance part_variances;
-  v4x4 split[4];
-} v8x8;
-
-typedef struct {
-  partition_variance part_variances;
-  v8x8 split[4];
-} v16x16;
-
-typedef struct {
-  partition_variance part_variances;
-  v16x16 split[4];
-} v32x32;
-
-typedef struct {
-  partition_variance part_variances;
-  v32x32 split[4];
-} v64x64;
-
-#if CONFIG_EXT_PARTITION
-typedef struct {
-  partition_variance part_variances;
-  v64x64 split[4];
-} v128x128;
-#endif  // CONFIG_EXT_PARTITION
-
-typedef struct {
-  partition_variance *part_variances;
-  var *split[4];
-} variance_node;
-
-typedef enum {
-  V16X16,
-  V32X32,
-  V64X64,
-#if CONFIG_EXT_PARTITION
-  V128X128,
-#endif  // CONFIG_EXT_PARTITION
-} TREE_LEVEL;
-
-static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
-  int i;
-  node->part_variances = NULL;
-  switch (bsize) {
-#if CONFIG_EXT_PARTITION
-    case BLOCK_128X128: {
-      v128x128 *vt = (v128x128 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-#endif  // CONFIG_EXT_PARTITION
-    case BLOCK_64X64: {
-      v64x64 *vt = (v64x64 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_32X32: {
-      v32x32 *vt = (v32x32 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_16X16: {
-      v16x16 *vt = (v16x16 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_8X8: {
-      v8x8 *vt = (v8x8 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_4X4: {
-      v4x4 *vt = (v4x4 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i];
-      break;
-    }
-    default: {
-      assert(0);
-      break;
-    }
-  }
-}
-
-// Set variance values given sum square error, sum error, count.
-static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
-  v->sum_square_error = s2;
-  v->sum_error = s;
-  v->log2_count = c;
-}
-
-static void get_variance(var *v) {
-  v->variance = (int)(256 * (v->sum_square_error -
-      ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
-}
-
-static void sum_2_variances(const var *a, const var *b, var *r) {
-  assert(a->log2_count == b->log2_count);
-  fill_variance(a->sum_square_error + b->sum_square_error,
-                a->sum_error + b->sum_error, a->log2_count + 1, r);
-}
-
-static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
-  variance_node node;
-  memset(&node, 0, sizeof(node));
-  tree_to_node(data, bsize, &node);
-  sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
-  sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
-  sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
-  sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
-  sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
-                  &node.part_variances->none);
-}
-
-static int set_vt_partitioning(VP10_COMP *cpi,
+static void set_vt_partitioning(VP10_COMP *cpi,
                                MACROBLOCK *const x,
                                MACROBLOCKD *const xd,
-                               void *data,
-                               BLOCK_SIZE bsize,
+                               VAR_TREE *vt,
                                int mi_row,
                                int mi_col,
-                               int64_t threshold,
-                               BLOCK_SIZE bsize_min,
-                               int force_split) {
+                               const int64_t *const threshold,
+                               const BLOCK_SIZE *const bsize_min) {
   VP10_COMMON * const cm = &cpi->common;
-  variance_node vt;
-  const int block_width = num_8x8_blocks_wide_lookup[bsize];
-  const int block_height = num_8x8_blocks_high_lookup[bsize];
-  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  const int hbw = num_8x8_blocks_wide_lookup[vt->bsize] / 2;
+  const int hbh = num_8x8_blocks_high_lookup[vt->bsize] / 2;
+  const int has_cols = mi_col + hbw < cm->mi_cols;
+  const int has_rows = mi_row + hbh < cm->mi_rows;
 
-  assert(block_height == block_width);
-  tree_to_node(data, bsize, &vt);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
 
-  if (force_split == 1)
-    return 0;
+  assert(vt->bsize >= BLOCK_8X8);
+  assert(hbh == hbw);
+
+  if (vt->force_split || (!has_cols && !has_rows))
+    goto split;
 
   // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
   // variance is below threshold, otherwise split will be selected.
   // No check for vert/horiz split as too few samples for variance.
-  if (bsize == bsize_min) {
-    // Variance already computed to set the force_split.
-    if (low_res || cm->frame_type == KEY_FRAME)
-      get_variance(&vt.part_variances->none);
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < threshold) {
-      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
-      return 1;
+  if (vt->bsize == bsize_min[0]) {
+    if (has_cols && has_rows &&
+        vt->variances.none.variance < threshold[0]) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+      return;
+    } else {
+      BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_SPLIT);
+      set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+      if (vt->bsize > BLOCK_8X8) {
+        set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+        set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row + hbh, mi_col + hbw, subsize);
+      }
+      return;
     }
-    return 0;
-  } else if (bsize > bsize_min) {
-    // Variance already computed to set the force_split.
-    if (low_res || cm->frame_type == KEY_FRAME)
-      get_variance(&vt.part_variances->none);
+  } else if (vt->bsize > bsize_min[0]) {
     // For key frame: take split for bsize above 32X32 or very high variance.
     if (cm->frame_type == KEY_FRAME &&
-        (bsize > BLOCK_32X32 ||
-        vt.part_variances->none.variance > (threshold << 4))) {
-      return 0;
+        (vt->bsize > BLOCK_32X32 ||
+        vt->variances.none.variance > (threshold[0] << 4))) {
+      goto split;
     }
     // If variance is low, take the bsize (no split).
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < threshold) {
-      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
-      return 1;
+    if (has_cols && has_rows &&
+        vt->variances.none.variance < threshold[0]) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+      return;
     }
 
     // Check vertical split.
-    if (mi_row + block_height / 2 < cm->mi_rows) {
-      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
-      get_variance(&vt.part_variances->vert[0]);
-      get_variance(&vt.part_variances->vert[1]);
-      if (vt.part_variances->vert[0].variance < threshold &&
-          vt.part_variances->vert[1].variance < threshold &&
+    if (has_rows) {
+      BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_VERT);
+      if (vt->variances.vert[0].variance < threshold[0] &&
+          vt->variances.vert[1].variance < threshold[0] &&
           get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
         set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
-        set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
-        return 1;
+        set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+        return;
       }
     }
     // Check horizontal split.
-    if (mi_col + block_width / 2 < cm->mi_cols) {
-      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
-      get_variance(&vt.part_variances->horz[0]);
-      get_variance(&vt.part_variances->horz[1]);
-      if (vt.part_variances->horz[0].variance < threshold &&
-          vt.part_variances->horz[1].variance < threshold &&
+    if (has_cols) {
+      BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_HORZ);
+      if (vt->variances.horz[0].variance < threshold[0] &&
+          vt->variances.horz[1].variance < threshold[0] &&
           get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
         set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
-        set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
-        return 1;
+        set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+        return;
       }
     }
-
-    return 0;
   }
-  return 0;
+
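+  // No non-split partition fits: recurse into the four quadrants, advancing
+  // the per-level threshold and bsize_min pointers by one entry.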
+split:
+  {
+    set_vt_partitioning(cpi, x, xd, vt->split[0],
+                        mi_row, mi_col,
+                        threshold + 1, bsize_min + 1);
+    set_vt_partitioning(cpi, x, xd, vt->split[1],
+                        mi_row, mi_col + hbw,
+                        threshold + 1, bsize_min + 1);
+    set_vt_partitioning(cpi, x, xd, vt->split[2],
+                        mi_row + hbh, mi_col,
+                        threshold + 1, bsize_min + 1);
+    set_vt_partitioning(cpi, x, xd, vt->split[3],
+                        mi_row + hbh, mi_col + hbw,
+                        threshold + 1, bsize_min + 1);
+    return;
+  }
 }
 
 // Set the variance split thresholds for following the block sizes:
@@ -654,23 +528,24 @@
   const int64_t threshold_base = (int64_t)(threshold_multiplier *
       cpi->y_dequant[q][1]);
   if (is_key_frame) {
-    thresholds[0] = threshold_base;
-    thresholds[1] = threshold_base >> 2;
-    thresholds[2] = threshold_base >> 2;
-    thresholds[3] = threshold_base << 2;
-  } else {
     thresholds[1] = threshold_base;
+    thresholds[2] = threshold_base >> 2;
+    thresholds[3] = threshold_base >> 2;
+    thresholds[4] = threshold_base << 2;
+  } else {
+    thresholds[2] = threshold_base;
     if (cm->width <= 352 && cm->height <= 288) {
-      thresholds[0] = threshold_base >> 2;
-      thresholds[2] = threshold_base << 3;
+      thresholds[1] = threshold_base >> 2;
+      thresholds[3] = threshold_base << 3;
     } else {
-      thresholds[0] = threshold_base;
-      thresholds[1] = (5 * threshold_base) >> 2;
+      thresholds[1] = threshold_base;
+      thresholds[2] = (5 * threshold_base) >> 2;
       if (cm->width >= 1920 && cm->height >= 1080)
-        thresholds[1] = (7 * threshold_base) >> 2;
-      thresholds[2] = threshold_base << cpi->oxcf.speed;
+        thresholds[2] = (7 * threshold_base) >> 2;
+      thresholds[3] = threshold_base << cpi->oxcf.speed;
     }
   }
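+  // Level 0 corresponds to 128x128 blocks; no variance is ever below
+  // INT64_MIN, so the top level always splits.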
+  thresholds[0] = INT64_MIN;
 }
 
 void vp10_set_variance_partition_thresholds(VP10_COMP *cpi, int q) {
@@ -699,10 +574,10 @@
 }
 
 // Compute the minmax over the 8x8 subblocks.
-static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
-                              int dp, int x16_idx, int y16_idx,
+static int compute_minmax_8x8(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
 #if CONFIG_VP9_HIGHBITDEPTH
-                              int highbd_flag,
+                              int highbd,
 #endif
                               int pixels_wide,
                               int pixels_high) {
@@ -711,24 +586,26 @@
   int minmax_min = 255;
   // Loop over the 4 8x8 subblocks.
   for (k = 0; k < 4; k++) {
-    int x8_idx = x16_idx + ((k & 1) << 3);
-    int y8_idx = y16_idx + ((k >> 1) << 3);
+    const int x8_idx = ((k & 1) << 3);
+    const int y8_idx = ((k >> 1) << 3);
     int min = 0;
     int max = 0;
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      const int src_offset = y8_idx * src_stride + x8_idx;
+      const int ref_offset = y8_idx * ref_stride + x8_idx;
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
-                              d + y8_idx * dp + x8_idx, dp,
+      if (highbd) {
+        vpx_highbd_minmax_8x8(src + src_offset, src_stride,
+                              ref + ref_offset, ref_stride,
                               &min, &max);
       } else {
-        vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
-                       d + y8_idx * dp + x8_idx, dp,
+        vpx_minmax_8x8(src + src_offset, src_stride,
+                       ref + ref_offset, ref_stride,
                        &min, &max);
       }
 #else
-      vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
-                     d + y8_idx * dp + x8_idx, dp,
+      vpx_minmax_8x8(src + src_offset, src_stride,
+                     ref + ref_offset, ref_stride,
                      &min, &max);
 #endif
       if ((max - min) > minmax_max)
@@ -740,110 +617,252 @@
   return (minmax_max - minmax_min);
 }
 
-static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
-                                 int dp, int x8_idx, int y8_idx, v8x8 *vst,
 #if CONFIG_VP9_HIGHBITDEPTH
-                                 int highbd_flag,
-#endif
-                                 int pixels_wide,
-                                 int pixels_high,
-                                 int is_key_frame) {
-  int k;
-  for (k = 0; k < 4; k++) {
-    int x4_idx = x8_idx + ((k & 1) << 2);
-    int y4_idx = y8_idx + ((k >> 1) << 2);
-    unsigned int sse = 0;
-    int sum = 0;
-    if (x4_idx < pixels_wide && y4_idx < pixels_high) {
-      int s_avg;
-      int d_avg = 128;
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
-      } else {
-        s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
-      }
+static INLINE int avg_4x4(const uint8_t *const src, const int stride,
+                          const int highbd) {
+  if (highbd) {
+    return vpx_highbd_avg_4x4(src, stride);
+  } else {
+    return vpx_avg_4x4(src, stride);
+  }
+}
 #else
-      s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-      if (!is_key_frame)
-        d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+static INLINE int avg_4x4(const uint8_t *const src, const int stride) {
+  return vpx_avg_4x4(src, stride);
+}
 #endif
-      sum = s_avg - d_avg;
-      sse = sum * sum;
-    }
-    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int avg_8x8(const uint8_t *const src, const int stride,
+                          const int highbd) {
+  if (highbd) {
+    return vpx_highbd_avg_8x8(src, stride);
+  } else {
+    return vpx_avg_8x8(src, stride);
+  }
+}
+#else
+static INLINE int avg_8x8(const uint8_t *const src, const int stride) {
+  return vpx_avg_8x8(src, stride);
+}
+#endif
+
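+// Recursively set up a variance tree: each node stores its block size, the
+// source/reference pointers for the area it covers and that area's clamped
+// width/height, down to leaf_size nodes.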
+static void init_variance_tree(VAR_TREE *const vt,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               const int highbd,
+#endif
+                               BLOCK_SIZE bsize,
+                               BLOCK_SIZE leaf_size,
+                               const int width, const int height,
+                               const uint8_t *const src, const int src_stride,
+                               const uint8_t *const ref, const int ref_stride) {
+  assert(bsize >= leaf_size);
+
+  vt->bsize = bsize;
+
+  vt->force_split = 0;
+
+  vt->src = src;
+  vt->src_stride = src_stride;
+  vt->ref = ref;
+  vt->ref_stride = ref_stride;
+
+  vt->width = width;
+  vt->height = height;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  vt->highbd = highbd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  if (bsize > leaf_size) {
+    const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+    const int px = num_4x4_blocks_wide_lookup[subsize] * 4;
+
+    init_variance_tree(vt->split[0],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       VPXMIN(px, width), VPXMIN(px, height),
+                       src, src_stride,
+                       ref, ref_stride);
+    init_variance_tree(vt->split[1],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       width - px, VPXMIN(px, height),
+                       src + px, src_stride,
+                       ref + px, ref_stride);
+    init_variance_tree(vt->split[2],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       VPXMIN(px, width), height - px,
+                       src + px * src_stride, src_stride,
+                       ref + px * ref_stride, ref_stride);
+    init_variance_tree(vt->split[3],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       width - px, height - px,
+                       src + px * src_stride + px, src_stride,
+                       ref + px * ref_stride + px, ref_stride);
   }
 }
 
-static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
-                                 int dp, int x16_idx, int y16_idx, v16x16 *vst,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                 int highbd_flag,
-#endif
-                                 int pixels_wide,
-                                 int pixels_high,
-                                 int is_key_frame) {
-  int k;
-  for (k = 0; k < 4; k++) {
-    int x8_idx = x16_idx + ((k & 1) << 3);
-    int y8_idx = y16_idx + ((k >> 1) << 3);
+// Fill the variance tree based on averaging pixel values (sub-sampling), at
+// the leaf node size.
+static void fill_variance_tree(VAR_TREE *const vt,
+                               const BLOCK_SIZE leaf_size) {
+  if (vt->bsize > leaf_size) {
+    fill_variance_tree(vt->split[0], leaf_size);
+    fill_variance_tree(vt->split[1], leaf_size);
+    fill_variance_tree(vt->split[2], leaf_size);
+    fill_variance_tree(vt->split[3], leaf_size);
+    fill_variance_node(vt);
+  } else if (vt->width <= 0 || vt->height <= 0) {
+    fill_variance(0, 0, 0, &vt->variances.none);
+  } else {
     unsigned int sse = 0;
     int sum = 0;
-    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
-      int s_avg;
-      int d_avg = 128;
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-      } else {
-        s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-      }
-#else
-      s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-      if (!is_key_frame)
-        d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-#endif
-      sum = s_avg - d_avg;
-      sse = sum * sum;
+    int src_avg;
+    int ref_avg;
+    assert(leaf_size == BLOCK_4X4 || leaf_size == BLOCK_8X8);
+    if (leaf_size == BLOCK_4X4) {
+      src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+      ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+    } else {
+      src_avg = avg_8x8(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+      ref_avg = avg_8x8(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
     }
-    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+    sum = src_avg - ref_avg;
+    sse = sum * sum;
+    fill_variance(sse, sum, 0, &vt->variances.none);
   }
 }
 
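+// Re-estimate the variances of 16x16 nodes whose variance exceeds the given
+// threshold, using 4x4 averages instead of 8x8, and rebuild the tree nodes
+// above the refined leaves.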
+static void refine_variance_tree(VAR_TREE *const vt, const int64_t threshold) {
+  if (vt->bsize >= BLOCK_8X8) {
+    if (vt->bsize == BLOCK_16X16) {
+      if (vt->variances.none.variance <= threshold)
+        return;
+      else
+        vt->force_split = 0;
+    }
+
+    refine_variance_tree(vt->split[0], threshold);
+    refine_variance_tree(vt->split[1], threshold);
+    refine_variance_tree(vt->split[2], threshold);
+    refine_variance_tree(vt->split[3], threshold);
+
+    if (vt->bsize <= BLOCK_16X16)
+      fill_variance_node(vt);
+  } else if (vt->width <= 0 || vt->height <= 0) {
+    fill_variance(0, 0, 0, &vt->variances.none);
+  } else {
+    const int src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+    const int ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+    const int sum = src_avg - ref_avg;
+    const unsigned int sse = sum * sum;
+    assert(vt->bsize == BLOCK_4X4);
+    fill_variance(sse, sum, 0, &vt->variances.none);
+  }
+}
+
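+// On key frames, force a split wherever a 32x32 block's variance exceeds the
+// threshold; the decision propagates up to the enclosing blocks.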
+static int check_split_key_frame(VAR_TREE *const vt,
+                                 const int64_t threshold) {
+  if (vt->bsize == BLOCK_32X32) {
+    vt->force_split = vt->variances.none.variance > threshold;
+  } else {
+    vt->force_split |= check_split_key_frame(vt->split[0], threshold);
+    vt->force_split |= check_split_key_frame(vt->split[1], threshold);
+    vt->force_split |= check_split_key_frame(vt->split[2], threshold);
+    vt->force_split |= check_split_key_frame(vt->split[3], threshold);
+  }
+  return vt->force_split;
+}
+
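+// Computes force_split for every node down to 16x16. thresholds points at
+// the entry for the current level, so thresholds[-1] is the threshold one
+// level up (the walk always starts above the 16x16 level).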
+static int check_split(VP10_COMP *const cpi,
+                       VAR_TREE *const vt,
+                       const int segment_id,
+                       const int64_t *const thresholds) {
+  if (vt->bsize == BLOCK_16X16) {
+    vt->force_split = vt->variances.none.variance > thresholds[0];
+    if (!vt->force_split &&
+        vt->variances.none.variance > thresholds[-1] &&
+        !cyclic_refresh_segment_id_boosted(segment_id)) {
+      // We have some nominal amount of 16x16 variance (based on average),
+      // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+      // force split to 8x8 block for this 16x16 block.
+      int minmax = compute_minmax_8x8(vt->src, vt->src_stride,
+                                      vt->ref, vt->ref_stride,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                      vt->highbd,
+#endif
+                                      vt->width, vt->height);
+      vt->force_split = minmax > cpi->vbp_threshold_minmax;
+    }
+  } else {
+    vt->force_split |= check_split(cpi, vt->split[0],
+                                   segment_id, thresholds + 1);
+    vt->force_split |= check_split(cpi, vt->split[1],
+                                   segment_id, thresholds + 1);
+    vt->force_split |= check_split(cpi, vt->split[2],
+                                   segment_id, thresholds + 1);
+    vt->force_split |= check_split(cpi, vt->split[3],
+                                   segment_id, thresholds + 1);
+
+    if (vt->bsize == BLOCK_32X32 && !vt->force_split) {
+      vt->force_split = vt->variances.none.variance > thresholds[0];
+    }
+  }
+
+  return vt->force_split;
+}
+
 // This function chooses partitioning based on the variance between source and
-// reconstructed last, where variance is computed for down-sampled inputs.
-static int choose_partitioning(VP10_COMP *cpi,
+// reconstructed last (or golden), where variance is computed for down-sampled
+// inputs.
+static void choose_partitioning(VP10_COMP *const cpi,
+                                ThreadData *const td,
                                 const TileInfo *const tile,
-                                MACROBLOCK *x,
-                                int mi_row, int mi_col) {
-  VP10_COMMON * const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int i, j, k, m;
-  v64x64 vt;
-  v16x16 vt2[16];
-  int force_split[21];
-  uint8_t *s;
-  const uint8_t *d;
-  int sp;
-  int dp;
+                                MACROBLOCK *const x,
+                                const int mi_row, const int mi_col) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+  int i;
+  const uint8_t *src;
+  const uint8_t *ref;
+  int src_stride;
+  int ref_stride;
   int pixels_wide = 8 * num_8x8_blocks_wide_lookup[cm->sb_size];
   int pixels_high = 8 * num_8x8_blocks_high_lookup[cm->sb_size];
-  int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
-      cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
+  int64_t thresholds[5] = {
+    cpi->vbp_thresholds[0],
+    cpi->vbp_thresholds[1],
+    cpi->vbp_thresholds[2],
+    cpi->vbp_thresholds[3],
+    cpi->vbp_thresholds[4],
+  };
+  BLOCK_SIZE bsize_min[5] = {
+    BLOCK_16X16,
+    BLOCK_16X16,
+    BLOCK_16X16,
+    cpi->vbp_bsize_min,
+    BLOCK_8X8,
+  };
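+  // thresholds[] / bsize_min[] are ordered from 128x128 down to 8x8; 64x64
+  // superblocks skip the 128x128 entry.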
+  const int start_level = cm->sb_size == BLOCK_64X64 ? 1 : 0;
+  const int64_t *const thre = thresholds + start_level;
+  const BLOCK_SIZE *const bmin = bsize_min + start_level;
 
-  // Always use 4x4 partition for key frame.
   const int is_key_frame = (cm->frame_type == KEY_FRAME);
-  const int use_4x4_partition = is_key_frame;
   const int low_res = (cm->width <= 352 && cm->height <= 288);
-  int variance4x4downsample[16];
 
   int segment_id = CR_SEGMENT_ID_BASE;
 
@@ -858,11 +877,6 @@
     }
   }
 
-#if CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
-  printf("Not yet implemented: choose_partitioning\n");
-  exit(-1);
-#endif  // CONFIG_EXT_PARTITION
-
   set_offsets(cpi, tile, x, mi_row, mi_col, cm->sb_size);
 
   if (xd->mb_to_right_edge < 0)
@@ -870,33 +884,31 @@
   if (xd->mb_to_bottom_edge < 0)
     pixels_high += (xd->mb_to_bottom_edge >> 3);
 
-  s = x->plane[0].src.buf;
-  sp = x->plane[0].src.stride;
+  src = x->plane[0].src.buf;
+  src_stride = x->plane[0].src.stride;
 
   if (!is_key_frame) {
     MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
     unsigned int uv_sad;
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-
-    const YV12_BUFFER_CONFIG *yv12_g = NULL;
+    const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
     unsigned int y_sad, y_sad_g;
 
-    const int max_mi_block_size = cm->mib_size;
-    const int is_right_edge = mi_col + max_mi_block_size / 2 > cm->mi_cols;
-    const int is_left_edge = mi_row + max_mi_block_size / 2 > cm->mi_rows;
+    const int hbs = cm->mib_size / 2;
+    const int split_vert = mi_col + hbs >= cm->mi_cols;
+    const int split_horz = mi_row + hbs >= cm->mi_rows;
     BLOCK_SIZE bsize;
 
-    if (is_right_edge && is_left_edge)
+    if (split_vert && split_horz)
       bsize = get_subsize(cm->sb_size, PARTITION_SPLIT);
-    else if (is_right_edge)
+    else if (split_vert)
       bsize = get_subsize(cm->sb_size, PARTITION_VERT);
-    else if (is_left_edge)
+    else if (split_horz)
       bsize = get_subsize(cm->sb_size, PARTITION_HORZ);
     else
       bsize = cm->sb_size;
 
     assert(yv12 != NULL);
-    yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
 
     if (yv12_g && yv12_g != yv12) {
       vp10_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
@@ -918,6 +930,7 @@
     mbmi->interp_filter = BILINEAR;
 
     y_sad = vp10_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+
     if (y_sad_g < y_sad) {
       vp10_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            &cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -944,196 +957,65 @@
       x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
     }
 
-    d = xd->plane[0].dst.buf;
-    dp = xd->plane[0].dst.stride;
+    ref = xd->plane[0].dst.buf;
+    ref_stride = xd->plane[0].dst.stride;
 
     // If the y_sad is very small, take the largest partition and exit.
     // Don't check on boosted segment for now, as largest is suppressed there.
     if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) {
-      if (!is_right_edge && !is_left_edge) {
+      if (!split_vert && !split_horz) {
         set_block_size(cpi, x, xd, mi_row, mi_col, cm->sb_size);
-        return 0;
+        return;
       }
     }
   } else {
-    d = VP10_VAR_OFFS;
-    dp = 0;
+    ref = VP10_VAR_OFFS;
+    ref_stride = 0;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       switch (xd->bd) {
         case 10:
-          d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_10);
+          ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_10);
           break;
         case 12:
-          d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_12);
+          ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_12);
           break;
         case 8:
         default:
-          d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_8);
+          ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_8);
           break;
       }
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
-  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
-  // 5-20 for the 16x16 blocks.
-  force_split[0] = 0;
-  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
-  // for splits.
-  for (i = 0; i < 4; i++) {
-    const int x32_idx = ((i & 1) << 5);
-    const int y32_idx = ((i >> 1) << 5);
-    const int i2 = i << 2;
-    force_split[i + 1] = 0;
-    for (j = 0; j < 4; j++) {
-      const int x16_idx = x32_idx + ((j & 1) << 4);
-      const int y16_idx = y32_idx + ((j >> 1) << 4);
-      const int split_index = 5 + i2 + j;
-      v16x16 *vst = &vt.split[i].split[j];
-      force_split[split_index] = 0;
-      variance4x4downsample[i2 + j] = 0;
-      if (!is_key_frame) {
-        fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+  init_variance_tree(vt,
 #if CONFIG_VP9_HIGHBITDEPTH
-                            xd->cur_buf->flags,
-#endif
-                            pixels_wide,
-                            pixels_high,
-                            is_key_frame);
-        fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
-        get_variance(&vt.split[i].split[j].part_variances.none);
-        if (vt.split[i].split[j].part_variances.none.variance >
-            thresholds[2]) {
-          // 16X16 variance is above threshold for split, so force split to 8x8
-          // for this 16x16 block (this also forces splits for upper levels).
-          force_split[split_index] = 1;
-          force_split[i + 1] = 1;
-          force_split[0] = 1;
-        } else if (vt.split[i].split[j].part_variances.none.variance >
-                   thresholds[1] &&
-                   !cyclic_refresh_segment_id_boosted(segment_id)) {
-          // We have some nominal amount of 16x16 variance (based on average),
-          // compute the minmax over the 8x8 sub-blocks, and if above threshold,
-          // force split to 8x8 block for this 16x16 block.
-          int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                          xd->cur_buf->flags,
-#endif
-                                          pixels_wide, pixels_high);
-          if (minmax > cpi->vbp_threshold_minmax) {
-            force_split[split_index] = 1;
-            force_split[i + 1] = 1;
-            force_split[0] = 1;
-          }
-        }
-      }
-      if (is_key_frame || (low_res &&
-          vt.split[i].split[j].part_variances.none.variance >
-          (thresholds[1] << 1))) {
-        force_split[split_index] = 0;
-        // Go down to 4x4 down-sampling for variance.
-        variance4x4downsample[i2 + j] = 1;
-        for (k = 0; k < 4; k++) {
-          int x8_idx = x16_idx + ((k & 1) << 3);
-          int y8_idx = y16_idx + ((k >> 1) << 3);
-          v8x8 *vst2 = is_key_frame ? &vst->split[k] :
-              &vt2[i2 + j].split[k];
-          fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
-#if CONFIG_VP9_HIGHBITDEPTH
-                               xd->cur_buf->flags,
-#endif
-                               pixels_wide,
-                               pixels_high,
-                               is_key_frame);
-        }
-      }
+                     xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                     cm->sb_size,
+                     (is_key_frame || low_res) ? BLOCK_4X4 : BLOCK_8X8,
+                     pixels_wide, pixels_high,
+                     src, src_stride, ref, ref_stride);
+
+  // Fill in the entire tree of variances and compute splits.
+  if (is_key_frame)  {
+    fill_variance_tree(vt, BLOCK_4X4);
+    check_split_key_frame(vt, thre[1]);
+  } else {
+    fill_variance_tree(vt, BLOCK_8X8);
+    check_split(cpi, vt, segment_id, thre);
+    if (low_res) {
+      refine_variance_tree(vt, thre[1] << 1);
     }
   }
 
-  // Fill the rest of the variance tree by summing split partition values.
-  for (i = 0; i < 4; i++) {
-    const int i2 = i << 2;
-    for (j = 0; j < 4; j++) {
-      if (variance4x4downsample[i2 + j] == 1) {
-        v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] :
-            &vt.split[i].split[j];
-        for (m = 0; m < 4; m++)
-          fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
-        fill_variance_tree(vtemp, BLOCK_16X16);
-      }
-    }
-    fill_variance_tree(&vt.split[i], BLOCK_32X32);
-    // If variance of this 32x32 block is above the threshold, force the block
-    // to split. This also forces a split on the upper (64x64) level.
-    if (!force_split[i + 1]) {
-      get_variance(&vt.split[i].part_variances.none);
-      if (vt.split[i].part_variances.none.variance > thresholds[1]) {
-        force_split[i + 1] = 1;
-        force_split[0] = 1;
-      }
-    }
-  }
-  if (!force_split[0]) {
-    fill_variance_tree(&vt, BLOCK_64X64);
-    get_variance(&vt.part_variances.none);
-  }
+  vt->force_split |= mi_col + cm->mib_size > cm->mi_cols ||
+                     mi_row + cm->mib_size > cm->mi_rows;
 
   // Now go through the entire structure, splitting every block size until
   // we get to one that's got a variance lower than our threshold.
-  if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
-      !set_vt_partitioning(cpi, x, xd, &vt, BLOCK_64X64, mi_row, mi_col,
-                           thresholds[0], BLOCK_16X16, force_split[0])) {
-    for (i = 0; i < 4; ++i) {
-      const int x32_idx = ((i & 1) << 2);
-      const int y32_idx = ((i >> 1) << 2);
-      const int i2 = i << 2;
-      if (!set_vt_partitioning(cpi, x, xd, &vt.split[i], BLOCK_32X32,
-                               (mi_row + y32_idx), (mi_col + x32_idx),
-                               thresholds[1], BLOCK_16X16,
-                               force_split[i + 1])) {
-        for (j = 0; j < 4; ++j) {
-          const int x16_idx = ((j & 1) << 1);
-          const int y16_idx = ((j >> 1) << 1);
-          // For inter frames: if variance4x4downsample[] == 1 for this 16x16
-          // block, then the variance is based on 4x4 down-sampling, so use vt2
-          // in set_vt_partioning(), otherwise use vt.
-          v16x16 *vtemp = (!is_key_frame &&
-                           variance4x4downsample[i2 + j] == 1) ?
-                           &vt2[i2 + j] : &vt.split[i].split[j];
-          if (!set_vt_partitioning(cpi, x, xd, vtemp, BLOCK_16X16,
-                                   mi_row + y32_idx + y16_idx,
-                                   mi_col + x32_idx + x16_idx,
-                                   thresholds[2],
-                                   cpi->vbp_bsize_min,
-                                   force_split[5 + i2  + j])) {
-            for (k = 0; k < 4; ++k) {
-              const int x8_idx = (k & 1);
-              const int y8_idx = (k >> 1);
-              if (use_4x4_partition) {
-                if (!set_vt_partitioning(cpi, x, xd, &vtemp->split[k],
-                                         BLOCK_8X8,
-                                         mi_row + y32_idx + y16_idx + y8_idx,
-                                         mi_col + x32_idx + x16_idx + x8_idx,
-                                         thresholds[3], BLOCK_8X8, 0)) {
-                  set_block_size(cpi, x, xd,
-                                 (mi_row + y32_idx + y16_idx + y8_idx),
-                                 (mi_col + x32_idx + x16_idx + x8_idx),
-                                 BLOCK_4X4);
-                }
-              } else {
-                set_block_size(cpi, x, xd,
-                               (mi_row + y32_idx + y16_idx + y8_idx),
-                               (mi_col + x32_idx + x16_idx + x8_idx),
-                               BLOCK_8X8);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  return 0;
+  set_vt_partitioning(cpi, x, xd, vt, mi_row, mi_col, thre, bmin);
 }
 
 static void update_state(VP10_COMP *cpi, ThreadData *td,
@@ -2596,10 +2478,6 @@
   int chosen_rate_nocoef = INT_MAX;
 #endif
 
-#if CONFIG_EXT_PARTITION_TYPES
-  assert(0);
-#endif
-
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
@@ -2823,6 +2701,13 @@
 #endif
       }
       break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+      assert(0 && "Cannot handle extended partition types");
+#endif  // CONFIG_EXT_PARTITION_TYPES
     default:
       assert(0);
       break;
@@ -4282,9 +4167,8 @@
                        &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
                        1, pc_root);
-    } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
-               cm->frame_type != KEY_FRAME) {
-      choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+    } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
+      choose_partitioning(cpi, td, tile_info, x, mi_row, mi_col);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                        cm->sb_size, &dummy_rate, &dummy_dist,
 #if CONFIG_SUPERTX
@@ -4318,8 +4202,8 @@
       SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
 
       for (t = TX_4X4; t <= TX_32X32; ++t)
-        full_to_model_counts(cpi->td.counts->coef[t],
-                             cpi->td.rd_counts.coef_counts[t]);
+        vp10_full_to_model_counts(cpi->td.counts->coef[t],
+                                  cpi->td.rd_counts.coef_counts[t]);
       vp10_partial_adapt_probs(cm, mi_row, mi_col);
       ++cm->coef_probs_update_idx;
       vp10_copy(subframe_stats->coef_probs_buf[cm->coef_probs_update_idx],
@@ -4328,7 +4212,7 @@
                 cpi->td.rd_counts.coef_counts);
       vp10_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx],
                 cm->counts.eob_branch);
-      fill_token_costs(x->token_costs, cm->fc->coef_probs);
+      vp10_fill_token_costs(x->token_costs, cm->fc->coef_probs);
     }
   }
 #endif  // CONFIG_ENTROPY
@@ -4553,6 +4437,10 @@
 #endif
 #endif
 
+  if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
+      cpi->td.var_root[0] == NULL)
+    vp10_setup_var_tree(&cpi->common, &cpi->td);
+
   {
     struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index 10e97cb..9acf00c 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -129,15 +129,7 @@
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH && xd->bd == BITDEPTH_10) {
-    mul = 1;
-  } else {
-    mul = 1 + (tx_size == TX_32X32);
-  }
-#else
-  mul = 1 + (tx_size == TX_32X32);
-#endif
+  mul = 1 << get_tx_scale(xd, tx_type, tx_size);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
   if (!ref)
@@ -323,35 +315,29 @@
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef enum QUANT_FUNC {
   QUANT_FUNC_LOWBD = 0,
-  QUANT_FUNC_LOWBD_32 = 1,
-  QUANT_FUNC_HIGHBD = 2,
-  QUANT_FUNC_HIGHBD_32 = 3,
-  QUANT_FUNC_LAST = 4
-} QUANT_FUNC;
-
-static VP10_QUANT_FACADE
-    quant_func_list[VP10_XFORM_QUANT_LAST][QUANT_FUNC_LAST] = {
-        {vp10_quantize_fp_facade, vp10_quantize_fp_32x32_facade,
-         vp10_highbd_quantize_fp_facade, vp10_highbd_quantize_fp_32x32_facade},
-        {vp10_quantize_b_facade, vp10_quantize_b_32x32_facade,
-         vp10_highbd_quantize_b_facade, vp10_highbd_quantize_b_32x32_facade},
-        {vp10_quantize_dc_facade, vp10_quantize_dc_32x32_facade,
-         vp10_highbd_quantize_dc_facade, vp10_highbd_quantize_dc_32x32_facade},
-        {NULL, NULL, NULL, NULL}};
-
-#else
-typedef enum QUANT_FUNC {
-  QUANT_FUNC_LOWBD = 0,
-  QUANT_FUNC_LOWBD_32 = 1,
+  QUANT_FUNC_HIGHBD = 1,
   QUANT_FUNC_LAST = 2
 } QUANT_FUNC;
 
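+// Facades are selected by bit depth only; the former _32x32 entries are gone
+// because the extra scaling now travels in QUANT_PARAM::log_scale.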
 static VP10_QUANT_FACADE
     quant_func_list[VP10_XFORM_QUANT_LAST][QUANT_FUNC_LAST] = {
-        {vp10_quantize_fp_facade, vp10_quantize_fp_32x32_facade},
-        {vp10_quantize_b_facade, vp10_quantize_b_32x32_facade},
-        {vp10_quantize_dc_facade, vp10_quantize_dc_32x32_facade},
+        {vp10_quantize_fp_facade, vp10_highbd_quantize_fp_facade},
+        {vp10_quantize_b_facade, vp10_highbd_quantize_b_facade},
+        {vp10_quantize_dc_facade, vp10_highbd_quantize_dc_facade},
         {NULL, NULL}};
+
+#else
+typedef enum QUANT_FUNC {
+  QUANT_FUNC_LOWBD = 0,
+  QUANT_FUNC_LAST = 1
+} QUANT_FUNC;
+
+static VP10_QUANT_FACADE
+    quant_func_list[VP10_XFORM_QUANT_LAST][QUANT_FUNC_LAST] = {
+        {vp10_quantize_fp_facade},
+        {vp10_quantize_b_facade},
+        {vp10_quantize_dc_facade},
+        {NULL}};
 #endif
 
 static FWD_TXFM_OPT fwd_txfm_opt_list[VP10_XFORM_QUANT_LAST] = {
@@ -378,7 +364,9 @@
   const int tx2d_size = tx1d_size * tx1d_size;
 
   FWD_TXFM_PARAM fwd_txfm_param;
-  fwd_txfm_param.tx_type = get_tx_type(plane_type, xd, block, tx_size);
+  QUANT_PARAM qparam;
+
+  fwd_txfm_param.tx_type = tx_type;
   fwd_txfm_param.tx_size = tx_size;
   fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[xform_quant_idx];
   fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
@@ -386,6 +374,7 @@
 
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
+  qparam.log_scale = get_tx_scale(xd, tx_type, tx_size);
 #if CONFIG_VP9_HIGHBITDEPTH
   fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -394,12 +383,9 @@
       if (x->skip_block) {
         vp10_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
       } else {
-        if (tx_size == TX_32X32 && xd->bd != 10)
-          quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD_32](
-              coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
-        else
-          quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
-              coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
+        quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
+            coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob,
+            scan_order, &qparam);
       }
     }
     return;
@@ -411,12 +397,9 @@
     if (x->skip_block) {
       vp10_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
     } else {
-      if (tx_size == TX_32X32)
-        quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD_32](
-            coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
-      else
-        quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
-            coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
+      quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
+          coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob,
+          scan_order, &qparam);
     }
   }
 }
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index a39575b..f0de8ef 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -463,6 +463,9 @@
 
   vp10_free_pc_tree(&cpi->td);
 
+  if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+    vp10_free_var_tree(&cpi->td);
+
   if (cpi->common.allow_screen_content_tools)
     vpx_free(cpi->td.mb.palette_buffer);
 
@@ -1999,6 +2002,8 @@
       CHECK_MEM_ERROR(cm, x->palette_buffer,
                       vpx_memalign(16, sizeof(*x->palette_buffer)));
     }
+    // Reallocate the pc_tree, as its contents depend on
+    // the state of cm->allow_screen_content_tools.
     vp10_free_pc_tree(&cpi->td);
     vp10_setup_pc_tree(&cpi->common, &cpi->td);
   }
@@ -2586,6 +2591,8 @@
         vpx_free(thread_data->td->mb.palette_buffer);
       vpx_free(thread_data->td->counts);
       vp10_free_pc_tree(thread_data->td);
+      if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+        vp10_free_var_tree(thread_data->td);
       vpx_free(thread_data->td);
     }
   }
@@ -3406,13 +3413,9 @@
   model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
 }
 
-#if CONFIG_ENTROPY
-void full_to_model_counts(vp10_coeff_count_model *model_count,
-                                 vp10_coeff_count *full_count) {
-#else
-static void full_to_model_counts(vp10_coeff_count_model *model_count,
-                                 vp10_coeff_count *full_count) {
-#endif  // CONFIG_ENTROPY
+
+void vp10_full_to_model_counts(vp10_coeff_count_model *model_count,
+                               vp10_coeff_count *full_count) {
   int i, j, k, l;
 
   for (i = 0; i < PLANE_TYPES; ++i)
@@ -4403,8 +4406,8 @@
   vp10_update_reference_frames(cpi);
 
   for (t = TX_4X4; t <= TX_32X32; t++)
-    full_to_model_counts(cpi->td.counts->coef[t],
-                         cpi->td.rd_counts.coef_counts[t]);
+    vp10_full_to_model_counts(cpi->td.counts->coef[t],
+                              cpi->td.rd_counts.coef_counts[t]);
 
   if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
 #if CONFIG_ENTROPY
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index bf7815f..0f0d1f3 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -34,6 +34,7 @@
 #include "vp10/encoder/rd.h"
 #include "vp10/encoder/speed_features.h"
 #include "vp10/encoder/tokenize.h"
+#include "vp10/encoder/variance_tree.h"
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #include "vp10/encoder/denoiser.h"
@@ -267,6 +268,9 @@
   PICK_MODE_CONTEXT *leaf_tree;
   PC_TREE *pc_tree;
   PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+
+  VAR_TREE *var_tree;
+  VAR_TREE *var_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
 } ThreadData;
 
 struct EncWorkerData;
@@ -568,9 +572,12 @@
   int resize_count;
 
   // VAR_BASED_PARTITION thresholds
-  // 0 - threshold_64x64; 1 - threshold_32x32;
-  // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
-  int64_t vbp_thresholds[4];
+  // 0 - threshold_128x128;
+  // 1 - threshold_64x64;
+  // 2 - threshold_32x32;
+  // 3 - threshold_16x16;
+  // 4 - threshold_8x8;
+  int64_t vbp_thresholds[5];
   int64_t vbp_threshold_minmax;
   int64_t vbp_threshold_sad;
   BLOCK_SIZE vbp_bsize_min;
@@ -630,10 +637,8 @@
 
 int vp10_get_quantizer(struct VP10_COMP *cpi);
 
-#if CONFIG_ENTROPY
-void full_to_model_counts(vp10_coeff_count_model *model_count,
-                          vp10_coeff_count *full_count);
-#endif  // CONFIG_ENTROPY
+void vp10_full_to_model_counts(vp10_coeff_count_model *model_count,
+                               vp10_coeff_count *full_count);
 
 static INLINE int frame_is_kf_gf_arf(const VP10_COMP *cpi) {
   return frame_is_intra_only(&cpi->common) ||
diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c
index 2742ed2..e552ec5 100644
--- a/vp10/encoder/ethread.c
+++ b/vp10/encoder/ethread.c
@@ -93,6 +93,10 @@
         thread_data->td->pc_tree = NULL;
         vp10_setup_pc_tree(cm, thread_data->td);
 
+        // Set up variance tree if needed.
+        if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+          vp10_setup_var_tree(cm, thread_data->td);
+
         // Allocate frame counters in thread data.
         CHECK_MEM_ERROR(cm, thread_data->td->counts,
                         vpx_calloc(1, sizeof(*thread_data->td->counts)));
diff --git a/vp10/encoder/quantize.c b/vp10/encoder/quantize.c
index 3f8f0f4..3919fee 100644
--- a/vp10/encoder/quantize.c
+++ b/vp10/encoder/quantize.c
@@ -33,52 +33,72 @@
                              const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
                              const MACROBLOCKD_PLANE *pd,
                              tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                             const scan_order *sc) {
+                             const scan_order *sc, const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
 
-  vp10_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
-                   p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                   pd->dequant, eob_ptr, sc->scan, sc->iscan);
+  if (qparam->log_scale == 0) {
+    vp10_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+                     p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                     pd->dequant, eob_ptr, sc->scan, sc->iscan);
+  } else {
+    vp10_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                           p->round_fp, p->quant_fp, p->quant_shift, qcoeff_ptr,
+                           dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
+                           sc->iscan);
+  }
 }
 
 void vp10_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
                             const MACROBLOCKD_PLANE *pd,
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                            const scan_order *sc) {
+                            const scan_order *sc, const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
 
-  vpx_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, p->quant,
-                 p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                 sc->scan, sc->iscan);
+  if (qparam->log_scale == 0) {
+    vpx_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, p->quant,
+                   p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant,
+                   eob_ptr, sc->scan, sc->iscan);
+  } else {
+    vpx_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                         p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                         pd->dequant, eob_ptr, sc->scan, sc->iscan);
+  }
 }
 
 void vp10_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
                              const MACROBLOCKD_PLANE *pd,
                              tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                             const scan_order *sc) {
+                             const scan_order *sc, const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
   (void)sc;
-  vpx_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
-                  p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
-                  eob_ptr);
+  if (qparam->log_scale == 0) {
+    vpx_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+                    p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
+                    eob_ptr);
+  } else {
+    vpx_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+                          qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr);
+  }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_quantize_fp_facade(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
     tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+    const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
 
-  vp10_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
-                          p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                          pd->dequant, eob_ptr, sc->scan, sc->iscan);
+  vp10_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                          p->round_fp, p->quant_fp, p->quant_shift,
+                          qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                          sc->scan, sc->iscan, qparam->log_scale);
 }
 
 void vp10_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
@@ -86,114 +106,30 @@
                                    tran_low_t *qcoeff_ptr,
                                    const MACROBLOCKD_PLANE *pd,
                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                   const scan_order *sc) {
+                                   const scan_order *sc,
+                                   const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
 
-  vpx_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
-                        p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                        pd->dequant, eob_ptr, sc->scan, sc->iscan);
+  vp10_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                         p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                         pd->dequant, eob_ptr, sc->scan, sc->iscan,
+                         qparam->log_scale);
 }
 
 void vp10_highbd_quantize_dc_facade(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
     tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+    const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
 
   (void)sc;
 
-  vpx_highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+  vp10_highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
                          p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr,
-                         pd->dequant[0], eob_ptr);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-void vp10_quantize_fp_32x32_facade(const tran_low_t *coeff_ptr,
-                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-                                   tran_low_t *qcoeff_ptr,
-                                   const MACROBLOCKD_PLANE *pd,
-                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                   const scan_order *sc) {
-  // obsolete skip_block
-  const int skip_block = 0;
-
-  vp10_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
-                         p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                         pd->dequant, eob_ptr, sc->scan, sc->iscan);
-}
-
-void vp10_quantize_b_32x32_facade(const tran_low_t *coeff_ptr,
-                                  intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-                                  tran_low_t *qcoeff_ptr,
-                                  const MACROBLOCKD_PLANE *pd,
-                                  tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                  const scan_order *sc) {
-  // obsolete skip_block
-  const int skip_block = 0;
-
-  vpx_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
-                       p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                       pd->dequant, eob_ptr, sc->scan, sc->iscan);
-}
-
-void vp10_quantize_dc_32x32_facade(const tran_low_t *coeff_ptr,
-                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-                                   tran_low_t *qcoeff_ptr,
-                                   const MACROBLOCKD_PLANE *pd,
-                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                   const scan_order *sc) {
-  // obsolete skip_block
-  const int skip_block = 0;
-
-  (void)sc;
-  (void)n_coeffs;
-
-  vpx_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
-                        qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_quantize_fp_32x32_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
-  // obsolete skip_block
-  const int skip_block = 0;
-
-  vp10_highbd_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                                p->round_fp, p->quant_fp, p->quant_shift,
-                                qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                                sc->scan, sc->iscan);
-}
-
-void vp10_highbd_quantize_b_32x32_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
-  // obsolete skip_block
-  const int skip_block = 0;
-
-  vpx_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                              p->round, p->quant, p->quant_shift, qcoeff_ptr,
-                              dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
-                              sc->iscan);
-}
-
-void vp10_highbd_quantize_dc_32x32_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
-  // obsolete skip_block
-  const int skip_block = 0;
-
-  (void)sc;
-  (void)n_coeffs;
-
-  vpx_highbd_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
-                               qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
-                               eob_ptr);
+                         pd->dequant[0], eob_ptr, qparam->log_scale);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -250,9 +186,11 @@
                               const int16_t *dequant_ptr,
                               uint16_t *eob_ptr,
                               const int16_t *scan,
-                              const int16_t *iscan) {
+                              const int16_t *iscan, const int log_scale) {
   int i;
   int eob = -1;
+  const int scale = 1 << log_scale;
+  const int shift = 16 - log_scale;
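+  // With log_scale == 1 (the 32x32 range) the quantizer shifts one bit
+  // less and the dequantized value is halved by the scale divisor below.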
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
   (void)zbin_ptr;
@@ -271,9 +209,10 @@
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       const int64_t tmp = abs_coeff + round_ptr[rc != 0];
-      const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp * quant_ptr[rc != 0]) >> shift);
       qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
       if (abs_qcoeff)
         eob = i;
     }
@@ -325,49 +264,101 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
-                                    intptr_t n_coeffs, int skip_block,
-                                    const int16_t *zbin_ptr,
-                                    const int16_t *round_ptr,
-                                    const int16_t *quant_ptr,
-                                    const int16_t *quant_shift_ptr,
-                                    tran_low_t *qcoeff_ptr,
-                                    tran_low_t *dqcoeff_ptr,
-                                    const int16_t *dequant_ptr,
-                                    uint16_t *eob_ptr,
-                                    const int16_t *scan, const int16_t *iscan) {
-  int i, eob = -1;
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
+void vp10_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              int skip_block, const int16_t *zbin_ptr,
+                              const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr,
+                              uint16_t *eob_ptr, const int16_t *scan,
+                              const int16_t *iscan, const int log_scale) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  int round[2] = {round_ptr[0], round_ptr[1]};
+  int nzbins[2];
+  int scale = 1;
+  int shift = 16;
   (void)iscan;
 
+  if (log_scale > 0) {
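+    // Larger transforms use a reduced coefficient range: scale zbin/round
+    // down by 2^log_scale (with rounding) and shorten the final quantizer
+    // shift to match.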
+    zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale);
+    zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale);
+    round[0] = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+    round[1] = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+    scale = 1 << log_scale;
+    shift = 16 - log_scale;
+  }
+
+  nzbins[0] = zbins[0] * -1;
+  nzbins[1] = zbins[1] * -1;
+
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
-    for (i = 0; i < n_coeffs; i++) {
-      uint32_t abs_qcoeff = 0;
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: all coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
-        const int64_t tmp = abs_coeff
-                           + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-        abs_qcoeff = (uint32_t) ((tmp * quant_ptr[rc != 0]) >> 15);
+      if (abs_coeff >= zbins[rc != 0]) {
+        const int64_t tmp1 = abs_coeff + round[rc != 0];
+        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> shift);
         qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
+        if (abs_qcoeff)
+          eob = i;
       }
-
-      if (abs_qcoeff)
-        eob = i;
     }
   }
   *eob_ptr = eob + 1;
 }
 #endif
 
+#if CONFIG_VP9_HIGHBITDEPTH
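+// Quantize only the DC coefficient; all AC coefficients are cleared to zero.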
+void vp10_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr,
+                            const int log_scale) {
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + round_ptr[0];
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> (16 - log_scale));
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / (1 << log_scale);
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
 void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan) {
   MACROBLOCKD *const xd = &x->e_mbd;
diff --git a/vp10/encoder/quantize.h b/vp10/encoder/quantize.h
index 6128460..5e62eb2 100644
--- a/vp10/encoder/quantize.h
+++ b/vp10/encoder/quantize.h
@@ -19,12 +19,17 @@
 extern "C" {
 #endif
 
+typedef struct QUANT_PARAM {
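+  // 0 selects the regular coefficient range; 1 selects the downscaled
+  // 32x32-style range used by the larger transforms.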
+  int log_scale;
+} QUANT_PARAM;
+
 typedef void (*VP10_QUANT_FACADE)(const tran_low_t *coeff_ptr,
                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                   tran_low_t *qcoeff_ptr,
                                   const MACROBLOCKD_PLANE *pd,
                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                  const scan_order *sc);
+                                  const scan_order *sc,
+                                  const QUANT_PARAM *qparam);
 
 typedef struct {
   // 0: dc 1: ac 2-8: ac repeated to SIMD width
@@ -48,7 +53,6 @@
 
 void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                  const int16_t *scan, const int16_t *iscan);
-
 struct VP10_COMP;
 struct VP10Common;
 
@@ -71,74 +75,48 @@
                              const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
                              const MACROBLOCKD_PLANE *pd,
                              tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                             const scan_order *sc);
+                             const scan_order *sc, const QUANT_PARAM *qparam);
 
 void vp10_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
                             const MACROBLOCKD_PLANE *pd,
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                            const scan_order *sc);
+                            const scan_order *sc, const QUANT_PARAM *qparam);
 
 void vp10_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
                              const MACROBLOCKD_PLANE *pd,
                              tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                             const scan_order *sc);
+                             const scan_order *sc, const QUANT_PARAM *qparam);
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_quantize_fp_facade(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
     tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+    const QUANT_PARAM *qparam);
 
 void vp10_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                    tran_low_t *qcoeff_ptr,
                                    const MACROBLOCKD_PLANE *pd,
                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                   const scan_order *sc);
+                                   const scan_order *sc,
+                                   const QUANT_PARAM *qparam);
 
 void vp10_highbd_quantize_dc_facade(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
     tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+    const QUANT_PARAM *qparam);
+
+void vp10_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr,
+                            const int log_scale);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-void vp10_quantize_fp_32x32_facade(const tran_low_t *coeff_ptr,
-                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-                                   tran_low_t *qcoeff_ptr,
-                                   const MACROBLOCKD_PLANE *pd,
-                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                   const scan_order *sc);
-
-void vp10_quantize_b_32x32_facade(const tran_low_t *coeff_ptr,
-                                  intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-                                  tran_low_t *qcoeff_ptr,
-                                  const MACROBLOCKD_PLANE *pd,
-                                  tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                  const scan_order *sc);
-
-void vp10_quantize_dc_32x32_facade(const tran_low_t *coeff_ptr,
-                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-                                   tran_low_t *qcoeff_ptr,
-                                   const MACROBLOCKD_PLANE *pd,
-                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                                   const scan_order *sc);
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_quantize_fp_32x32_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
-
-void vp10_highbd_quantize_b_32x32_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
-
-void vp10_highbd_quantize_dc_32x32_facade(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
-    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
-    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index ce9fad7..dc34f1f 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -152,13 +152,8 @@
 #endif  // CONFIG_EXT_INTRA
 }
 
-#if CONFIG_ENTROPY
-void fill_token_costs(vp10_coeff_cost *c,
-                      vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
-#else
-static void fill_token_costs(vp10_coeff_cost *c,
-                             vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
-#endif  // CONFIG_ENTROPY
+void vp10_fill_token_costs(vp10_coeff_cost *c,
+                           vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
   int i, j, k, l;
   TX_SIZE t;
   for (t = TX_4X4; t <= TX_32X32; ++t)
@@ -397,7 +392,7 @@
 #endif
   }
   if (cpi->oxcf.pass != 1) {
-    fill_token_costs(x->token_costs, cm->fc->coef_probs);
+    vp10_fill_token_costs(x->token_costs, cm->fc->coef_probs);
 
     if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
         cm->frame_type == KEY_FRAME) {
diff --git a/vp10/encoder/rd.h b/vp10/encoder/rd.h
index 80749dc..7aad9eb 100644
--- a/vp10/encoder/rd.h
+++ b/vp10/encoder/rd.h
@@ -341,10 +341,8 @@
                                 int (*fact)[MAX_MODES], int rd_thresh,
                                 int bsize, int best_mode_index);
 
-#if CONFIG_ENTROPY
-void fill_token_costs(vp10_coeff_cost *c,
-                      vp10_coeff_probs_model (*p)[PLANE_TYPES]);
-#endif  // CONFIG_ENTROPY
+void vp10_fill_token_costs(vp10_coeff_cost *c,
+                           vp10_coeff_probs_model (*p)[PLANE_TYPES]);
 
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                       int thresh_fact) {
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index d4538af..05cb75c 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1001,7 +1001,7 @@
     const struct macroblock_plane *const p = &x->plane[plane];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     int64_t this_sse;
-    int shift = tx_size == TX_32X32 ? 0 : 2;
+    int shift = (MAX_TX_SCALE - get_tx_scale(xd, 0, tx_size)) * 2;
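+    // Equivalent to the previous rule: shift is 0 for TX_32X32 and 2 for
+    // the smaller transform sizes.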
     tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
     tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1175,19 +1175,11 @@
         const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
         const int64_t resd_sse = coeff[0] - dqcoeff[0];
         int64_t dc_correct = orig_sse - resd_sse * resd_sse;
+        int shift = (MAX_TX_SCALE - get_tx_scale(xd, 0, tx_size)) * 2;
 #if CONFIG_VP9_HIGHBITDEPTH
         dc_correct >>= ((xd->bd - 8) * 2);
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH &&
-            xd->bd == BITDEPTH_10) {
-          dc_correct >>= 2;
-        } else {
-          if (tx_size != TX_32X32)
-            dc_correct >>= 2;
-        }
-#else
-        if (tx_size != TX_32X32)
-          dc_correct >>= 2;
 #endif
+        dc_correct >>= shift;
 
         dist = VPXMAX(0, sse - dc_correct);
       }
diff --git a/vp10/encoder/variance_tree.c b/vp10/encoder/variance_tree.c
new file mode 100644
index 0000000..d11ef2d
--- /dev/null
+++ b/vp10/encoder/variance_tree.c
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/encoder/variance_tree.h"
+#include "vp10/encoder/encoder.h"
+
+
+void vp10_setup_var_tree(struct VP10Common *cm, ThreadData *td) {
+  int i, j;
+#if CONFIG_EXT_PARTITION
+  const int leaf_nodes = 1024;
+  const int tree_nodes = 1024 + 256 + 64 + 16 + 4 + 1;
+#else
+  const int leaf_nodes = 256;
+  const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#endif  // CONFIG_EXT_PARTITION
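+  // Geometric series: one node per 4x4 grouping at every level plus the
+  // root, i.e. leaf_nodes + leaf_nodes / 4 + ... + 1.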
+  int index = 0;
+  VAR_TREE *this_var;
+  int nodes;
+
+  vpx_free(td->var_tree);
+  CHECK_MEM_ERROR(cm, td->var_tree, vpx_calloc(tree_nodes,
+                                              sizeof(*td->var_tree)));
+
+  this_var = &td->var_tree[0];
+
+  // Sets up all the leaf nodes in the tree.
+  for (index = 0; index < leaf_nodes; ++index) {
+    VAR_TREE *const leaf = &td->var_tree[index];
+    leaf->split[0] = NULL;
+  }
+
+  // Each interior node has four children; fill in the child pointers,
+  // level by level, from the leaves up to the root.
+  for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+    for (i = 0; i < nodes; ++i, ++index) {
+      VAR_TREE *const node = &td->var_tree[index];
+      for (j = 0; j < 4; j++)
+        node->split[j] = this_var++;
+    }
+  }
+
+  // Set up the root node for the largest superblock size
+  i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+  td->var_root[i] = &td->var_tree[tree_nodes - 1];
+  // Set up the root nodes for the rest of the possible superblock sizes
+  while (--i >= 0) {
+    td->var_root[i] = td->var_root[i+1]->split[0];
+  }
+}
+
+void vp10_free_var_tree(ThreadData *td) {
+  vpx_free(td->var_tree);
+  td->var_tree = NULL;
+}
diff --git a/vp10/encoder/variance_tree.h b/vp10/encoder/variance_tree.h
new file mode 100644
index 0000000..a10f7e7
--- /dev/null
+++ b/vp10/encoder/variance_tree.h
@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_VARIANCE_TREE_H_
+#define VP10_ENCODER_VARIANCE_TREE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#include "vp10/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10Common;
+struct ThreadData;
+
+typedef struct {
+  int64_t sum_square_error;
+  int64_t sum_error;
+  int log2_count;
+  int variance;
+} var;
+
+typedef struct {
+  var none;
+  var horz[2];
+  var vert[2];
+} partition_variance;
+
+typedef struct VAR_TREE {
+  int force_split;
+  partition_variance variances;
+  struct VAR_TREE *split[4];
+  BLOCK_SIZE bsize;
+  const uint8_t *src;
+  const uint8_t *ref;
+  int src_stride;
+  int ref_stride;
+  int width;
+  int height;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int highbd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+} VAR_TREE;
+
+void vp10_setup_var_tree(struct VP10Common *cm, struct ThreadData *td);
+void vp10_free_var_tree(struct ThreadData *td);
+
+// Set variance values given sum square error, sum error, count.
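+// The result is 256 * (sum_sq / n - (sum / n)^2) with n = 2^log2_count;
+// the factor of 256 preserves precision in the integer arithmetic.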
+static INLINE void fill_variance(int64_t s2, int64_t s, int c, var *v) {
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->log2_count = c;
+  v->variance = (int)(256 * (v->sum_square_error -
+      ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
+}
+
+static INLINE void sum_2_variances(const var *a, const var *b, var *r) {
+  assert(a->log2_count == b->log2_count);
+  fill_variance(a->sum_square_error + b->sum_square_error,
+                a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
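+// Fill in a node's horz/vert/none variances by summing its four children,
+// which are stored in raster order (TL, TR, BL, BR).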
+static INLINE void fill_variance_node(VAR_TREE *vt) {
+  sum_2_variances(&vt->split[0]->variances.none,
+                  &vt->split[1]->variances.none,
+                  &vt->variances.horz[0]);
+  sum_2_variances(&vt->split[2]->variances.none,
+                  &vt->split[3]->variances.none,
+                  &vt->variances.horz[1]);
+  sum_2_variances(&vt->split[0]->variances.none,
+                  &vt->split[2]->variances.none,
+                  &vt->variances.vert[0]);
+  sum_2_variances(&vt->split[1]->variances.none,
+                  &vt->split[3]->variances.none,
+                  &vt->variances.vert[1]);
+  sum_2_variances(&vt->variances.vert[0],
+                  &vt->variances.vert[1],
+                  &vt->variances.none);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif /* VP10_ENCODER_VARIANCE_TREE_H_ */
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index 34b766f..d174c8b 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -21,6 +21,8 @@
 VP10_CX_SRCS-yes += encoder/bitwriter.h
 VP10_CX_SRCS-yes += encoder/context_tree.c
 VP10_CX_SRCS-yes += encoder/context_tree.h
+VP10_CX_SRCS-yes += encoder/variance_tree.c
+VP10_CX_SRCS-yes += encoder/variance_tree.h
 VP10_CX_SRCS-yes += encoder/cost.h
 VP10_CX_SRCS-yes += encoder/cost.c
 VP10_CX_SRCS-yes += encoder/dct.c
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index 26fe785..d3695a9 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -12,22 +12,22 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_8x8_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
+  for (i = 0; i < 8; ++i, src += stride)
+    for (j = 0; j < 8; sum += src[j], ++j) {}
 
-  return (sum + 32) >> 6;
+  return ROUND_POWER_OF_TWO(sum, 6);
 }
 
-unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_4x4_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
+  for (i = 0; i < 4; ++i, src += stride)
+    for (j = 0; j < 4; sum += src[j], ++j) {}
 
-  return (sum + 8) >> 4;
+  return ROUND_POWER_OF_TWO(sum, 4);
 }
 
 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
@@ -176,14 +176,15 @@
   return var;
 }
 
-void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+void vpx_minmax_8x8_c(const uint8_t *src, int src_stride,
+                      const uint8_t *ref, int ref_stride,
                       int *min, int *max) {
   int i, j;
   *min = 255;
   *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
+  for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
     for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
+      int diff = abs(src[j] - ref[j]);
       *min = diff < *min ? diff : *min;
       *max = diff > *max ? diff : *max;
     }
@@ -191,24 +192,24 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 8; ++i, s+=p)
+  const uint16_t* s = CONVERT_TO_SHORTPTR(src);
+  for (i = 0; i < 8; ++i, s += stride)
     for (j = 0; j < 8; sum += s[j], ++j) {}
 
-  return (sum + 32) >> 6;
+  return ROUND_POWER_OF_TWO(sum, 6);
 }
 
-unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 4; ++i, s+=p)
+  const uint16_t* s = CONVERT_TO_SHORTPTR(src);
+  for (i = 0; i < 4; ++i, s += stride)
     for (j = 0; j < 4; sum += s[j], ++j) {}
 
-  return (sum + 8) >> 4;
+  return ROUND_POWER_OF_TWO(sum, 4);
 }
 
 void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index 1759854..dea2af9 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -23,10 +23,10 @@
 #define FILTER_WEIGHT 128
 
 typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
-                                    const uint8_t *b_ptr, int b_stride);
+                                    const uint8_t *b, int b_stride);
 
-typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
-                                        const uint8_t *b_ptr, int b_stride,
+typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+                                        const uint8_t *b, int b_stride,
                                         const uint8_t *second_pred);
 
 typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
@@ -50,10 +50,10 @@
                                                 const uint8_t *b, int b_stride,
                                                 unsigned int *sse);
 
-typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a,
                                                    int a_stride,
                                                    int xoffset, int yoffset,
-                                                   const uint8_t *b_ptr,
+                                                   const uint8_t *b,
                                                    int b_stride,
                                                    unsigned int *sse,
                                                    const uint8_t *second_pred);
@@ -75,26 +75,25 @@
 #endif  // CONFIG_VP8
 
 #if CONFIG_VP10 && CONFIG_EXT_INTER
-typedef unsigned int(*vpx_masked_sad_fn_t)(const uint8_t *src_ptr,
-                                           int source_stride,
-                                           const uint8_t *ref_ptr,
+typedef unsigned int(*vpx_masked_sad_fn_t)(const uint8_t *src,
+                                           int src_stride,
+                                           const uint8_t *ref,
                                            int ref_stride,
                                            const uint8_t *msk_ptr,
                                            int msk_stride);
-typedef unsigned int (*vpx_masked_variance_fn_t)(const uint8_t *src_ptr,
-                                                 int source_stride,
-                                                 const uint8_t *ref_ptr,
+typedef unsigned int (*vpx_masked_variance_fn_t)(const uint8_t *src,
+                                                 int src_stride,
+                                                 const uint8_t *ref,
                                                  int ref_stride,
-                                                 const uint8_t *msk_ptr,
+                                                 const uint8_t *msk,
                                                  int msk_stride,
                                                  unsigned int *sse);
-typedef unsigned int (*vpx_masked_subpixvariance_fn_t)(const uint8_t *src_ptr,
-                                                       int source_stride,
-                                                       int xoffset,
-                                                       int yoffset,
-                                                       const uint8_t *ref_ptr,
-                                                       int Refstride,
-                                                       const uint8_t *msk_ptr,
+typedef unsigned int (*vpx_masked_subpixvariance_fn_t)(const uint8_t *src,
+                                                       int src_stride,
+                                                       int xoffset, int yoffset,
+                                                       const uint8_t *ref,
+                                                       int ref_stride,
+                                                       const uint8_t *msk,
                                                        int msk_stride,
                                                        unsigned int *sse);
 #endif  // CONFIG_VP10 && CONFIG_EXT_INTER
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index a9805d7..46ef5fc 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -266,6 +266,11 @@
 endif
 endif
 
+# high bit depth subtract
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_subtract_sse2.c
+endif
+
 endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
 ifeq ($(CONFIG_VP10_ENCODER),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 10a5280..a648e45 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -965,10 +965,6 @@
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-  specialize qw/vpx_highbd_subtract_block/;
-}
 
 if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
   #
@@ -991,6 +987,8 @@
     specialize qw/vpx_highbd_avg_8x8/;
     add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
     specialize qw/vpx_highbd_avg_4x4/;
+    add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+    specialize qw/vpx_highbd_subtract_block sse2/;
   }
 
   #
diff --git a/vpx_dsp/x86/highbd_subtract_sse2.c b/vpx_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 0000000..33e464b
--- /dev/null
+++ b/vpx_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,366 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(
+    int16_t *diff, ptrdiff_t diff_stride,
+    const uint16_t *src, ptrdiff_t src_stride,
+    const uint16_t *pred, ptrdiff_t pred_stride);
+
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+  __m128i x0, x1, x2, x3;
+  int64_t *store_diff = (int64_t *) (diff + 0 * diff_stride);
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+
+  _mm_storel_epi64((__m128i *)store_diff, x0);
+  store_diff = (int64_t *) (diff + 1 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x1);
+  store_diff = (int64_t *) (diff + 2 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x2);
+  store_diff = (int64_t *) (diff + 3 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x3);
+}
+
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  int64_t *store_diff = (int64_t *) (diff + 0 * diff_stride);
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+  u4 = _mm_loadu_si128((__m128i const *) (src + 4 * src_stride));
+  u5 = _mm_loadu_si128((__m128i const *) (src + 5 * src_stride));
+  u6 = _mm_loadu_si128((__m128i const *) (src + 6 * src_stride));
+  u7 = _mm_loadu_si128((__m128i const *) (src + 7 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+  v4 = _mm_loadu_si128((__m128i const *) (pred + 4 * pred_stride));
+  v5 = _mm_loadu_si128((__m128i const *) (pred + 5 * pred_stride));
+  v6 = _mm_loadu_si128((__m128i const *) (pred + 6 * pred_stride));
+  v7 = _mm_loadu_si128((__m128i const *) (pred + 7 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+  x4 = _mm_sub_epi16(u4, v4);
+  x5 = _mm_sub_epi16(u5, v5);
+  x6 = _mm_sub_epi16(u6, v6);
+  x7 = _mm_sub_epi16(u7, v7);
+
+  _mm_storel_epi64((__m128i *)store_diff, x0);
+  store_diff = (int64_t *) (diff + 1 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x1);
+  store_diff = (int64_t *) (diff + 2 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x2);
+  store_diff = (int64_t *) (diff + 3 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x3);
+  store_diff = (int64_t *) (diff + 4 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x4);
+  store_diff = (int64_t *) (diff + 5 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x5);
+  store_diff = (int64_t *) (diff + 6 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x6);
+  store_diff = (int64_t *) (diff + 7 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x7);
+}
+
+static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+  __m128i x0, x1, x2, x3;
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+
+  _mm_storeu_si128((__m128i *) (diff + 0 * diff_stride), x0);
+  _mm_storeu_si128((__m128i *) (diff + 1 * diff_stride), x1);
+  _mm_storeu_si128((__m128i *) (diff + 2 * diff_stride), x2);
+  _mm_storeu_si128((__m128i *) (diff + 3 * diff_stride), x3);
+}
+
+static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+  u4 = _mm_loadu_si128((__m128i const *) (src + 4 * src_stride));
+  u5 = _mm_loadu_si128((__m128i const *) (src + 5 * src_stride));
+  u6 = _mm_loadu_si128((__m128i const *) (src + 6 * src_stride));
+  u7 = _mm_loadu_si128((__m128i const *) (src + 7 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+  v4 = _mm_loadu_si128((__m128i const *) (pred + 4 * pred_stride));
+  v5 = _mm_loadu_si128((__m128i const *) (pred + 5 * pred_stride));
+  v6 = _mm_loadu_si128((__m128i const *) (pred + 6 * pred_stride));
+  v7 = _mm_loadu_si128((__m128i const *) (pred + 7 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+  x4 = _mm_sub_epi16(u4, v4);
+  x5 = _mm_sub_epi16(u5, v5);
+  x6 = _mm_sub_epi16(u6, v6);
+  x7 = _mm_sub_epi16(u7, v7);
+
+  _mm_storeu_si128((__m128i *) (diff + 0 * diff_stride), x0);
+  _mm_storeu_si128((__m128i *) (diff + 1 * diff_stride), x1);
+  _mm_storeu_si128((__m128i *) (diff + 2 * diff_stride), x2);
+  _mm_storeu_si128((__m128i *) (diff + 3 * diff_stride), x3);
+  _mm_storeu_si128((__m128i *) (diff + 4 * diff_stride), x4);
+  _mm_storeu_si128((__m128i *) (diff + 5 * diff_stride), x5);
+  _mm_storeu_si128((__m128i *) (diff + 6 * diff_stride), x6);
+  _mm_storeu_si128((__m128i *) (diff + 7 * diff_stride), x7);
+}
+
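+// Blocks larger than 8x8 reuse the kernels above: subtract one half,
+// advance the pointers by half the block in rows or columns, then
+// subtract the other half.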
+static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
+                          const uint16_t *src, ptrdiff_t src_stride,
+                          const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 3;
+  src += src_stride << 3;
+  pred += pred_stride << 3;
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
+                          const uint16_t *src, ptrdiff_t src_stride,
+                          const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 8;
+  src += 8;
+  pred += 8;
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 3;
+  src += src_stride << 3;
+  pred += pred_stride << 3;
+  subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 4;
+  src += src_stride << 4;
+  pred += pred_stride << 4;
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 16;
+  src += 16;
+  pred += 16;
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 4;
+  src += src_stride << 4;
+  pred += pred_stride << 4;
+  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 5;
+  src += src_stride << 5;
+  pred += pred_stride << 5;
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 32;
+  src += 32;
+  pred += 32;
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 5;
+  src += src_stride << 5;
+  pred += pred_stride << 5;
+  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
+                            const uint16_t *src, ptrdiff_t src_stride,
+                            const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 6;
+  src += src_stride << 6;
+  pred += pred_stride << 6;
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
+                            const uint16_t *src, ptrdiff_t src_stride,
+                            const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 64;
+  src += 64;
+  pred += 64;
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
+                             const uint16_t *src, ptrdiff_t src_stride,
+                             const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 6;
+  src += src_stride << 6;
+  pred += pred_stride << 6;
+  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
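+// Kernel names are width x height, so the (rows, cols) dispatch maps
+// e.g. rows == 4, cols == 8 to subtract_8x4.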
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+  SubtractWxHFuncType ret_func_ptr = NULL;
+  if (rows == 4) {
+    if (cols == 4) {
+      ret_func_ptr = subtract_4x4;
+    } else if (cols == 8) {
+      ret_func_ptr = subtract_8x4;
+    }
+  } else if (rows == 8) {
+    if (cols == 4) {
+      ret_func_ptr = subtract_4x8;
+    } else if (cols == 8) {
+      ret_func_ptr = subtract_8x8;
+    } else if (cols == 16) {
+      ret_func_ptr = subtract_16x8;
+    }
+  } else if (rows == 16) {
+    if (cols == 8) {
+      ret_func_ptr = subtract_8x16;
+    } else if (cols == 16) {
+      ret_func_ptr = subtract_16x16;
+    } else if (cols == 32) {
+      ret_func_ptr = subtract_32x16;
+    }
+  } else if (rows == 32) {
+    if (cols == 16) {
+      ret_func_ptr = subtract_16x32;
+    } else if (cols == 32) {
+      ret_func_ptr = subtract_32x32;
+    } else if (cols == 64) {
+      ret_func_ptr = subtract_64x32;
+    }
+  } else if (rows == 64) {
+    if (cols == 32) {
+      ret_func_ptr = subtract_32x64;
+    } else if (cols == 64) {
+      ret_func_ptr = subtract_64x64;
+    } else if (cols == 128) {
+      ret_func_ptr = subtract_128x64;
+    }
+  } else if (rows == 128) {
+    if (cols == 64) {
+      ret_func_ptr = subtract_64x128;
+    } else if (cols == 128) {
+      ret_func_ptr = subtract_128x128;
+    }
+  }
+  if (!ret_func_ptr) {
+    assert(0);
+  }
+  return ret_func_ptr;
+}
+
+void vpx_highbd_subtract_block_sse2(
+    int rows, int cols,
+    int16_t *diff, ptrdiff_t diff_stride,
+    const uint8_t *src8, ptrdiff_t src_stride,
+    const uint8_t *pred8,
+    ptrdiff_t pred_stride,
+    int bd) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  SubtractWxHFuncType func;
+  (void) bd;
+
+  func = getSubtractFunc(rows, cols);
+  func(diff, diff_stride, src, src_stride, pred, pred_stride);
+}