Merge "Use reduced transform set for 16x16" into nextgenv2
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 78aabe6..79f4e10 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -923,6 +923,15 @@
                       make_tuple(2, 3, &vpx_highbd_8_variance4x8_c, 8),
                       make_tuple(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
 
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VpxHBDVarianceTest,
+    ::testing::Values(
+         make_tuple(2, 2, &vpx_highbd_8_variance4x4_sse4_1, 8),
+         make_tuple(2, 2, &vpx_highbd_10_variance4x4_sse4_1, 10),
+         make_tuple(2, 2, &vpx_highbd_12_variance4x4_sse4_1, 12)));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelVarianceTest,
     ::testing::Values(
@@ -1125,6 +1134,22 @@
         make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
 #endif  // CONFIG_USE_X86INC
 
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VpxHBDSubpelVarianceTest,
+    ::testing::Values(
+         make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_sse4_1, 8),
+         make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_sse4_1, 10),
+         make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_sse4_1, 12)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VpxHBDSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1, 8),
+        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1, 10),
+        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1, 12)));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
 #if CONFIG_VP9_HIGHBITDEPTH
 /* TODO(debargha): This test does not support the highbd version
 INSTANTIATE_TEST_CASE_P(
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index e144a45..86a7efc 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -20,7 +20,7 @@
 
 #undef MAX_SB_SIZE
 
-// Pixels per max superblock size
+// Max superblock size
 #if CONFIG_EXT_PARTITION
 # define MAX_SB_SIZE_LOG2 7
 #else
@@ -29,6 +29,9 @@
 #define MAX_SB_SIZE   (1 << MAX_SB_SIZE_LOG2)
 #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
 
+// Min superblock size
+#define MIN_SB_SIZE_LOG2 6
+
 // Pixels per Mode Info (MI) unit
 #define MI_SIZE_LOG2  3
 #define MI_SIZE       (1 << MI_SIZE_LOG2)
@@ -37,6 +40,9 @@
 #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
 #define MAX_MIB_SIZE      (1 << MAX_MIB_SIZE_LOG2)
 
+// MI-units per min superblock
+#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+
 // Mask to extract MI offset within max MIB
 #define MAX_MIB_MASK    (MAX_MIB_SIZE - 1)
 #define MAX_MIB_MASK_2  (MAX_MIB_SIZE * 2 - 1)
diff --git a/vp10/encoder/context_tree.c b/vp10/encoder/context_tree.c
index b7c8260..41155c9 100644
--- a/vp10/encoder/context_tree.c
+++ b/vp10/encoder/context_tree.c
@@ -244,8 +244,16 @@
     }
     ++square_index;
   }
-  td->pc_root = &td->pc_tree[tree_nodes - 1];
-  td->pc_root[0].none.best_mode_index = 2;
+
+  // Set up the root node for the largest superblock size
+  i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+  td->pc_root[i] = &td->pc_tree[tree_nodes - 1];
+  td->pc_root[i]->none.best_mode_index = 2;
+  // Set up the root nodes for the rest of the possible superblock sizes
+  while (--i >= 0) {
+    td->pc_root[i] = td->pc_root[i+1]->split[0];
+    td->pc_root[i]->none.best_mode_index = 2;
+  }
 }
 
 void vp10_free_pc_tree(ThreadData *td) {
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 6aba475..88e9486 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -4235,6 +4235,7 @@
 
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+    PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
 
     if (sf->adaptive_pred_interp_filter) {
       for (i = 0; i < leaf_nodes; ++i)
@@ -4249,7 +4250,7 @@
     }
 
     vp10_zero(x->pred_mv);
-    td->pc_root->index = 0;
+    pc_root->index = 0;
 
     if (seg->enabled) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
@@ -4269,7 +4270,7 @@
 #if CONFIG_SUPERTX
                        &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
-                       1, td->pc_root);
+                       1, pc_root);
     } else if (cpi->partition_search_skippable_frame) {
       BLOCK_SIZE bsize;
       set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
@@ -4280,7 +4281,7 @@
 #if CONFIG_SUPERTX
                        &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
-                       1, td->pc_root);
+                       1, pc_root);
     } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
                cm->frame_type != KEY_FRAME) {
       choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
@@ -4289,7 +4290,7 @@
 #if CONFIG_SUPERTX
                        &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
-                       1, td->pc_root);
+                       1, pc_root);
     } else {
       // If required set upper and lower partition size limits
       if (sf->auto_min_max_partition_size) {
@@ -4303,9 +4304,7 @@
 #if CONFIG_SUPERTX
                         &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
-                        INT64_MAX,
-                        cm->sb_size == BLOCK_LARGEST ? td->pc_root
-                                                     : td->pc_root->split[0]);
+                        INT64_MAX, pc_root);
     }
   }
 #if CONFIG_ENTROPY
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 2098378..bf7815f 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -266,7 +266,7 @@
 
   PICK_MODE_CONTEXT *leaf_tree;
   PC_TREE *pc_tree;
-  PC_TREE *pc_root;
+  PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
 } ThreadData;
 
 struct EncWorkerData;
diff --git a/vp10/encoder/firstpass.c b/vp10/encoder/firstpass.c
index dd3e437..5936a24 100644
--- a/vp10/encoder/firstpass.c
+++ b/vp10/encoder/firstpass.c
@@ -491,7 +491,8 @@
   TileInfo tile;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
+  const PICK_MODE_CONTEXT *ctx =
+      &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
   int i;
 
   int recon_yoffset, recon_uvoffset;
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 0c8ec43..9423ed2 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -24,6 +24,7 @@
 
 #include "vp10/encoder/encoder.h"
 #include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/rdopt.h"
 
 // #define NEW_DIAMOND_SEARCH
 
@@ -367,8 +368,8 @@
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
-      vpx_highbd_comp_avg_pred_c(comp_pred16, second_pred, w, h, y + offset,
-                                 y_stride);
+      vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+                               y_stride);
       besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
                         sse1);
     } else {
@@ -2655,6 +2656,29 @@
     v = INT_MAX;                                                       \
   }
 
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = upsampled_masked_pref_error(xd,                          \
+                                          mask, mask_stride,           \
+                                          vfp, z, src_stride,          \
+                                          upre(y, y_stride, r, c),     \
+                                          y_stride,                    \
+                                          w, h, &sse);    \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+
 int vp10_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
                                          const uint8_t *mask, int mask_stride,
                                          MV *bestmv, const MV *ref_mv,
@@ -2671,8 +2695,8 @@
   const MACROBLOCKD *xd = &x->e_mbd;
   unsigned int besterr = INT_MAX;
   unsigned int sse;
-  unsigned int whichdir;
   int thismse;
+  unsigned int whichdir;
   unsigned int halfiters = iters_per_step;
   unsigned int quarteriters = iters_per_step;
   unsigned int eighthiters = iters_per_step;
@@ -2747,6 +2771,276 @@
   return besterr;
 }
 
+static unsigned int setup_masked_center_error(const uint8_t *mask,
+                                              int mask_stride,
+                                              const MV *bestmv,
+                                              const MV *ref_mv,
+                                              int error_per_bit,
+                                              const vp10_variance_fn_ptr_t *vfp,
+                                              const uint8_t *const src,
+                                              const int src_stride,
+                                              const uint8_t *const y,
+                                              int y_stride,
+                                              int offset,
+                                              int *mvjcost, int *mvcost[2],
+                                              unsigned int *sse1,
+                                              int *distortion) {
+  unsigned int besterr;
+  besterr = vfp->mvf(y + offset, y_stride, src, src_stride,
+                     mask, mask_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
+
+static int upsampled_masked_pref_error(const MACROBLOCKD *xd,
+                                       const uint8_t *mask,
+                                       int mask_stride,
+                                       const vp10_variance_fn_ptr_t *vfp,
+                                       const uint8_t *const src,
+                                       const int src_stride,
+                                       const uint8_t *const y, int y_stride,
+                                       int w, int h, unsigned int *sse) {
+  unsigned int besterr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    vpx_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+    besterr = vfp->mvf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride,
+                       mask, mask_stride, sse);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+#else
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    (void) xd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    vpx_upsampled_pred(pred, w, h, y, y_stride);
+
+    besterr = vfp->mvf(pred, w, src, src_stride,
+                       mask, mask_stride, sse);
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif
+  return besterr;
+}
+
+static unsigned int upsampled_setup_masked_center_error(
+    const MACROBLOCKD *xd,
+    const uint8_t *mask, int mask_stride,
+    const MV *bestmv, const MV *ref_mv,
+    int error_per_bit, const vp10_variance_fn_ptr_t *vfp,
+    const uint8_t *const src, const int src_stride,
+    const uint8_t *const y, int y_stride,
+    int w, int h, int offset, int *mvjcost, int *mvcost[2],
+    unsigned int *sse1, int *distortion) {
+  unsigned int besterr = upsampled_masked_pref_error(
+      xd, mask, mask_stride, vfp, src, src_stride,
+      y + offset, y_stride, w, h, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
+
+int vp10_find_best_masked_sub_pixel_tree_up(VP10_COMP *cpi,
+                                            MACROBLOCK *x,
+                                            const uint8_t *mask,
+                                            int mask_stride,
+                                            int mi_row, int mi_col,
+                                            MV *bestmv, const MV *ref_mv,
+                                            int allow_hp,
+                                            int error_per_bit,
+                                            const vp10_variance_fn_ptr_t *vfp,
+                                            int forced_stop,
+                                            int iters_per_step,
+                                            int *mvjcost, int *mvcost[2],
+                                            int *distortion,
+                                            unsigned int *sse1,
+                                            int is_second,
+                                            int use_upsampled_ref) {
+  const uint8_t *const z = x->plane[0].src.buf;
+  const uint8_t *const src_address = z;
+  const int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  unsigned int thismse;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;
+  int bc = bestmv->col * 8;
+  int hstep = 4;
+  int iter;
+  int round = 3 - forced_stop;
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;
+  int idx, best_idx = -1;
+  unsigned int cost_array[5];
+  int kr, kc;
+  const int w = 4 * num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  const int h = 4 * num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int offset;
+  int y_stride;
+  const uint8_t *y;
+
+  const struct buf_2d backup_pred = pd->pre[is_second];
+  if (use_upsampled_ref) {
+    int ref = xd->mi[0]->mbmi.ref_frame[is_second];
+    const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+    setup_pred_plane(&pd->pre[is_second], upsampled_ref->y_buffer,
+                     upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+  y = pd->pre[is_second].buf;
+  y_stride = pd->pre[is_second].stride;
+  offset = bestmv->row * y_stride + bestmv->col;
+
+  if (!(allow_hp && vp10_use_mv_hp(ref_mv)))
+    if (round == 3)
+      round = 2;
+
+  bestmv->row *= 8;
+  bestmv->col *= 8;
+
+  // use_upsampled_ref can be 0 or 1
+  if (use_upsampled_ref)
+    besterr = upsampled_setup_masked_center_error(
+        xd, mask, mask_stride, bestmv, ref_mv, error_per_bit,
+        vfp, z, src_stride, y, y_stride,
+        w, h, (offset << 3),
+        mvjcost, mvcost, sse1, distortion);
+  else
+    besterr = setup_masked_center_error(
+        mask, mask_stride, bestmv, ref_mv, error_per_bit,
+        vfp, z, src_stride, y, y_stride,
+        offset, mvjcost, mvcost, sse1, distortion);
+
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+        MV this_mv = {tr, tc};
+
+        if (use_upsampled_ref) {
+          const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+          thismse = upsampled_masked_pref_error(xd,
+                                                mask, mask_stride,
+                                                vfp, src_address, src_stride,
+                                                pre_address, y_stride,
+                                                w, h, &sse);
+        } else {
+          const uint8_t *const pre_address = y + (tr >> 3) * y_stride +
+              (tc >> 3);
+          thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride,
+                              mask, mask_stride, &sse);
+        }
+
+        cost_array[idx] = thismse +
+            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
+    }
+
+    // Check diagonal sub-pixel position
+    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+    tc = bc + kc;
+    tr = br + kr;
+    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+      MV this_mv = {tr, tc};
+
+      if (use_upsampled_ref) {
+        const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+        thismse = upsampled_masked_pref_error(xd,
+                                              mask, mask_stride,
+                                              vfp, src_address, src_stride,
+                                              pre_address, y_stride,
+                                              w, h, &sse);
+      } else {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+        thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
+                            src_address, src_stride, mask, mask_stride, &sse);
+      }
+
+      cost_array[4] = thismse +
+          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+      if (cost_array[4] < besterr) {
+        best_idx = 4;
+        besterr = cost_array[4];
+        *distortion = thismse;
+        *sse1 = sse;
+      }
+    } else {
+      cost_array[4] = INT_MAX;
+    }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;
+      bc = tc;
+    }
+
+    if (iters_per_step > 1 && best_idx != -1) {
+      if (use_upsampled_ref) {
+        SECOND_LEVEL_CHECKS_BEST(1);
+      } else {
+        SECOND_LEVEL_CHECKS_BEST(0);
+      }
+    }
+
+    tr = br;
+    tc = bc;
+
+    search_step += 4;
+    hstep >>= 1;
+    best_idx = -1;
+  }
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if (use_upsampled_ref) {
+    pd->pre[is_second] = backup_pred;
+  }
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
 #undef DIST
 #undef MVC
 #undef CHECK_BETTER
diff --git a/vp10/encoder/mcomp.h b/vp10/encoder/mcomp.h
index f99cd8b..c12e7af 100644
--- a/vp10/encoder/mcomp.h
+++ b/vp10/encoder/mcomp.h
@@ -169,7 +169,24 @@
                                          int iters_per_step,
                                          int *mvjcost, int *mvcost[2],
                                          int *distortion,
-                                         unsigned int *sse1, int is_second);
+                                         unsigned int *sse1,
+                                         int is_second);
+int vp10_find_best_masked_sub_pixel_tree_up(struct VP10_COMP *cpi,
+                                            MACROBLOCK *x,
+                                            const uint8_t *mask,
+                                            int mask_stride,
+                                            int mi_row, int mi_col,
+                                            MV *bestmv, const MV *ref_mv,
+                                            int allow_hp,
+                                            int error_per_bit,
+                                            const vp10_variance_fn_ptr_t *vfp,
+                                            int forced_stop,
+                                            int iters_per_step,
+                                            int *mvjcost, int *mvcost[2],
+                                            int *distortion,
+                                            unsigned int *sse1,
+                                            int is_second,
+                                            int use_upsampled_ref);
 int vp10_masked_full_pixel_diamond(const struct VP10_COMP *cpi, MACROBLOCK *x,
                                    const uint8_t *mask, int mask_stride,
                                    MV *mvp_full, int step_param,
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 87836cb..d4538af 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -87,8 +87,8 @@
 const double ext_tx_th = 0.99;
 #endif
 
-const double ADST_FLIP_SVM[8] = {-7.3283, -3.0450, -3.2450, 3.6403,  // vert
-                                 -9.4204, -3.1821, -4.6851, 4.1469};  // horz
+const double ADST_FLIP_SVM[8] = {-6.6623, -2.8062, -3.2531, 3.1671,  // vert
+                                 -7.7051, -3.2234, -3.6193, 3.4533};  // horz
 
 typedef struct {
   PREDICTION_MODE mode;
@@ -355,14 +355,14 @@
 // constants for prune 1 and prune 2 decision boundaries
 #define FAST_EXT_TX_CORR_MID 0.0
 #define FAST_EXT_TX_EDST_MID 0.1
-#define FAST_EXT_TX_CORR_MARGIN 0.5
-#define FAST_EXT_TX_EDST_MARGIN 0.05
+#define FAST_EXT_TX_CORR_MARGIN 0.3
+#define FAST_EXT_TX_EDST_MARGIN 0.5
 
 typedef enum {
   DCT_1D = 0,
   ADST_1D = 1,
   FLIPADST_1D = 2,
-  DST_1D = 3,
+  IDTX_1D = 3,
   TX_TYPES_1D = 4,
 } TX_TYPE_1D;
 
@@ -568,18 +568,18 @@
   }
 }
 
-int dct_vs_dst(int16_t *diff, int stride, int w, int h,
-               double *hcorr, double *vcorr) {
+int dct_vs_idtx(int16_t *diff, int stride, int w, int h,
+                double *hcorr, double *vcorr) {
   int prune_bitmask = 0;
   get_horver_correlation(diff, stride, w, h, hcorr, vcorr);
 
   if (*vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
-    prune_bitmask |= 1 << DST_1D;
+    prune_bitmask |= 1 << IDTX_1D;
   else if (*vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
     prune_bitmask |= 1 << DCT_1D;
 
   if (*hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
-    prune_bitmask |= 1 << (DST_1D + 8);
+    prune_bitmask |= 1 << (IDTX_1D + 8);
   else if (*hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
     prune_bitmask |= 1 << (DCT_1D + 8);
   return prune_bitmask;
@@ -600,7 +600,7 @@
   vp10_subtract_plane(x, bsize, 0);
   return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf,
                           pd->dst.stride, hdist, vdist) |
-         dct_vs_dst(p->src_diff, bw, bw, bh, &hcorr, &vcorr);
+         dct_vs_idtx(p->src_diff, bw, bw, bh, &hcorr, &vcorr);
 }
 
 #endif  // CONFIG_EXT_TX
@@ -653,13 +653,13 @@
     FLIPADST_1D,
     ADST_1D,
     FLIPADST_1D,
-    DST_1D,
+    IDTX_1D,
     DCT_1D,
-    DST_1D,
+    IDTX_1D,
     ADST_1D,
-    DST_1D,
+    IDTX_1D,
     FLIPADST_1D,
-    DST_1D,
+    IDTX_1D,
   };
   static TX_TYPE_1D htx_tab[TX_TYPES] = {
     DCT_1D,
@@ -671,16 +671,14 @@
     FLIPADST_1D,
     FLIPADST_1D,
     ADST_1D,
+    IDTX_1D,
+    IDTX_1D,
     DCT_1D,
-    DST_1D,
+    IDTX_1D,
     ADST_1D,
-    DST_1D,
+    IDTX_1D,
     FLIPADST_1D,
-    DST_1D,
-    DST_1D,
   };
-  if (tx_type >= IDTX)
-    return 1;
   return !(((prune >> vtx_tab[tx_type]) & 1) |
          ((prune >> (htx_tab[tx_type] + 8)) & 1));
 #else
@@ -5978,15 +5976,18 @@
 
   if (bestsme < INT_MAX) {
     int dis;  /* TODO: use dis in distortion calculation later. */
-    vp10_find_best_masked_sub_pixel_tree(x, mask, mask_stride,
-                                         &tmp_mv->as_mv, &ref_mv,
-                                         cm->allow_high_precision_mv,
-                                         x->errorperbit,
-                                         &cpi->fn_ptr[bsize],
-                                         cpi->sf.mv.subpel_force_stop,
-                                         cpi->sf.mv.subpel_iters_per_step,
-                                         x->nmvjointcost, x->mvcost,
-                                         &dis, &x->pred_sse[ref], ref_idx);
+    vp10_find_best_masked_sub_pixel_tree_up(cpi, x, mask, mask_stride,
+                                            mi_row, mi_col,
+                                            &tmp_mv->as_mv, &ref_mv,
+                                            cm->allow_high_precision_mv,
+                                            x->errorperbit,
+                                            &cpi->fn_ptr[bsize],
+                                            cpi->sf.mv.subpel_force_stop,
+                                            cpi->sf.mv.subpel_iters_per_step,
+                                            x->nmvjointcost, x->mvcost,
+                                            &dis, &x->pred_sse[ref],
+                                            ref_idx,
+                                            cpi->sf.use_upsampled_references);
   }
   *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index 90c8bed..e6be1dd 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -433,7 +433,7 @@
   return *sse; \
 }
 
-void highbd_var_filter_block2d_bil_first_pass(
+void vpx_highbd_var_filter_block2d_bil_first_pass(
     const uint8_t *src_ptr8,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -459,7 +459,7 @@
   }
 }
 
-void highbd_var_filter_block2d_bil_second_pass(
+void vpx_highbd_var_filter_block2d_bil_second_pass(
     const uint16_t *src_ptr,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -492,13 +492,14 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
-                                          dst_stride, sse); \
+                                            dst_stride, sse); \
 } \
 \
 uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
@@ -509,10 +510,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
@@ -526,10 +528,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
@@ -546,16 +549,17 @@
   uint16_t temp2[H * W]; \
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
                              CONVERT_TO_BYTEPTR(temp2), W); \
 \
   return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
-                                          dst_stride, sse); \
+                                            dst_stride, sse);           \
 } \
 \
 uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
@@ -568,10 +572,11 @@
   uint16_t temp2[H * W]; \
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
                              CONVERT_TO_BYTEPTR(temp2), W); \
@@ -590,10 +595,11 @@
   uint16_t temp2[H * W]; \
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
                              CONVERT_TO_BYTEPTR(temp2), W); \
@@ -914,11 +920,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
-                                           H + 1, W, \
-                                           bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
+                                               H + 1, W, \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                  W, dst, dst_stride, \
@@ -934,11 +940,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
-                                           H + 1, W, \
-                                           bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
+                                               H + 1, W, \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_10_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                     W, dst, dst_stride, \
@@ -954,11 +960,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
-                                           H + 1, W, \
-                                           bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
+                                               H + 1, W, \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_12_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                     W, dst, dst_stride, \
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index 4ad23f8..1759854 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -130,7 +130,7 @@
 } vp10_variance_fn_ptr_t;
 #endif  // CONFIG_VP10
 
-void highbd_var_filter_block2d_bil_first_pass(
+void vpx_highbd_var_filter_block2d_bil_first_pass(
     const uint8_t *src_ptr8,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -139,7 +139,7 @@
     unsigned int output_width,
     const uint8_t *filter);
 
-void highbd_var_filter_block2d_bil_second_pass(
+void vpx_highbd_var_filter_block2d_bil_second_pass(
     const uint16_t *src_ptr,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
diff --git a/vpx_dsp/x86/highbd_variance_sse4.c b/vpx_dsp/x86/highbd_variance_sse4.c
index 18ecc7e..5c1dfe4 100644
--- a/vpx_dsp/x86/highbd_variance_sse4.c
+++ b/vpx_dsp/x86/highbd_variance_sse4.c
@@ -119,10 +119,12 @@
   uint16_t fdata3[(4 + 1) * 4];
   uint16_t temp2[4 * 4];
 
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
-                                           4, bilinear_filters_2t[xoffset]);
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                            bilinear_filters_2t[yoffset]);
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
 
   return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2),
                                   4, dst, dst_stride, sse);
@@ -137,10 +139,12 @@
   uint16_t fdata3[(4 + 1) * 4];
   uint16_t temp2[4 * 4];
 
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
-                                           4, bilinear_filters_2t[xoffset]);
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                            bilinear_filters_2t[yoffset]);
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
 
   return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2),
                                    4, dst, dst_stride, sse);
@@ -155,10 +159,12 @@
   uint16_t fdata3[(4 + 1) * 4];
   uint16_t temp2[4 * 4];
 
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
-                                           4, bilinear_filters_2t[xoffset]);
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                            bilinear_filters_2t[yoffset]);
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
 
   return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2),
                                    4, dst, dst_stride, sse);
@@ -177,13 +183,15 @@
   uint16_t temp2[4 * 4];
   DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
 
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
-                                           4, bilinear_filters_2t[xoffset]);
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                            bilinear_filters_2t[yoffset]);
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
 
-  vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
-                             CONVERT_TO_BYTEPTR(temp2), 4);
+  vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
 
   return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3),
                                   4, dst, dst_stride, sse);
@@ -200,13 +208,15 @@
   uint16_t temp2[4 * 4];
   DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
 
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
-                                           4, bilinear_filters_2t[xoffset]);
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                            bilinear_filters_2t[yoffset]);
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
 
-  vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
-                             CONVERT_TO_BYTEPTR(temp2), 4);
+  vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
 
   return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3),
                                    4, dst, dst_stride, sse);
@@ -223,13 +233,15 @@
   uint16_t temp2[4 * 4];
   DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
 
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
-                                           4, bilinear_filters_2t[xoffset]);
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                            bilinear_filters_2t[yoffset]);
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
 
-  vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
-                             CONVERT_TO_BYTEPTR(temp2), 4);
+  vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
 
   return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3),
                                    4, dst, dst_stride, sse);