Reorder functions in mcomp.[c|h]

Change-Id: I5a27005d9b96af2f6bb9de874fb8523e93c45b7c
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index a46d228..c447834 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -56,6 +56,7 @@
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/ml.h"
+#include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
 #if !CONFIG_REALTIME_ONLY
 #include "av1/encoder/partition_model_weights.h"
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index b18f7a4..84a21b7 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -19,7 +19,6 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/common/common.h"
 #include "av1/common/mvref_common.h"
@@ -29,7 +28,6 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/mcomp.h"
-#include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/reconinter_enc.h"
 
@@ -124,6 +122,10 @@
   return sr;
 }
 
+// ============================================================================
+//  Cost of motion vectors
+// ============================================================================
+
 // Returns the rate of encoding the current motion vector based on the
 // joint_cost and comp_cost. joint_costs covers the cost of transmitting
 // JOINT_MV, and comp_cost covers the cost of transmitting the actual motion
@@ -209,6 +211,13 @@
   }
 }
 
+// =============================================================================
+//  Fullpixel Motion Search: Translational
+// =============================================================================
+#define MAX_PATTERN_SCALES 11
+#define MAX_PATTERN_CANDIDATES 8  // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF 3  // number of refinement candidates
+
 void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) {
   int ss_count = 0;
   int stage_index = MAX_MVSEARCH_STEPS - 1;
@@ -326,1033 +335,7 @@
   cfg->ss_count = ss_count;
 }
 
-/*
- * To avoid the penalty for crossing cache-line read, preload the reference
- * area in a small buffer, which is aligned to make sure there won't be crossing
- * cache-line read while reading from this buffer. This reduced the cpu
- * cycles spent on reading ref data in sub-pixel filter functions.
- * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
- * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
- * could reduce the area.
- */
-
-// convert motion vector component to offset for sv[a]f calc
-static INLINE int sp(int x) { return x & 7; }
-
-static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
-  const int offset = (r >> 3) * stride + (c >> 3);
-  return buf + offset;
-}
-
-#define UNPACK_VAR_PARAMS(var_params)                     \
-  const aom_variance_fn_ptr_t *vfp = (var_params)->vfp;   \
-  const SUBPEL_SEARCH_TYPE subpel_search_type =           \
-      (var_params)->subpel_search_type;                   \
-  const uint8_t *second_pred = (var_params)->second_pred; \
-  const uint8_t *mask = (var_params)->mask;               \
-  const int mask_stride = (var_params)->mask_stride;      \
-  const int invert_mask = (var_params)->invert_mask;      \
-  const int w = (var_params)->w;                          \
-  const int h = (var_params)->h;
-
-static INLINE int estimated_pref_error(
-    const MV *this_mv, const uint8_t *src, const int src_stride,
-    const uint8_t *ref, int ref_stride,
-    const SUBPEL_SEARCH_VAR_PARAMS *var_params, unsigned int *sse) {
-  UNPACK_VAR_PARAMS(var_params);
-  (void)subpel_search_type;
-  (void)w;
-  (void)h;
-  const int r = this_mv->row;
-  const int c = this_mv->col;
-  if (second_pred == NULL) {
-    return vfp->svf(pre(ref, ref_stride, r, c), ref_stride, sp(c), sp(r), src,
-                    src_stride, sse);
-  } else if (mask) {
-    return vfp->msvf(pre(ref, ref_stride, r, c), ref_stride, sp(c), sp(r), src,
-                     src_stride, second_pred, mask, mask_stride, invert_mask,
-                     sse);
-  } else {
-    return vfp->svaf(pre(ref, ref_stride, r, c), ref_stride, sp(c), sp(r), src,
-                     src_stride, sse, second_pred);
-  }
-}
-
-static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
-                                const MV *this_mv, const uint8_t *src,
-                                int src_stride, const uint8_t *ref,
-                                int ref_stride,
-                                const SUBPEL_SEARCH_VAR_PARAMS *var_params,
-                                unsigned int *sse) {
-  UNPACK_VAR_PARAMS(var_params);
-  const int mi_row = xd->mi_row;
-  const int mi_col = xd->mi_col;
-  ref = pre(ref, ref_stride, this_mv->row, this_mv->col);
-  const int subpel_x_q3 = sp(this_mv->col);
-  const int subpel_y_q3 = sp(this_mv->row);
-  unsigned int besterr;
-#if CONFIG_AV1_HIGHBITDEPTH
-  if (is_cur_buf_hbd(xd)) {
-    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
-    if (second_pred != NULL) {
-      if (mask) {
-        aom_highbd_comp_mask_upsampled_pred(
-            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
-            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
-            invert_mask, xd->bd, subpel_search_type);
-      } else {
-        aom_highbd_comp_avg_upsampled_pred(
-            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
-            subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
-            subpel_search_type);
-      }
-    } else {
-      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
-                                subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                                xd->bd, subpel_search_type);
-    }
-    besterr = vfp->vf(pred8, w, src, src_stride, sse);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-    if (second_pred != NULL) {
-      if (mask) {
-        aom_comp_mask_upsampled_pred(
-            xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h,
-            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
-            invert_mask, subpel_search_type);
-      } else {
-        aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
-                                    second_pred, w, h, subpel_x_q3, subpel_y_q3,
-                                    ref, ref_stride, subpel_search_type);
-      }
-    } else {
-      aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h,
-                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                         subpel_search_type);
-    }
-
-    besterr = vfp->vf(pred, w, src, src_stride, sse);
-  }
-#else
-  DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-  if (second_pred != NULL) {
-    if (mask) {
-      aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
-                                   second_pred, w, h, subpel_x_q3, subpel_y_q3,
-                                   ref, ref_stride, mask, mask_stride,
-                                   invert_mask, subpel_search_type);
-    } else {
-      aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
-                                  second_pred, w, h, subpel_x_q3, subpel_y_q3,
-                                  ref, ref_stride, subpel_search_type);
-    }
-  } else {
-    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
-                       subpel_y_q3, ref, ref_stride, subpel_search_type);
-  }
-
-  besterr = vfp->vf(pred, w, src, src_stride, sse);
-#endif
-  return besterr;
-}
-
-static INLINE unsigned int check_better_fast(
-    const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
-    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
-    int ref_stride, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
-    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion) {
-  unsigned int cost;
-  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
-    unsigned int sse;
-    int thismse = estimated_pref_error(this_mv, src, src_stride, ref,
-                                       ref_stride, var_params, &sse);
-    cost = mv_err_cost_(this_mv, mv_cost_params);
-    cost += thismse;
-
-    if (cost < *besterr) {
-      *besterr = cost;
-      *best_mv = *this_mv;
-      *distortion = thismse;
-      *sse1 = sse;
-    }
-  } else {
-    cost = INT_MAX;
-  }
-  return cost;
-}
-
-static AOM_FORCE_INLINE unsigned int check_better(
-    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
-    const SubpelMvLimits *mv_limits, const uint8_t *const src,
-    const int src_stride, const uint8_t *const ref, int ref_stride,
-    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
-    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion, int *is_better) {
-  unsigned int cost;
-  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
-    unsigned int sse;
-    int thismse;
-    if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
-      thismse = upsampled_pref_error(xd, cm, this_mv, src, src_stride, ref,
-                                     ref_stride, var_params, &sse);
-    } else {
-      thismse = estimated_pref_error(this_mv, src, src_stride, ref, ref_stride,
-                                     var_params, &sse);
-    }
-    cost = mv_err_cost_(this_mv, mv_cost_params);
-    cost += thismse;
-    if (cost < *besterr) {
-      *besterr = cost;
-      *best_mv = *this_mv;
-      *distortion = thismse;
-      *sse1 = sse;
-      *is_better |= 1;
-    }
-  } else {
-    cost = INT_MAX;
-  }
-  return cost;
-}
-
-static AOM_FORCE_INLINE int first_level_check_fast(
-    const MV *this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
-    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
-    int ref_stride, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
-    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion) {
-  // Check the four cardinal directions
-  const MV left_mv = { this_mv->row, this_mv->col - hstep };
-  const unsigned int left = check_better_fast(
-      &left_mv, best_mv, mv_limits, src, src_stride, ref, ref_stride,
-      var_params, mv_cost_params, besterr, sse1, distortion);
-
-  const MV right_mv = { this_mv->row, this_mv->col + hstep };
-  const unsigned int right = check_better_fast(
-      &right_mv, best_mv, mv_limits, src, src_stride, ref, ref_stride,
-      var_params, mv_cost_params, besterr, sse1, distortion);
-
-  const MV top_mv = { this_mv->row - hstep, this_mv->col };
-  const unsigned int up = check_better_fast(
-      &top_mv, best_mv, mv_limits, src, src_stride, ref, ref_stride, var_params,
-      mv_cost_params, besterr, sse1, distortion);
-
-  const MV bottom_mv = { this_mv->row + hstep, this_mv->col };
-  const unsigned int down = check_better_fast(
-      &bottom_mv, best_mv, mv_limits, src, src_stride, ref, ref_stride,
-      var_params, mv_cost_params, besterr, sse1, distortion);
-
-  // Check the diagonal direction with the best mv
-  const int whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-  switch (whichdir) {
-    case 0: {
-      const MV top_left_mv = { this_mv->row - hstep, this_mv->col - hstep };
-      check_better_fast(&top_left_mv, best_mv, mv_limits, src, src_stride, ref,
-                        ref_stride, var_params, mv_cost_params, besterr, sse1,
-                        distortion);
-      break;
-    }
-    case 1: {
-      const MV top_right_mv = { this_mv->row - hstep, this_mv->col + hstep };
-      check_better_fast(&top_right_mv, best_mv, mv_limits, src, src_stride, ref,
-                        ref_stride, var_params, mv_cost_params, besterr, sse1,
-                        distortion);
-      break;
-    }
-    case 2: {
-      const MV bottom_left_mv = { this_mv->row + hstep, this_mv->col - hstep };
-      check_better_fast(&bottom_left_mv, best_mv, mv_limits, src, src_stride,
-                        ref, ref_stride, var_params, mv_cost_params, besterr,
-                        sse1, distortion);
-      break;
-    }
-    case 3: {
-      const MV bottom_right_mv = { this_mv->row + hstep, this_mv->col + hstep };
-      check_better_fast(&bottom_right_mv, best_mv, mv_limits, src, src_stride,
-                        ref, ref_stride, var_params, mv_cost_params, besterr,
-                        sse1, distortion);
-      break;
-    }
-  }
-  return whichdir;
-}
-
-static AOM_FORCE_INLINE void second_level_check_fast(
-    const MV *this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
-    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
-    int ref_stride, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
-    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion, int whichdir) {
-  const int tr = this_mv->row;
-  const int tc = this_mv->col;
-  const int br = best_mv->row;
-  const int bc = best_mv->col;
-  if (tr != br && tc != bc) {
-    const int kr = br - tr;
-    const int kc = bc - tc;
-
-    const MV chess_mv_1 = { tr + kr, tc + 2 * kc };
-    check_better_fast(&chess_mv_1, best_mv, mv_limits, src, src_stride, ref,
-                      ref_stride, var_params, mv_cost_params, besterr, sse1,
-                      distortion);
-
-    const MV chess_mv_2 = { tr + 2 * kr, tc + kc };
-    check_better_fast(&chess_mv_2, best_mv, mv_limits, src, src_stride, ref,
-                      ref_stride, var_params, mv_cost_params, besterr, sse1,
-                      distortion);
-  } else if (tr == br && tc != bc) {
-    const int kc = bc - tc;
-    const MV bottom_long_mv = { tr + hstep, tc + 2 * kc };
-    check_better_fast(&bottom_long_mv, best_mv, mv_limits, src, src_stride, ref,
-                      ref_stride, var_params, mv_cost_params, besterr, sse1,
-                      distortion);
-    const MV top_long_mv = { tr - hstep, tc + 2 * kc };
-    check_better_fast(&top_long_mv, best_mv, mv_limits, src, src_stride, ref,
-                      ref_stride, var_params, mv_cost_params, besterr, sse1,
-                      distortion);
-
-    switch (whichdir) {
-      case 0:
-      case 1: {
-        const MV bottom_mv = { tr + hstep, tc + kc };
-        check_better_fast(&bottom_mv, best_mv, mv_limits, src, src_stride, ref,
-                          ref_stride, var_params, mv_cost_params, besterr, sse1,
-                          distortion);
-        break;
-      }
-      case 2:
-      case 3: {
-        const MV top_mv = { tr - hstep, tc + kc };
-        check_better_fast(&top_mv, best_mv, mv_limits, src, src_stride, ref,
-                          ref_stride, var_params, mv_cost_params, besterr, sse1,
-                          distortion);
-        break;
-      }
-    }
-  } else if (tr != br && tc == bc) {
-    const int kr = br - tr;
-    const MV right_long_mv = { tr + 2 * kr, tc + hstep };
-    check_better_fast(&right_long_mv, best_mv, mv_limits, src, src_stride, ref,
-                      ref_stride, var_params, mv_cost_params, besterr, sse1,
-                      distortion);
-    const MV left_long_mv = { tr + 2 * kr, tc - hstep };
-    check_better_fast(&left_long_mv, best_mv, mv_limits, src, src_stride, ref,
-                      ref_stride, var_params, mv_cost_params, besterr, sse1,
-                      distortion);
-
-    switch (whichdir) {
-      case 0:
-      case 2: {
-        const MV right_mv = { tr + kr, tc + hstep };
-        check_better_fast(&right_mv, best_mv, mv_limits, src, src_stride, ref,
-                          ref_stride, var_params, mv_cost_params, besterr, sse1,
-                          distortion);
-        break;
-      }
-      case 1:
-      case 3: {
-        const MV left_mv = { tr + kr, tc - hstep };
-        check_better_fast(&left_mv, best_mv, mv_limits, src, src_stride, ref,
-                          ref_stride, var_params, mv_cost_params, besterr, sse1,
-                          distortion);
-      }
-    }
-  }
-}
-
-static AOM_FORCE_INLINE void two_level_checks_fast(
-    const MV *this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
-    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
-    int ref_stride, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
-    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion, int iters) {
-  unsigned int whichdir = first_level_check_fast(
-      this_mv, best_mv, hstep, mv_limits, src, src_stride, ref, ref_stride,
-      var_params, mv_cost_params, besterr, sse1, distortion);
-  if (iters > 1) {
-    second_level_check_fast(this_mv, best_mv, hstep, mv_limits, src, src_stride,
-                            ref, ref_stride, var_params, mv_cost_params,
-                            besterr, sse1, distortion, whichdir);
-  }
-}
-
-#define CHECK_BETTER(v, r, c)                                            \
-  {                                                                      \
-    const MV this_mv = { (r), (c) };                                     \
-    (v) = check_better_fast(&this_mv, bestmv, &mv_limits, src_address,   \
-                            src_stride, y, y_stride, var_params,         \
-                            mv_cost_params, &besterr, sse1, distortion); \
-  }
-
-#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
-
-/* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER1(v, r, c)                                            \
-  (v) = check_better(xd, cm, (r), (c), &br, &bc, &mv_limits, src_address, \
-                     src_stride, y, y_stride, var_params, mv_cost_params, \
-                     &besterr, sse1, distortion);
-
-// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of
-// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
-// later in the same way.
-#define SECOND_LEVEL_CHECKS_BEST(k)                \
-  {                                                \
-    unsigned int second;                           \
-    int br0 = br;                                  \
-    int bc0 = bc;                                  \
-    assert(tr == br || tc == bc);                  \
-    if (tr == br && tc != bc) {                    \
-      kc = bc - tc;                                \
-    } else if (tr != br && tc == bc) {             \
-      kr = br - tr;                                \
-    }                                              \
-    CHECK_BETTER##k(second, br0 + kr, bc0);        \
-    CHECK_BETTER##k(second, br0, bc0 + kc);        \
-    if (br0 != br || bc0 != bc) {                  \
-      CHECK_BETTER##k(second, br0 + kr, bc0 + kc); \
-    }                                              \
-    (void)second;                                  \
-  }
-
-static unsigned int setup_center_error(
-    const MACROBLOCKD *xd, const MV *bestmv, const uint8_t *const src,
-    const int src_stride, const uint8_t *y, int y_stride,
-    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
-    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
-  UNPACK_VAR_PARAMS(var_params);
-  (void)subpel_search_type;
-  unsigned int besterr;
-  y = pre(y, y_stride, bestmv->row, bestmv->col);
-
-  if (second_pred != NULL) {
-#if CONFIG_AV1_HIGHBITDEPTH
-    if (is_cur_buf_hbd(xd)) {
-      DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
-      uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
-      if (mask) {
-        aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride,
-                                  mask, mask_stride, invert_mask);
-      } else {
-        aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
-      }
-      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
-    } else {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
-      if (mask) {
-        aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
-                           mask_stride, invert_mask);
-      } else {
-        aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
-      }
-      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
-    }
-#else
-    (void)xd;
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
-    if (mask) {
-      aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
-                         mask_stride, invert_mask);
-    } else {
-      aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
-    }
-    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
-#endif
-  } else {
-    besterr = vfp->vf(y, y_stride, src, src_stride, sse1);
-  }
-  *distortion = besterr;
-  besterr += mv_err_cost_(bestmv, mv_cost_params);
-  return besterr;
-}
-
-static INLINE int divide_and_round(int n, int d) {
-  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
-}
-
-static INLINE int is_cost_list_wellbehaved(const int *cost_list) {
-  return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] &&
-         cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4];
-}
-
-// Returns surface minima estimate at given precision in 1/2^n bits.
-// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
-// For a given set of costs S0, S1, S2, S3, S4 at points
-// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
-// the solution for the location of the minima (x0, y0) is given by:
-// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
-// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
-// The code below is an integerized version of that.
-static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic,
-                                         int bits) {
-  *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
-                         (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
-  *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
-                         (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
-}
-
-int av1_find_best_sub_pixel_tree_pruned_evenmore(
-    MACROBLOCK *x, const AV1_COMMON *const cm,
-    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, int *distortion,
-    unsigned int *sse1) {
-  const int allow_hp = ms_params->allow_hp;
-  const int forced_stop = ms_params->forced_stop;
-  const int iters_per_step = ms_params->iters_per_step;
-  const int do_reset_fractional_mv = ms_params->do_reset_fractional_mv;
-  const int *cost_list = ms_params->cost_list;
-  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
-  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
-  const MV *ref_mv = mv_cost_params->ref_mv;
-  const SUBPEL_SEARCH_TYPE subpel_search_type =
-      ms_params->var_params.subpel_search_type;
-
-  const uint8_t *const src_address = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  const unsigned int halfiters = iters_per_step;
-  const unsigned int quarteriters = iters_per_step;
-  const unsigned int eighthiters = iters_per_step;
-  const uint8_t *const y = xd->plane[0].pre[0].buf;
-  const int y_stride = xd->plane[0].pre[0].stride;
-
-  convert_fullmv_to_mv(&x->best_mv);
-  MV *bestmv = &x->best_mv.as_mv;
-  MV start_mv = *bestmv;
-
-  int hstep = 4;
-
-  SubpelMvLimits mv_limits;
-  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits,
-                                 mv_cost_params->ref_mv);
-
-  besterr = setup_center_error(xd, bestmv, src_address, src_stride, y, y_stride,
-                               var_params, mv_cost_params, sse1, distortion);
-  (void)halfiters;
-  (void)quarteriters;
-  (void)eighthiters;
-  (void)allow_hp;
-  (void)forced_stop;
-  (void)hstep;
-  (void)cm;
-  (void)do_reset_fractional_mv;
-  (void)ref_mv;
-  (void)subpel_search_type;
-
-  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
-      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
-      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
-    int ir, ic;
-    get_cost_surf_min(cost_list, &ir, &ic, 2);
-    if (ir != 0 || ic != 0) {
-      const MV this_mv = { start_mv.row + 2 * ir, start_mv.col + 2 * ic };
-      check_better_fast(&this_mv, bestmv, &mv_limits, src_address, src_stride,
-                        y, y_stride, var_params, mv_cost_params, &besterr, sse1,
-                        distortion);
-    }
-  } else {
-    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion, halfiters);
-
-    // Each subsequent iteration checks at least one point in common with
-    // the last iteration could be 2 ( if diag selected) 1/4 pel
-    if (forced_stop != HALF_PEL) {
-      hstep >>= 1;
-      start_mv = *bestmv;
-      two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
-                            src_stride, y, y_stride, var_params, mv_cost_params,
-                            &besterr, sse1, distortion, quarteriters);
-    }
-  }
-
-  if (allow_hp && forced_stop == EIGHTH_PEL) {
-    hstep >>= 1;
-    start_mv = *bestmv;
-    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion, eighthiters);
-  }
-
-  return besterr;
-}
-
-int av1_find_best_sub_pixel_tree_pruned_more(
-    MACROBLOCK *x, const AV1_COMMON *const cm,
-    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, int *distortion,
-    unsigned int *sse1) {
-  const int allow_hp = ms_params->allow_hp;
-  const int forced_stop = ms_params->forced_stop;
-  const int iters_per_step = ms_params->iters_per_step;
-  const int *cost_list = ms_params->cost_list;
-  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
-  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
-
-  const uint8_t *const src_address = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  const unsigned int halfiters = iters_per_step;
-  const unsigned int quarteriters = iters_per_step;
-  const unsigned int eighthiters = iters_per_step;
-  const uint8_t *const y = xd->plane[0].pre[0].buf;
-  const int y_stride = xd->plane[0].pre[0].stride;
-
-  convert_fullmv_to_mv(&x->best_mv);
-  MV *bestmv = &x->best_mv.as_mv;
-  MV start_mv = *bestmv;
-
-  int hstep = 4;
-
-  SubpelMvLimits mv_limits;
-  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits,
-                                 mv_cost_params->ref_mv);
-
-  (void)cm;
-
-  besterr = setup_center_error(xd, bestmv, src_address, src_stride, y, y_stride,
-                               var_params, mv_cost_params, sse1, distortion);
-  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
-      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
-      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
-    int ir, ic;
-    get_cost_surf_min(cost_list, &ir, &ic, 1);
-    if (ir != 0 || ic != 0) {
-      const MV this_mv = { start_mv.row + ir * hstep,
-                           start_mv.col + ic * hstep };
-      check_better_fast(&this_mv, bestmv, &mv_limits, src_address, src_stride,
-                        y, y_stride, var_params, mv_cost_params, &besterr, sse1,
-                        distortion);
-    }
-  } else {
-    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion, halfiters);
-  }
-
-  // Each subsequent iteration checks at least one point in common with
-  // the last iteration could be 2 ( if diag selected) 1/4 pel
-  if (forced_stop != HALF_PEL) {
-    hstep >>= 1;
-    start_mv = *bestmv;
-    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion, quarteriters);
-  }
-
-  if (allow_hp && forced_stop == EIGHTH_PEL) {
-    hstep >>= 1;
-    start_mv = *bestmv;
-    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion, eighthiters);
-  }
-
-  return besterr;
-}
-
-int av1_find_best_sub_pixel_tree_pruned(
-    MACROBLOCK *x, const AV1_COMMON *const cm,
-    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, int *distortion,
-    unsigned int *sse1) {
-  const int allow_hp = ms_params->allow_hp;
-  const int forced_stop = ms_params->forced_stop;
-  const int iters_per_step = ms_params->iters_per_step;
-  const int *cost_list = ms_params->cost_list;
-  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
-  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
-
-  const uint8_t *const src_address = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  const unsigned int halfiters = iters_per_step;
-  const unsigned int quarteriters = iters_per_step;
-  const unsigned int eighthiters = iters_per_step;
-  const uint8_t *const y = xd->plane[0].pre[0].buf;
-  const int y_stride = xd->plane[0].pre[0].stride;
-
-  convert_fullmv_to_mv(&x->best_mv);
-  MV *bestmv = &x->best_mv.as_mv;
-  MV start_mv = *bestmv;
-
-  int hstep = 4;
-
-  SubpelMvLimits mv_limits;
-  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits,
-                                 mv_cost_params->ref_mv);
-  (void)cm;
-
-  besterr = setup_center_error(xd, bestmv, src_address, src_stride, y, y_stride,
-                               var_params, mv_cost_params, sse1, distortion);
-  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
-      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
-      cost_list[4] != INT_MAX) {
-    const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
-                                  (cost_list[2] < cost_list[4] ? 0 : 2);
-
-    const MV left_mv = { start_mv.row, start_mv.col - hstep };
-    const MV right_mv = { start_mv.row, start_mv.col + hstep };
-    const MV bottom_mv = { start_mv.row + hstep, start_mv.col };
-    const MV top_mv = { start_mv.row - hstep, start_mv.col };
-
-    const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep };
-    const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep };
-    const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep };
-    const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep };
-
-    switch (whichdir) {
-      case 0:  // bottom left quadrant
-        check_better_fast(&left_mv, bestmv, &mv_limits, src_address, src_stride,
-                          y, y_stride, var_params, mv_cost_params, &besterr,
-                          sse1, distortion);
-        check_better_fast(&bottom_mv, bestmv, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion);
-        check_better_fast(&bottom_left_mv, bestmv, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion);
-        break;
-      case 1:  // bottom right quadrant
-        check_better_fast(&right_mv, bestmv, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion);
-        check_better_fast(&bottom_mv, bestmv, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion);
-        check_better_fast(&bottom_right_mv, bestmv, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion);
-        break;
-      case 2:  // top left quadrant
-        check_better_fast(&left_mv, bestmv, &mv_limits, src_address, src_stride,
-                          y, y_stride, var_params, mv_cost_params, &besterr,
-                          sse1, distortion);
-        check_better_fast(&top_mv, bestmv, &mv_limits, src_address, src_stride,
-                          y, y_stride, var_params, mv_cost_params, &besterr,
-                          sse1, distortion);
-        check_better_fast(&top_left_mv, bestmv, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion);
-        break;
-      case 3:  // top right quadrant
-        check_better_fast(&right_mv, bestmv, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion);
-        check_better_fast(&top_mv, bestmv, &mv_limits, src_address, src_stride,
-                          y, y_stride, var_params, mv_cost_params, &besterr,
-                          sse1, distortion);
-        check_better_fast(&top_right_mv, bestmv, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion);
-        break;
-    }
-  } else {
-    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion, halfiters);
-  }
-
-  // Each subsequent iteration checks at least one point in common with
-  // the last iteration could be 2 ( if diag selected) 1/4 pel
-  if (forced_stop != HALF_PEL) {
-    hstep >>= 1;
-    start_mv = *bestmv;
-    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion, quarteriters);
-  }
-
-  if (allow_hp && forced_stop == EIGHTH_PEL) {
-    hstep >>= 1;
-    start_mv = *bestmv;
-    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
-                          src_stride, y, y_stride, var_params, mv_cost_params,
-                          &besterr, sse1, distortion, eighthiters);
-  }
-
-  return besterr;
-}
-
-/* clang-format off */
-static const MV search_step_table[12] = {
-  // left, right, up, down
-  { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 },
-  { 0, -2 }, { 0, 2 }, { -2, 0 }, { 2, 0 },
-  { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 }
-};
-/* clang-format on */
-
-static unsigned int upsampled_setup_center_error(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv,
-    const uint8_t *const src, const int src_stride, const uint8_t *const y,
-    int y_stride, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
-    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
-  unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, src, src_stride,
-                                              y, y_stride, var_params, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost_(bestmv, mv_cost_params);
-  return besterr;
-}
-
-static AOM_FORCE_INLINE void second_level_check_v2(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *diag_mv, MV *best_mv,
-    int kr, int kc, const SubpelMvLimits *mv_limits, const uint8_t *const src,
-    const int src_stride, const uint8_t *const ref, int ref_stride,
-    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
-    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
-    unsigned int *sse1, int *distortion) {
-  const MV center_mv = *best_mv;
-
-  assert(diag_mv->row == best_mv->row || diag_mv->col == best_mv->col);
-  if (best_mv->row == diag_mv->row && best_mv->col != diag_mv->col) {
-    kc = best_mv->col - diag_mv->col;
-  } else if (best_mv->row != diag_mv->row && best_mv->col == diag_mv->col) {
-    kr = best_mv->row - diag_mv->row;
-  }
-
-  const MV row_bias_mv = { center_mv.row + kr, center_mv.col };
-  const MV col_bias_mv = { center_mv.row, center_mv.col + kc };
-  const MV diag_bias_mv = { center_mv.row + kr, center_mv.col + kc };
-  int has_better_mv = 0;
-
-  check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, src, src_stride, ref,
-               ref_stride, var_params, mv_cost_params, besterr, sse1,
-               distortion, &has_better_mv);
-  check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, src, src_stride, ref,
-               ref_stride, var_params, mv_cost_params, besterr, sse1,
-               distortion, &has_better_mv);
-
-  // Do an additional search if the second iteration gives a better mv
-  if (has_better_mv) {
-    int dummy = 0;
-    check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, src, src_stride,
-                 ref, ref_stride, var_params, mv_cost_params, besterr, sse1,
-                 distortion, &dummy);
-  }
-}
-
-int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const AV1_COMMON *const cm,
-                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
-                                 int *distortion, unsigned int *sse1) {
-  const int allow_hp = ms_params->allow_hp;
-  const int forced_stop = ms_params->forced_stop;
-  const int iters_per_step = ms_params->iters_per_step;
-  const int do_reset_fractional_mv = ms_params->do_reset_fractional_mv;
-  const int *cost_list = ms_params->cost_list;
-  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
-  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
-  const MV *ref_mv = mv_cost_params->ref_mv;
-  const SUBPEL_SEARCH_TYPE subpel_search_type =
-      ms_params->var_params.subpel_search_type;
-
-  const uint8_t *const src_address = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  const int y_stride = xd->plane[0].pre[0].stride;
-
-  const uint8_t *const y = xd->plane[0].pre[0].buf;
-  convert_fullmv_to_mv(&x->best_mv);
-  MV *bestmv = &x->best_mv.as_mv;
-
-  int hstep = 4;
-  int iter, round = FULL_PEL - forced_stop;
-  const MV *search_step = search_step_table;
-  int best_idx = -1;
-  unsigned int cost_array[5];
-  int kr, kc;
-  SubpelMvLimits mv_limits;
-
-  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, ref_mv);
-
-  if (!allow_hp)
-    if (round == 3) round = 2;
-
-  if (subpel_search_type != USE_2_TAPS_ORIG)
-    besterr = upsampled_setup_center_error(xd, cm, bestmv, src_address,
-                                           src_stride, y, y_stride, var_params,
-                                           mv_cost_params, sse1, distortion);
-  else
-    besterr =
-        setup_center_error(xd, bestmv, src_address, src_stride, y, y_stride,
-                           var_params, mv_cost_params, sse1, distortion);
-
-  (void)cost_list;  // to silence compiler warning
-
-  if (do_reset_fractional_mv) {
-    av1_set_fractional_mv(x->fractional_best_mv);
-  }
-
-  MV iter_center_mv = *bestmv;
-  for (iter = 0; iter < round; ++iter) {
-    if (x->fractional_best_mv[iter].as_mv.row == iter_center_mv.row &&
-        x->fractional_best_mv[iter].as_mv.col == iter_center_mv.col)
-      return INT_MAX;
-
-    x->fractional_best_mv[iter].as_mv = iter_center_mv;
-
-    MV best_iter_mv = { INT16_MAX, INT16_MAX };
-
-    // Check vertical and horizontal sub-pixel positions.
-    for (int idx = 0; idx < 4; ++idx) {
-      const MV this_mv = { iter_center_mv.row + search_step[idx].row,
-                           iter_center_mv.col + search_step[idx].col };
-
-      int is_better = 0;
-      cost_array[idx] =
-          check_better(xd, cm, &this_mv, &best_iter_mv, &mv_limits, src_address,
-                       src_stride, y, y_stride, var_params, mv_cost_params,
-                       &besterr, sse1, distortion, &is_better);
-      if (is_better) {
-        best_idx = idx;
-      }
-    }
-
-    // Check diagonal sub-pixel position
-    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
-    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
-
-    const MV diag_mv = { iter_center_mv.row + kr, iter_center_mv.col + kc };
-    int is_better = 0;
-
-    cost_array[4] =
-        check_better(xd, cm, &diag_mv, &best_iter_mv, &mv_limits, src_address,
-                     src_stride, y, y_stride, var_params, mv_cost_params,
-                     &besterr, sse1, distortion, &is_better);
-    if (is_better) {
-      best_idx = 4;
-    }
-
-    if (best_idx != -1) {
-      iter_center_mv = best_iter_mv;
-
-      if (iters_per_step > 1) {
-        second_level_check_v2(xd, cm, &diag_mv, &iter_center_mv, kr, kc,
-                              &mv_limits, src_address, src_stride, y, y_stride,
-                              var_params, mv_cost_params, &besterr, sse1,
-                              distortion);
-      }
-    }
-
-    search_step += 4;
-    hstep >>= 1;
-    best_idx = -1;
-  }
-
-  *bestmv = iter_center_mv;
-
-  return besterr;
-}
-
-#undef PRE
-#undef CHECK_BETTER
-
-unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x,
-                                     BLOCK_SIZE bsize, const MV *this_mv) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const uint8_t *const src = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  uint8_t *const dst = xd->plane[0].dst.buf;
-  const int dst_stride = xd->plane[0].dst.stride;
-  const aom_variance_fn_ptr_t *vfp = &cpi->fn_ptr[bsize];
-  const int_mv ref_mv = av1_get_ref_mv(x, 0);
-  unsigned int mse;
-  unsigned int sse;
-  const int mi_row = xd->mi_row;
-  const int mi_col = xd->mi_col;
-  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
-
-  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                AOM_PLANE_Y, AOM_PLANE_Y);
-  mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
-  mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost,
-                     CONVERT_TO_CONST_MVCOST(x->mv_cost_stack), x->errorperbit,
-                     mv_cost_type);
-  return mse;
-}
-
-// Refine MV in a small range
-unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
-                                  BLOCK_SIZE bsize, int *pts0, int *pts_inref0,
-                                  int total_samples) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 },
-                            { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } };
-  const int_mv ref_mv = av1_get_ref_mv(x, 0);
-  int16_t br = mbmi->mv[0].as_mv.row;
-  int16_t bc = mbmi->mv[0].as_mv.col;
-  int16_t *tr = &mbmi->mv[0].as_mv.row;
-  int16_t *tc = &mbmi->mv[0].as_mv.col;
-  WarpedMotionParams best_wm_params = mbmi->wm_params;
-  int best_num_proj_ref = mbmi->num_proj_ref;
-  unsigned int bestmse;
-  SubpelMvLimits mv_limits;
-
-  const int start = cm->allow_high_precision_mv ? 0 : 4;
-  int ite;
-
-  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, &ref_mv.as_mv);
-
-  // Calculate the center position's error
-  assert(av1_is_subpelmv_in_range(&mv_limits, mbmi->mv[0].as_mv));
-  bestmse = av1_compute_motion_cost(cpi, x, bsize, &mbmi->mv[0].as_mv);
-
-  // MV search
-  const int mi_row = xd->mi_row;
-  const int mi_col = xd->mi_col;
-  for (ite = 0; ite < 2; ++ite) {
-    int best_idx = -1;
-    int idx;
-
-    for (idx = start; idx < start + 4; ++idx) {
-      unsigned int thismse;
-
-      *tr = br + neighbors[idx].row;
-      *tc = bc + neighbors[idx].col;
-
-      MV this_mv = { *tr, *tc };
-      if (av1_is_subpelmv_in_range(&mv_limits, this_mv)) {
-        int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
-
-        memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
-        memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
-        if (total_samples > 1)
-          mbmi->num_proj_ref =
-              av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
-
-        if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, *tr,
-                                 *tc, &mbmi->wm_params, mi_row, mi_col)) {
-          thismse = av1_compute_motion_cost(cpi, x, bsize, &this_mv);
-
-          if (thismse < bestmse) {
-            best_idx = idx;
-            best_wm_params = mbmi->wm_params;
-            best_num_proj_ref = mbmi->num_proj_ref;
-            bestmse = thismse;
-          }
-        }
-      }
-    }
-
-    if (best_idx == -1) break;
-
-    if (best_idx >= 0) {
-      br += neighbors[best_idx].row;
-      bc += neighbors[best_idx].col;
-    }
-  }
-
-  *tr = br;
-  *tc = bc;
-  mbmi->wm_params = best_wm_params;
-  mbmi->num_proj_ref = best_num_proj_ref;
-  return bestmse;
-}
-
+// Checks whether the mv is within range of the mv_limits
 static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col,
                                int range) {
   return ((row - range) >= mv_limits->row_min) &
@@ -1361,23 +344,9 @@
          ((col + range) <= mv_limits->col_max);
 }
 
-#define CHECK_BETTER                                                       \
-  {                                                                        \
-    if (thissad < bestsad) {                                               \
-      if (use_mvcost)                                                      \
-        thissad += mvsad_err_cost(x, &this_mv, &full_ref_mv, sad_per_bit); \
-      if (thissad < bestsad) {                                             \
-        bestsad = thissad;                                                 \
-        best_site = i;                                                     \
-      }                                                                    \
-    }                                                                      \
-  }
-
-#define MAX_PATTERN_SCALES 11
-#define MAX_PATTERN_CANDIDATES 8  // max number of candidates per scale
-#define PATTERN_CANDIDATES_REF 3  // number of refinement candidates
-
-// Calculate and return a sad+mvcost list around an integer best pel.
+// Calculates and returns a sad+mvcost list around an integer best pel during
+// fullpixel motion search. The resulting list can be used to speed up subpel
+// motion search later.
 static INLINE void calc_int_cost_list(const MACROBLOCK *x,
                                       const MV *const ref_mv, int sadpb,
                                       const aom_variance_fn_ptr_t *fn_ptr,
@@ -1475,6 +444,16 @@
   }
 }
 
+#define CHECK_BETTER                                                     \
+  if (thissad < bestsad) {                                               \
+    if (use_mvcost)                                                      \
+      thissad += mvsad_err_cost(x, &this_mv, &full_ref_mv, sad_per_bit); \
+    if (thissad < bestsad) {                                             \
+      bestsad = thissad;                                                 \
+      best_site = i;                                                     \
+    }                                                                    \
+  }
+
 // Generic pattern search function that searches over multiple scales.
 // Each scale can have a different number of candidates and shape of
 // candidates as indicated in the num_candidates and candidates arrays
@@ -1732,78 +711,7 @@
   x->best_mv.as_mv.col = bc;
   return bestsad;
 }
-
-int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
-                       const MV *ref_mv, const aom_variance_fn_ptr_t *vfp) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const MV mv = get_mv_from_fullmv(best_mv);
-  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
-  unsigned int sse, var;
-
-  var = vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
-                in_what->stride, &sse);
-  (void)var;
-
-  return sse + mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
-                           CONVERT_TO_CONST_MVCOST(x->mv_cost_stack),
-                           x->errorperbit, mv_cost_type);
-}
-
-int av1_get_mvpred_var(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
-                       const MV *ref_mv, const aom_variance_fn_ptr_t *vfp) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const MV mv = get_mv_from_fullmv(best_mv);
-  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
-  unsigned int sse, var;
-
-  var = vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
-                in_what->stride, &sse);
-
-  return var + mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
-                           CONVERT_TO_CONST_MVCOST(x->mv_cost_stack),
-                           x->errorperbit, mv_cost_type);
-}
-
-int av1_get_mvpred_av_var(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
-                          const MV *ref_mv, const uint8_t *second_pred,
-                          const aom_variance_fn_ptr_t *vfp,
-                          const struct buf_2d *src, const struct buf_2d *pre) {
-  const struct buf_2d *const what = src;
-  const struct buf_2d *const in_what = pre;
-  const MV mv = get_mv_from_fullmv(best_mv);
-  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
-  unsigned int unused;
-
-  return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
-                   what->buf, what->stride, &unused, second_pred) +
-         mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
-                     CONVERT_TO_CONST_MVCOST(x->mv_cost_stack), x->errorperbit,
-                     mv_cost_type);
-}
-
-int av1_get_mvpred_mask_var(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
-                            const MV *ref_mv, const uint8_t *second_pred,
-                            const uint8_t *mask, int mask_stride,
-                            int invert_mask, const aom_variance_fn_ptr_t *vfp,
-                            const struct buf_2d *src,
-                            const struct buf_2d *pre) {
-  const struct buf_2d *const what = src;
-  const struct buf_2d *const in_what = pre;
-  const MV mv = get_mv_from_fullmv(best_mv);
-  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
-  unsigned int unused;
-
-  return vfp->msvf(what->buf, what->stride, 0, 0,
-                   get_buf_from_mv(in_what, best_mv), in_what->stride,
-                   second_pred, mask, mask_stride, invert_mask, &unused) +
-         mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
-                     CONVERT_TO_CONST_MVCOST(x->mv_cost_stack), x->errorperbit,
-                     mv_cost_type);
-}
+#undef CHECK_BETTER
 
 // For the following foo_search, the input arguments are:
 // x: The struct used to hold a bunch of random configs.
@@ -1818,7 +726,7 @@
 //   speed up subpel search later.
 // vfp: a function pointer to the simd function so we can compute the cost
 //   efficiently
-// ref_mf: the reference mv used to compute the mv cost
+// ref_mv: the reference mv used to compute the mv cost
 int av1_hex_search(MACROBLOCK *x, FULLPEL_MV *start_mv, int search_param,
                    int sad_per_bit, int do_init_search, int *cost_list,
                    const aom_variance_fn_ptr_t *vfp, const MV *ref_mv) {
@@ -1953,8 +861,6 @@
                        sad_per_bit, do_init_search, cost_list, vfp, ref_mv);
 }
 
-#undef CHECK_BETTER
-
 // Exhaustive motion search around a given centre position with a given
 // step size.
 static int exhuastive_mesh_search(MACROBLOCK *x, FULLPEL_MV *ref_mv,
@@ -2400,212 +1306,6 @@
   return best_sad;
 }
 
-static int vector_match(int16_t *ref, int16_t *src, int bwl) {
-  int best_sad = INT_MAX;
-  int this_sad;
-  int d;
-  int center, offset = 0;
-  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
-  for (d = 0; d <= bw; d += 16) {
-    this_sad = aom_vector_var(&ref[d], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      offset = d;
-    }
-  }
-  center = offset;
-
-  for (d = -8; d <= 8; d += 16) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw) continue;
-    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-  offset = center;
-
-  for (d = -4; d <= 4; d += 8) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw) continue;
-    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-  offset = center;
-
-  for (d = -2; d <= 2; d += 4) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw) continue;
-    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-  offset = center;
-
-  for (d = -1; d <= 1; d += 2) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw) continue;
-    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-
-  return (center - (bw >> 1));
-}
-
-static const MV search_pos[4] = {
-  { -1, 0 },
-  { 0, -1 },
-  { 0, 1 },
-  { 1, 0 },
-};
-
-unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
-                                           BLOCK_SIZE bsize, int mi_row,
-                                           int mi_col, const MV *ref_mv) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mi = xd->mi[0];
-  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
-  DECLARE_ALIGNED(16, int16_t, hbuf[256]);
-  DECLARE_ALIGNED(16, int16_t, vbuf[256]);
-  DECLARE_ALIGNED(16, int16_t, src_hbuf[128]);
-  DECLARE_ALIGNED(16, int16_t, src_vbuf[128]);
-  int idx;
-  const int bw = 4 << mi_size_wide_log2[bsize];
-  const int bh = 4 << mi_size_high_log2[bsize];
-  const int search_width = bw << 1;
-  const int search_height = bh << 1;
-  const int src_stride = x->plane[0].src.stride;
-  const int ref_stride = xd->plane[0].pre[0].stride;
-  uint8_t const *ref_buf, *src_buf;
-  int_mv *best_int_mv = &xd->mi[0]->mv[0];
-  unsigned int best_sad, tmp_sad, this_sad[4];
-  const int norm_factor = 3 + (bw >> 5);
-  const YV12_BUFFER_CONFIG *scaled_ref_frame =
-      av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
-
-  if (scaled_ref_frame) {
-    int i;
-    // Swap out the reference frame for a version that's been scaled to
-    // match the resolution of the current frame, allowing the existing
-    // motion search code to be used without additional modifications.
-    for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
-    av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
-                         MAX_MB_PLANE);
-  }
-
-  if (xd->bd != 8) {
-    unsigned int sad;
-    best_int_mv->as_fullmv = kZeroFullMv;
-    sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
-                                 xd->plane[0].pre[0].buf, ref_stride);
-
-    if (scaled_ref_frame) {
-      int i;
-      for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
-    }
-    return sad;
-  }
-
-  // Set up prediction 1-D reference set
-  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
-  for (idx = 0; idx < search_width; idx += 16) {
-    aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
-    ref_buf += 16;
-  }
-
-  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
-  for (idx = 0; idx < search_height; ++idx) {
-    vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
-    ref_buf += ref_stride;
-  }
-
-  // Set up src 1-D reference set
-  for (idx = 0; idx < bw; idx += 16) {
-    src_buf = x->plane[0].src.buf + idx;
-    aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
-  }
-
-  src_buf = x->plane[0].src.buf;
-  for (idx = 0; idx < bh; ++idx) {
-    src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
-    src_buf += src_stride;
-  }
-
-  // Find the best match per 1-D search
-  best_int_mv->as_fullmv.col =
-      vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]);
-  best_int_mv->as_fullmv.row =
-      vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]);
-
-  FULLPEL_MV this_mv = best_int_mv->as_fullmv;
-  src_buf = x->plane[0].src.buf;
-  ref_buf = get_buf_from_mv(&xd->plane[0].pre[0], &this_mv);
-  best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
-
-  {
-    const uint8_t *const pos[4] = {
-      ref_buf - ref_stride,
-      ref_buf - 1,
-      ref_buf + 1,
-      ref_buf + ref_stride,
-    };
-
-    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
-  }
-
-  for (idx = 0; idx < 4; ++idx) {
-    if (this_sad[idx] < best_sad) {
-      best_sad = this_sad[idx];
-      best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
-      best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
-    }
-  }
-
-  if (this_sad[0] < this_sad[3])
-    this_mv.row -= 1;
-  else
-    this_mv.row += 1;
-
-  if (this_sad[1] < this_sad[2])
-    this_mv.col -= 1;
-  else
-    this_mv.col += 1;
-
-  ref_buf = get_buf_from_mv(&xd->plane[0].pre[0], &this_mv);
-
-  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
-  if (best_sad > tmp_sad) {
-    best_int_mv->as_fullmv = this_mv;
-    best_sad = tmp_sad;
-  }
-
-  convert_fullmv_to_mv(best_int_mv);
-
-  SubpelMvLimits subpel_mv_limits;
-  av1_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv);
-  clamp_mv(&best_int_mv->as_mv, &subpel_mv_limits);
-
-  if (scaled_ref_frame) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
-  }
-
-  return best_sad;
-}
-
 int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                           FULLPEL_MV *start_mv, int step_param, int method,
                           int run_mesh_search, int error_per_bit,
@@ -2763,291 +1463,9 @@
   return var;
 }
 
-/* returns subpixel variance error function */
-#define DIST(r, c)                                                             \
-  vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), src_address, mask, \
-            &sse)
-
-/* checks if (r, c) has better score than previous best */
-#define MVC(diff_mv)                                                          \
-  (unsigned int)(mvcost                                                       \
-                     ? (mv_cost((diff_mv), mvjcost, mvcost) * error_per_bit + \
-                        4096) >>                                              \
-                           13                                                 \
-                     : 0)
-
-#define CHECK_BETTER(v, r, c)                                  \
-  {                                                            \
-    const MV this_mv = { r, c };                               \
-    if (av1_is_subpelmv_in_range(&mv_limits, this_mv)) {       \
-      const MV diff_mv = { r - ref_mv->row, c - ref_mv->col }; \
-      thismse = (DIST(r, c));                                  \
-      if ((v = MVC(&diff_mv) + thismse) < besterr) {           \
-        besterr = v;                                           \
-        br = r;                                                \
-        bc = c;                                                \
-        *distortion = thismse;                                 \
-        *sse1 = sse;                                           \
-      }                                                        \
-    } else {                                                   \
-      v = INT_MAX;                                             \
-    }                                                          \
-  }
-
-#undef CHECK_BETTER0
-#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
-
-#undef CHECK_BETTER1
-#define CHECK_BETTER1(v, r, c)                                              \
-  {                                                                         \
-    const MV this_mv = { r, c };                                            \
-    if (av1_is_subpelmv_in_range(&mv_limits, this_mv)) {                    \
-      thismse = upsampled_obmc_pref_error(                                  \
-          xd, cm, &this_mv, mask, vfp, src_address, pre(y, y_stride, r, c), \
-          y_stride, sp(c), sp(r), w, h, &sse, subpel_search_type);          \
-      v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,     \
-                      mv_cost_type);                                        \
-      if ((v + thismse) < besterr) {                                        \
-        besterr = v + thismse;                                              \
-        br = r;                                                             \
-        bc = c;                                                             \
-        *distortion = thismse;                                              \
-        *sse1 = sse;                                                        \
-      }                                                                     \
-    } else {                                                                \
-      v = INT_MAX;                                                          \
-    }                                                                       \
-  }
-
-static unsigned int setup_obmc_center_error(
-    const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
-    const uint8_t *const y, int y_stride, const int *mvjcost,
-    const int *const mvcost[2], unsigned int *sse1, int *distortion,
-    MV_COST_TYPE mv_cost_type) {
-  unsigned int besterr;
-  besterr = vfp->ovf(y, y_stride, wsrc, mask, sse1);
-  *distortion = besterr;
-  besterr +=
-      mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit, mv_cost_type);
-  return besterr;
-}
-
-static int upsampled_obmc_pref_error(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *const mv,
-    const int32_t *mask, const aom_variance_fn_ptr_t *vfp,
-    const int32_t *const wsrc, const uint8_t *const y, int y_stride,
-    int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse,
-    int subpel_search) {
-  unsigned int besterr;
-
-  const int mi_row = xd->mi_row;
-  const int mi_col = xd->mi_col;
-  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
-#if CONFIG_AV1_HIGHBITDEPTH
-  if (is_cur_buf_hbd(xd)) {
-    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
-    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
-                              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
-                              subpel_search);
-    besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
-  } else {
-    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
-                       subpel_y_q3, y, y_stride, subpel_search);
-
-    besterr = vfp->ovf(pred, w, wsrc, mask, sse);
-  }
-#else
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
-                     subpel_y_q3, y, y_stride, subpel_search);
-
-  besterr = vfp->ovf(pred, w, wsrc, mask, sse);
-#endif
-  return besterr;
-}
-
-static unsigned int upsampled_setup_obmc_center_error(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, const int32_t *mask,
-    const MV *bestmv, const MV *ref_mv, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
-    const uint8_t *const y, int y_stride, int w, int h, const int *mvjcost,
-    const int *const mvcost[2], unsigned int *sse1, int *distortion,
-    int subpel_search, MV_COST_TYPE mv_cost_type) {
-  unsigned int besterr =
-      upsampled_obmc_pref_error(xd, cm, bestmv, mask, vfp, wsrc, y, y_stride, 0,
-                                0, w, h, sse1, subpel_search);
-  *distortion = besterr;
-  besterr +=
-      mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit, mv_cost_type);
-  return besterr;
-}
-
-#define UNPACK_OBMC_MS_PARAMS                                               \
-  const int allow_hp = ms_params->allow_hp;                                 \
-  const int forced_stop = ms_params->forced_stop;                           \
-  const int iters_per_step = ms_params->iters_per_step;                     \
-  const MV *ref_mv = ms_params->mv_cost_params.ref_mv;                      \
-  const int *mvjcost = ms_params->mv_cost_params.mvjcost;                   \
-  const int *const *mvcost = ms_params->mv_cost_params.mvcost;              \
-  const int error_per_bit = ms_params->mv_cost_params.error_per_bit;        \
-  const MV_COST_TYPE mv_cost_type = ms_params->mv_cost_params.mv_cost_type; \
-  const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp;             \
-  const SUBPEL_SEARCH_TYPE subpel_search_type =                             \
-      ms_params->var_params.subpel_search_type;                             \
-  const int w = ms_params->var_params.w;                                    \
-  const int h = ms_params->var_params.h;
-
-int av1_find_best_obmc_sub_pixel_tree_up(
-    MACROBLOCK *x, const AV1_COMMON *const cm,
-    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, int *distortion,
-    unsigned int *sse1) {
-  UNPACK_OBMC_MS_PARAMS;
-  const int32_t *wsrc = x->wsrc_buf;
-  const int32_t *mask = x->mask_buf;
-
-  const int32_t *const src_address = wsrc;
-  MACROBLOCKD *xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[0];
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int thismse;
-  const int y_stride = pd->pre[0].stride;
-  const int offset = get_offset_from_mv(&x->best_mv.as_fullmv, y_stride);
-  const uint8_t *y = pd->pre[0].buf;
-  convert_fullmv_to_mv(&x->best_mv);
-  MV *bestmv = &x->best_mv.as_mv;
-
-  int br = bestmv->row;
-  int bc = bestmv->col;
-  int hstep = 4;
-  int iter, round = FULL_PEL - forced_stop;
-  int tr = br;
-  int tc = bc;
-  const MV *search_step = search_step_table;
-  int idx, best_idx = -1;
-  unsigned int cost_array[5];
-  int kr, kc;
-
-  SubpelMvLimits mv_limits;
-
-  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, ref_mv);
-
-  if (!allow_hp)
-    if (round == 3) round = 2;
-
-  if (subpel_search_type != USE_2_TAPS_ORIG)
-    besterr = upsampled_setup_obmc_center_error(
-        xd, cm, mask, bestmv, ref_mv, error_per_bit, vfp, src_address,
-        y + offset, y_stride, w, h, mvjcost, mvcost, sse1, distortion,
-        subpel_search_type, mv_cost_type);
-  else
-    besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
-                                      src_address, y, y_stride, mvjcost, mvcost,
-                                      sse1, distortion, mv_cost_type);
-
-  for (iter = 0; iter < round; ++iter) {
-    // Check vertical and horizontal sub-pixel positions.
-    for (idx = 0; idx < 4; ++idx) {
-      tr = br + search_step[idx].row;
-      tc = bc + search_step[idx].col;
-      MV this_mv = { tr, tc };
-      if (av1_is_subpelmv_in_range(&mv_limits, this_mv)) {
-        if (subpel_search_type != USE_2_TAPS_ORIG) {
-          thismse = upsampled_obmc_pref_error(
-              xd, cm, &this_mv, mask, vfp, src_address,
-              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
-              subpel_search_type);
-        } else {
-          thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
-                              sp(tr), src_address, mask, &sse);
-        }
-
-        cost_array[idx] =
-            thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
-                                  error_per_bit, mv_cost_type);
-        if (cost_array[idx] < besterr) {
-          best_idx = idx;
-          besterr = cost_array[idx];
-          *distortion = thismse;
-          *sse1 = sse;
-        }
-      } else {
-        cost_array[idx] = INT_MAX;
-      }
-    }
-
-    // Check diagonal sub-pixel position
-    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
-    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
-
-    tc = bc + kc;
-    tr = br + kr;
-    {
-      MV this_mv = { tr, tc };
-      if (av1_is_subpelmv_in_range(&mv_limits, this_mv)) {
-        if (subpel_search_type != USE_2_TAPS_ORIG) {
-          thismse = upsampled_obmc_pref_error(
-              xd, cm, &this_mv, mask, vfp, src_address,
-              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
-              subpel_search_type);
-        } else {
-          thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
-                              sp(tr), src_address, mask, &sse);
-        }
-
-        cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
-                                              error_per_bit, mv_cost_type);
-
-        if (cost_array[4] < besterr) {
-          best_idx = 4;
-          besterr = cost_array[4];
-          *distortion = thismse;
-          *sse1 = sse;
-        }
-      } else {
-        cost_array[idx] = INT_MAX;
-      }
-    }
-
-    if (best_idx < 4 && best_idx >= 0) {
-      br += search_step[best_idx].row;
-      bc += search_step[best_idx].col;
-    } else if (best_idx == 4) {
-      br = tr;
-      bc = tc;
-    }
-
-    if (iters_per_step > 1 && best_idx != -1) {
-      if (subpel_search_type != USE_2_TAPS_ORIG) {
-        SECOND_LEVEL_CHECKS_BEST(1);
-      } else {
-        SECOND_LEVEL_CHECKS_BEST(0);
-      }
-    }
-
-    tr = br;
-    tc = bc;
-
-    search_step += 4;
-    hstep >>= 1;
-    best_idx = -1;
-  }
-
-  // These lines insure static analysis doesn't warn that
-  // tr and tc aren't used after the above point.
-  (void)tr;
-  (void)tc;
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  return besterr;
-}
-
-#undef DIST
-#undef MVC
-#undef CHECK_BETTER
-
+// =============================================================================
+//  Fullpixel Motion Search: OBMC
+// =============================================================================
 static int get_obmc_mvpred_var(const MACROBLOCK *x, const int32_t *wsrc,
                                const int32_t *mask, const FULLPEL_MV *best_mv,
                                const MV *ref_mv,
@@ -3252,10 +1670,1152 @@
   }
 }
 
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+  int best_sad = INT_MAX;
+  int this_sad;
+  int d;
+  int center, offset = 0;
+  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
+  for (d = 0; d <= bw; d += 16) {
+    this_sad = aom_vector_var(&ref[d], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      offset = d;
+    }
+  }
+  center = offset;
+
+  for (d = -8; d <= 8; d += 16) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -4; d <= 4; d += 8) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -2; d <= 2; d += 4) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -1; d <= 1; d += 2) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+
+  return (center - (bw >> 1));
+}
+
+// A special fast version of motion search used in rt mode
+unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize, int mi_row,
+                                           int mi_col, const MV *ref_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+  DECLARE_ALIGNED(16, int16_t, hbuf[256]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[256]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[128]);
+  int idx;
+  const int bw = 4 << mi_size_wide_log2[bsize];
+  const int bh = 4 << mi_size_high_log2[bsize];
+  const int search_width = bw << 1;
+  const int search_height = bh << 1;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  int_mv *best_int_mv = &xd->mi[0]->mv[0];
+  unsigned int best_sad, tmp_sad, this_sad[4];
+  const int norm_factor = 3 + (bw >> 5);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+  static const MV search_pos[4] = {
+    { -1, 0 },
+    { 0, -1 },
+    { 0, 1 },
+    { 1, 0 },
+  };
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+    av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+                         MAX_MB_PLANE);
+  }
+
+  if (xd->bd != 8) {
+    unsigned int sad;
+    best_int_mv->as_fullmv = kZeroFullMv;
+    sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                 xd->plane[0].pre[0].buf, ref_stride);
+
+    if (scaled_ref_frame) {
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+    }
+    return sad;
+  }
+
+  // Set up prediction 1-D reference set
+  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+  for (idx = 0; idx < search_width; idx += 16) {
+    aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    ref_buf += 16;
+  }
+
+  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+  for (idx = 0; idx < search_height; ++idx) {
+    vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
+    ref_buf += ref_stride;
+  }
+
+  // Set up src 1-D reference set
+  for (idx = 0; idx < bw; idx += 16) {
+    src_buf = x->plane[0].src.buf + idx;
+    aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+  }
+
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < bh; ++idx) {
+    src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
+    src_buf += src_stride;
+  }
+
+  // Find the best match per 1-D search
+  best_int_mv->as_fullmv.col =
+      vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]);
+  best_int_mv->as_fullmv.row =
+      vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]);
+
+  FULLPEL_MV this_mv = best_int_mv->as_fullmv;
+  src_buf = x->plane[0].src.buf;
+  ref_buf = get_buf_from_mv(&xd->plane[0].pre[0], &this_mv);
+  best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+  {
+    const uint8_t *const pos[4] = {
+      ref_buf - ref_stride,
+      ref_buf - 1,
+      ref_buf + 1,
+      ref_buf + ref_stride,
+    };
+
+    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+  }
+
+  for (idx = 0; idx < 4; ++idx) {
+    if (this_sad[idx] < best_sad) {
+      best_sad = this_sad[idx];
+      best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
+      best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
+    }
+  }
+
+  if (this_sad[0] < this_sad[3])
+    this_mv.row -= 1;
+  else
+    this_mv.row += 1;
+
+  if (this_sad[1] < this_sad[2])
+    this_mv.col -= 1;
+  else
+    this_mv.col += 1;
+
+  ref_buf = get_buf_from_mv(&xd->plane[0].pre[0], &this_mv);
+
+  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+  if (best_sad > tmp_sad) {
+    best_int_mv->as_fullmv = this_mv;
+    best_sad = tmp_sad;
+  }
+
+  convert_fullmv_to_mv(best_int_mv);
+
+  SubpelMvLimits subpel_mv_limits;
+  av1_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv);
+  clamp_mv(&best_int_mv->as_mv, &subpel_mv_limits);
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  return best_sad;
+}
+
+// =============================================================================
+//  Subpixel Motion Search: Translational
+// =============================================================================
+#define INIT_SUBPEL_STEP_SIZE (4)
+/*
+ * To avoid the penalty for crossing cache-line read, preload the reference
+ * area in a small buffer, which is aligned to make sure there won't be crossing
+ * cache-line read while reading from this buffer. This reduced the cpu
+ * cycles spent on reading ref data in sub-pixel filter functions.
+ * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
+ * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
+ * could reduce the area.
+ */
+
+// Returns the subpel offset used by various subpel variance functions [m]sv[a]f
+static INLINE int sp(int x) { return x & 7; }
+
+// Gets the address of the ref buffer at subpel location (r, c), rounded to the
+// nearest fullpel precision toward - \infty
+static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
+  const int offset = (r >> 3) * stride + (c >> 3);
+  return buf + offset;
+}
+
+// Estimates the variance of prediction residue using bilinear filter for fast
+// search.
+static INLINE int estimated_pref_error(
+    const MV *this_mv, const uint8_t *src, const int src_stride,
+    const uint8_t *ref, int ref_stride,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params, unsigned int *sse) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+  const uint8_t *second_pred = var_params->second_pred;
+  const uint8_t *mask = var_params->mask;
+  const int mask_stride = var_params->mask_stride;
+  const int invert_mask = var_params->invert_mask;
+  const int r = this_mv->row;
+  const int c = this_mv->col;
+
+  if (second_pred == NULL) {
+    return vfp->svf(pre(ref, ref_stride, r, c), ref_stride, sp(c), sp(r), src,
+                    src_stride, sse);
+  } else if (mask) {
+    return vfp->msvf(pre(ref, ref_stride, r, c), ref_stride, sp(c), sp(r), src,
+                     src_stride, second_pred, mask, mask_stride, invert_mask,
+                     sse);
+  } else {
+    return vfp->svaf(pre(ref, ref_stride, r, c), ref_stride, sp(c), sp(r), src,
+                     src_stride, sse, second_pred);
+  }
+}
+
+// Calculates the variance of prediction residue.
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
+                                const MV *this_mv, const uint8_t *src,
+                                int src_stride, const uint8_t *ref,
+                                int ref_stride,
+                                const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+                                unsigned int *sse) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+  const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
+  const uint8_t *second_pred = var_params->second_pred;
+  const uint8_t *mask = var_params->mask;
+  const int mask_stride = var_params->mask_stride;
+  const int invert_mask = var_params->invert_mask;
+  const int w = var_params->w;
+  const int h = var_params->h;
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int subpel_x_q3 = sp(this_mv->col);
+  const int subpel_y_q3 = sp(this_mv->row);
+  unsigned int besterr;
+  ref = pre(ref, ref_stride, this_mv->row, this_mv->col);
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
+    if (second_pred != NULL) {
+      if (mask) {
+        aom_highbd_comp_mask_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+            invert_mask, xd->bd, subpel_search_type);
+      } else {
+        aom_highbd_comp_avg_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+            subpel_search_type);
+      }
+    } else {
+      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+                                subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                                xd->bd, subpel_search_type);
+    }
+    besterr = vfp->vf(pred8, w, src, src_stride, sse);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    if (second_pred != NULL) {
+      if (mask) {
+        aom_comp_mask_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+            invert_mask, subpel_search_type);
+      } else {
+        aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                    second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                    ref, ref_stride, subpel_search_type);
+      }
+    } else {
+      aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h,
+                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                         subpel_search_type);
+    }
+
+    besterr = vfp->vf(pred, w, src, src_stride, sse);
+  }
+#else
+  DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+  if (second_pred != NULL) {
+    if (mask) {
+      aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                   second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                   ref, ref_stride, mask, mask_stride,
+                                   invert_mask, subpel_search_type);
+    } else {
+      aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                  second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                  ref, ref_stride, subpel_search_type);
+    }
+  } else {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+                       subpel_y_q3, ref, ref_stride, subpel_search_type);
+  }
+
+  besterr = vfp->vf(pred, w, src, src_stride, sse);
+#endif
+  return besterr;
+}
+
+// Estimates whether this_mv is better than best_mv. This function incorporates
+// both prediction error and residue into account. It is suffixed "fast" because
+// it uses bilinear filter to estimate the prediction.
+static INLINE unsigned int check_better_fast(
+    const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    int ref_stride, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *has_better_mv) {
+  unsigned int cost;
+  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+    unsigned int sse;
+    int thismse = estimated_pref_error(this_mv, src, src_stride, ref,
+                                       ref_stride, var_params, &sse);
+    cost = mv_err_cost_(this_mv, mv_cost_params);
+    cost += thismse;
+
+    if (cost < *besterr) {
+      *besterr = cost;
+      *best_mv = *this_mv;
+      *distortion = thismse;
+      *sse1 = sse;
+      *has_better_mv |= 1;
+    }
+  } else {
+    cost = INT_MAX;
+  }
+  return cost;
+}
+
+// Checks whether this_mv is better than best_mv. This function incorporates
+// both prediction error and residue into account.
+static AOM_FORCE_INLINE unsigned int check_better(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits, const uint8_t *const src,
+    const int src_stride, const uint8_t *const ref, int ref_stride,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *is_better) {
+  unsigned int cost;
+  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+    unsigned int sse;
+    int thismse;
+    thismse = upsampled_pref_error(xd, cm, this_mv, src, src_stride, ref,
+                                   ref_stride, var_params, &sse);
+    cost = mv_err_cost_(this_mv, mv_cost_params);
+    cost += thismse;
+    if (cost < *besterr) {
+      *besterr = cost;
+      *best_mv = *this_mv;
+      *distortion = thismse;
+      *sse1 = sse;
+      *is_better |= 1;
+    }
+  } else {
+    cost = INT_MAX;
+  }
+  return cost;
+}
+
+// Searches the four cardinal direction for a better mv, then follows up with a
+// search in the best quadrant. This uses bilinear filter to speed up the
+// calculation.
+static AOM_FORCE_INLINE int first_level_check_fast(
+    const MV *this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    int ref_stride, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion) {
+  // Check the four cardinal directions
+  const MV left_mv = { this_mv->row, this_mv->col - hstep };
+  int dummy = 0;
+  const unsigned int left = check_better_fast(
+      &left_mv, best_mv, mv_limits, src, src_stride, ref, ref_stride,
+      var_params, mv_cost_params, besterr, sse1, distortion, &dummy);
+
+  const MV right_mv = { this_mv->row, this_mv->col + hstep };
+  const unsigned int right = check_better_fast(
+      &right_mv, best_mv, mv_limits, src, src_stride, ref, ref_stride,
+      var_params, mv_cost_params, besterr, sse1, distortion, &dummy);
+
+  const MV top_mv = { this_mv->row - hstep, this_mv->col };
+  const unsigned int up = check_better_fast(
+      &top_mv, best_mv, mv_limits, src, src_stride, ref, ref_stride, var_params,
+      mv_cost_params, besterr, sse1, distortion, &dummy);
+
+  const MV bottom_mv = { this_mv->row + hstep, this_mv->col };
+  const unsigned int down = check_better_fast(
+      &bottom_mv, best_mv, mv_limits, src, src_stride, ref, ref_stride,
+      var_params, mv_cost_params, besterr, sse1, distortion, &dummy);
+
+  // Check the diagonal direction with the best mv
+  const int whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+  switch (whichdir) {
+    case 0: {
+      const MV top_left_mv = { this_mv->row - hstep, this_mv->col - hstep };
+      check_better_fast(&top_left_mv, best_mv, mv_limits, src, src_stride, ref,
+                        ref_stride, var_params, mv_cost_params, besterr, sse1,
+                        distortion, &dummy);
+      break;
+    }
+    case 1: {
+      const MV top_right_mv = { this_mv->row - hstep, this_mv->col + hstep };
+      check_better_fast(&top_right_mv, best_mv, mv_limits, src, src_stride, ref,
+                        ref_stride, var_params, mv_cost_params, besterr, sse1,
+                        distortion, &dummy);
+      break;
+    }
+    case 2: {
+      const MV bottom_left_mv = { this_mv->row + hstep, this_mv->col - hstep };
+      check_better_fast(&bottom_left_mv, best_mv, mv_limits, src, src_stride,
+                        ref, ref_stride, var_params, mv_cost_params, besterr,
+                        sse1, distortion, &dummy);
+      break;
+    }
+    case 3: {
+      const MV bottom_right_mv = { this_mv->row + hstep, this_mv->col + hstep };
+      check_better_fast(&bottom_right_mv, best_mv, mv_limits, src, src_stride,
+                        ref, ref_stride, var_params, mv_cost_params, besterr,
+                        sse1, distortion, &dummy);
+      break;
+    }
+  }
+  return whichdir;
+}
+
+// Performs a following up search after first_level_check_fast is called. This
+// performs two extra chess pattern searches in the best quadrant.
+static AOM_FORCE_INLINE void second_level_check_fast(
+    const MV *this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    int ref_stride, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int whichdir) {
+  const int tr = this_mv->row;
+  const int tc = this_mv->col;
+  const int br = best_mv->row;
+  const int bc = best_mv->col;
+  int dummy = 0;
+  if (tr != br && tc != bc) {
+    const int kr = br - tr;
+    const int kc = bc - tc;
+
+    const MV chess_mv_1 = { tr + kr, tc + 2 * kc };
+    check_better_fast(&chess_mv_1, best_mv, mv_limits, src, src_stride, ref,
+                      ref_stride, var_params, mv_cost_params, besterr, sse1,
+                      distortion, &dummy);
+
+    const MV chess_mv_2 = { tr + 2 * kr, tc + kc };
+    check_better_fast(&chess_mv_2, best_mv, mv_limits, src, src_stride, ref,
+                      ref_stride, var_params, mv_cost_params, besterr, sse1,
+                      distortion, &dummy);
+  } else if (tr == br && tc != bc) {
+    const int kc = bc - tc;
+    const MV bottom_long_mv = { tr + hstep, tc + 2 * kc };
+    check_better_fast(&bottom_long_mv, best_mv, mv_limits, src, src_stride, ref,
+                      ref_stride, var_params, mv_cost_params, besterr, sse1,
+                      distortion, &dummy);
+    const MV top_long_mv = { tr - hstep, tc + 2 * kc };
+    check_better_fast(&top_long_mv, best_mv, mv_limits, src, src_stride, ref,
+                      ref_stride, var_params, mv_cost_params, besterr, sse1,
+                      distortion, &dummy);
+
+    switch (whichdir) {
+      case 0:
+      case 1: {
+        const MV bottom_mv = { tr + hstep, tc + kc };
+        check_better_fast(&bottom_mv, best_mv, mv_limits, src, src_stride, ref,
+                          ref_stride, var_params, mv_cost_params, besterr, sse1,
+                          distortion, &dummy);
+        break;
+      }
+      case 2:
+      case 3: {
+        const MV top_mv = { tr - hstep, tc + kc };
+        check_better_fast(&top_mv, best_mv, mv_limits, src, src_stride, ref,
+                          ref_stride, var_params, mv_cost_params, besterr, sse1,
+                          distortion, &dummy);
+        break;
+      }
+    }
+  } else if (tr != br && tc == bc) {
+    const int kr = br - tr;
+    const MV right_long_mv = { tr + 2 * kr, tc + hstep };
+    check_better_fast(&right_long_mv, best_mv, mv_limits, src, src_stride, ref,
+                      ref_stride, var_params, mv_cost_params, besterr, sse1,
+                      distortion, &dummy);
+    const MV left_long_mv = { tr + 2 * kr, tc - hstep };
+    check_better_fast(&left_long_mv, best_mv, mv_limits, src, src_stride, ref,
+                      ref_stride, var_params, mv_cost_params, besterr, sse1,
+                      distortion, &dummy);
+
+    switch (whichdir) {
+      case 0:
+      case 2: {
+        const MV right_mv = { tr + kr, tc + hstep };
+        check_better_fast(&right_mv, best_mv, mv_limits, src, src_stride, ref,
+                          ref_stride, var_params, mv_cost_params, besterr, sse1,
+                          distortion, &dummy);
+        break;
+      }
+      case 1:
+      case 3: {
+        const MV left_mv = { tr + kr, tc - hstep };
+        check_better_fast(&left_mv, best_mv, mv_limits, src, src_stride, ref,
+                          ref_stride, var_params, mv_cost_params, besterr, sse1,
+                          distortion, &dummy);
+      }
+    }
+  }
+}
+
+// Combines first level check and second level check when applicable. This first
+// searches the four cardinal directions, and perform several
+// diagonal/chess-pattern searches in the best quadrant.
+static AOM_FORCE_INLINE void two_level_checks_fast(
+    const MV *this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    int ref_stride, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int iters) {
+  unsigned int whichdir = first_level_check_fast(
+      this_mv, best_mv, hstep, mv_limits, src, src_stride, ref, ref_stride,
+      var_params, mv_cost_params, besterr, sse1, distortion);
+  if (iters > 1) {
+    second_level_check_fast(this_mv, best_mv, hstep, mv_limits, src, src_stride,
+                            ref, ref_stride, var_params, mv_cost_params,
+                            besterr, sse1, distortion, whichdir);
+  }
+}
+
+// A newer version of second level check that gives better quality.
+// TODO(chiyotsai@google.com): evaluate this on subpel_search_types different
+// from av1_find_best_sub_pixel_tree
+static AOM_FORCE_INLINE void second_level_check_v2(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *diag_mv, MV *best_mv,
+    int kr, int kc, const SubpelMvLimits *mv_limits, const uint8_t *const src,
+    const int src_stride, const uint8_t *const ref, int ref_stride,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion) {
+  const MV center_mv = *best_mv;
+
+  assert(diag_mv->row == best_mv->row || diag_mv->col == best_mv->col);
+  if (best_mv->row == diag_mv->row && best_mv->col != diag_mv->col) {
+    kc = best_mv->col - diag_mv->col;
+  } else if (best_mv->row != diag_mv->row && best_mv->col == diag_mv->col) {
+    kr = best_mv->row - diag_mv->row;
+  }
+
+  const MV row_bias_mv = { center_mv.row + kr, center_mv.col };
+  const MV col_bias_mv = { center_mv.row, center_mv.col + kc };
+  const MV diag_bias_mv = { center_mv.row + kr, center_mv.col + kc };
+  int has_better_mv = 0;
+
+  if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+    check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, src, src_stride, ref,
+                 ref_stride, var_params, mv_cost_params, besterr, sse1,
+                 distortion, &has_better_mv);
+    check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, src, src_stride, ref,
+                 ref_stride, var_params, mv_cost_params, besterr, sse1,
+                 distortion, &has_better_mv);
+
+    // Do an additional search if the second iteration gives a better mv
+    if (has_better_mv) {
+      int dummy = 0;
+      check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, src, src_stride,
+                   ref, ref_stride, var_params, mv_cost_params, besterr, sse1,
+                   distortion, &dummy);
+    }
+  } else {
+    check_better_fast(&row_bias_mv, best_mv, mv_limits, src, src_stride, ref,
+                      ref_stride, var_params, mv_cost_params, besterr, sse1,
+                      distortion, &has_better_mv);
+    check_better_fast(&col_bias_mv, best_mv, mv_limits, src, src_stride, ref,
+                      ref_stride, var_params, mv_cost_params, besterr, sse1,
+                      distortion, &has_better_mv);
+
+    // Do an additional search if the second iteration gives a better mv
+    if (has_better_mv) {
+      int dummy = 0;
+      check_better_fast(&diag_bias_mv, best_mv, mv_limits, src, src_stride, ref,
+                        ref_stride, var_params, mv_cost_params, besterr, sse1,
+                        distortion, &dummy);
+    }
+  }
+}
+
+// Gets the error at the beginning when the mv has fullpel precision
+static unsigned int setup_center_error(
+    const MACROBLOCKD *xd, const MV *bestmv, const uint8_t *const src,
+    const int src_stride, const uint8_t *y, int y_stride,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+  const uint8_t *second_pred = var_params->second_pred;
+  const uint8_t *mask = var_params->mask;
+  const int mask_stride = var_params->mask_stride;
+  const int invert_mask = var_params->invert_mask;
+  const int w = var_params->w;
+  const int h = var_params->h;
+
+  unsigned int besterr;
+  y = pre(y, y_stride, bestmv->row, bestmv->col);
+
+  if (second_pred != NULL) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (is_cur_buf_hbd(xd)) {
+      DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+      uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
+      if (mask) {
+        aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride,
+                                  mask, mask_stride, invert_mask);
+      } else {
+        aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+      }
+      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+    } else {
+      DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+      if (mask) {
+        aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+                           mask_stride, invert_mask);
+      } else {
+        aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+      }
+      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+    }
+#else
+    (void)xd;
+    DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+    if (mask) {
+      aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+                         mask_stride, invert_mask);
+    } else {
+      aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+    }
+    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+#endif
+  } else {
+    besterr = vfp->vf(y, y_stride, src, src_stride, sse1);
+  }
+  *distortion = besterr;
+  besterr += mv_err_cost_(bestmv, mv_cost_params);
+  return besterr;
+}
+
+// Gets the error at the beginning when the mv has fullpel precision
+static unsigned int upsampled_setup_center_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv,
+    const uint8_t *const src, const int src_stride, const uint8_t *const y,
+    int y_stride, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, src, src_stride,
+                                              y, y_stride, var_params, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost_(bestmv, mv_cost_params);
+  return besterr;
+}
+
+static INLINE int divide_and_round(int n, int d) {
+  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
+
+static INLINE int is_cost_list_wellbehaved(const int *cost_list) {
+  return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] &&
+         cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4];
+}
+
+// Returns surface minima estimate at given precision in 1/2^n bits.
+// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minima (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic,
+                                         int bits) {
+  *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
+                         (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
+  *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
+                         (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+}
+
+int av1_find_best_sub_pixel_tree_pruned_evenmore(
+    MACROBLOCK *x, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, int *distortion,
+    unsigned int *sse1) {
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  const uint8_t *const src_address = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int besterr = INT_MAX;
+  const uint8_t *const ref_address = xd->plane[0].pre[0].buf;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+
+  convert_fullmv_to_mv(&x->best_mv);
+  MV *bestmv = &x->best_mv.as_mv;
+  MV start_mv = *bestmv;
+
+  int hstep = INIT_SUBPEL_STEP_SIZE;
+
+  SubpelMvLimits mv_limits;
+  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits,
+                                 mv_cost_params->ref_mv);
+
+  (void)cm;
+
+  besterr = setup_center_error(xd, bestmv, src_address, src_stride, ref_address,
+                               ref_stride, var_params, mv_cost_params, sse1,
+                               distortion);
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    int dummy = 0;
+    get_cost_surf_min(cost_list, &ir, &ic, 2);
+    if (ir != 0 || ic != 0) {
+      const MV this_mv = { start_mv.row + 2 * ir, start_mv.col + 2 * ic };
+      check_better_fast(&this_mv, bestmv, &mv_limits, src_address, src_stride,
+                        ref_address, ref_stride, var_params, mv_cost_params,
+                        &besterr, sse1, distortion, &dummy);
+    }
+  } else {
+    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+
+    // Each subsequent iteration checks at least one point in common with
+    // the last iteration could be 2 ( if diag selected) 1/4 pel
+    if (forced_stop != HALF_PEL) {
+      hstep >>= 1;
+      start_mv = *bestmv;
+      two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
+                            src_stride, ref_address, ref_stride, var_params,
+                            mv_cost_params, &besterr, sse1, distortion,
+                            iters_per_step);
+    }
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned_more(
+    MACROBLOCK *x, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, int *distortion,
+    unsigned int *sse1) {
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  const uint8_t *const src_address = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int besterr = INT_MAX;
+  const uint8_t *const ref_address = xd->plane[0].pre[0].buf;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+
+  convert_fullmv_to_mv(&x->best_mv);
+  MV *bestmv = &x->best_mv.as_mv;
+  MV start_mv = *bestmv;
+
+  int hstep = INIT_SUBPEL_STEP_SIZE;
+
+  SubpelMvLimits mv_limits;
+  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits,
+                                 mv_cost_params->ref_mv);
+
+  (void)cm;
+
+  besterr = setup_center_error(xd, bestmv, src_address, src_stride, ref_address,
+                               ref_stride, var_params, mv_cost_params, sse1,
+                               distortion);
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    get_cost_surf_min(cost_list, &ir, &ic, 1);
+    if (ir != 0 || ic != 0) {
+      const MV this_mv = { start_mv.row + ir * hstep,
+                           start_mv.col + ic * hstep };
+      int dummy = 0;
+      check_better_fast(&this_mv, bestmv, &mv_limits, src_address, src_stride,
+                        ref_address, ref_stride, var_params, mv_cost_params,
+                        &besterr, sse1, distortion, &dummy);
+    }
+  } else {
+    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+  if (forced_stop != HALF_PEL) {
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned(
+    MACROBLOCK *x, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, int *distortion,
+    unsigned int *sse1) {
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  const uint8_t *const src_address = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int besterr = INT_MAX;
+  const uint8_t *const ref_address = xd->plane[0].pre[0].buf;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+
+  convert_fullmv_to_mv(&x->best_mv);
+  MV *bestmv = &x->best_mv.as_mv;
+  MV start_mv = *bestmv;
+
+  int hstep = INIT_SUBPEL_STEP_SIZE;
+
+  SubpelMvLimits mv_limits;
+  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits,
+                                 mv_cost_params->ref_mv);
+  (void)cm;
+
+  besterr = setup_center_error(xd, bestmv, src_address, src_stride, ref_address,
+                               ref_stride, var_params, mv_cost_params, sse1,
+                               distortion);
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX) {
+    const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+                                  (cost_list[2] < cost_list[4] ? 0 : 2);
+
+    const MV left_mv = { start_mv.row, start_mv.col - hstep };
+    const MV right_mv = { start_mv.row, start_mv.col + hstep };
+    const MV bottom_mv = { start_mv.row + hstep, start_mv.col };
+    const MV top_mv = { start_mv.row - hstep, start_mv.col };
+
+    const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep };
+    const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep };
+    const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep };
+    const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep };
+
+    int dummy = 0;
+
+    switch (whichdir) {
+      case 0:  // bottom left quadrant
+        check_better_fast(&left_mv, bestmv, &mv_limits, src_address, src_stride,
+                          ref_address, ref_stride, var_params, mv_cost_params,
+                          &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_mv, bestmv, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_left_mv, bestmv, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 1:  // bottom right quadrant
+        check_better_fast(&right_mv, bestmv, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_mv, bestmv, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_right_mv, bestmv, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 2:  // top left quadrant
+        check_better_fast(&left_mv, bestmv, &mv_limits, src_address, src_stride,
+                          ref_address, ref_stride, var_params, mv_cost_params,
+                          &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_mv, bestmv, &mv_limits, src_address, src_stride,
+                          ref_address, ref_stride, var_params, mv_cost_params,
+                          &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_left_mv, bestmv, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 3:  // top right quadrant
+        check_better_fast(&right_mv, bestmv, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_mv, bestmv, &mv_limits, src_address, src_stride,
+                          ref_address, ref_stride, var_params, mv_cost_params,
+                          &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_right_mv, bestmv, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+    }
+  } else {
+    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+  if (forced_stop != HALF_PEL) {
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(&start_mv, bestmv, hstep, &mv_limits, src_address,
+                          src_stride, ref_address, ref_stride, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+/* clang-format off */
+static const MV search_step_table[12] = {
+  // left, right, up, down
+  { 0, -INIT_SUBPEL_STEP_SIZE },        { 0, INIT_SUBPEL_STEP_SIZE },
+  { -INIT_SUBPEL_STEP_SIZE, 0 },        { INIT_SUBPEL_STEP_SIZE, 0 },
+  { 0, -(INIT_SUBPEL_STEP_SIZE >> 1) }, { 0, (INIT_SUBPEL_STEP_SIZE >> 1) },
+  { -(INIT_SUBPEL_STEP_SIZE >> 1), 0 }, { (INIT_SUBPEL_STEP_SIZE >> 1), 0 },
+  { 0, -(INIT_SUBPEL_STEP_SIZE >> 2) }, { 0, (INIT_SUBPEL_STEP_SIZE >> 2) },
+  { -(INIT_SUBPEL_STEP_SIZE >> 2), 0 }, { (INIT_SUBPEL_STEP_SIZE >> 2), 0 }
+};
+/* clang-format on */
+
+int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const AV1_COMMON *const cm,
+                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 int *distortion, unsigned int *sse1) {
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int do_reset_fractional_mv = ms_params->do_reset_fractional_mv;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+  const MV *ref_mv = mv_cost_params->ref_mv;
+  const SUBPEL_SEARCH_TYPE subpel_search_type =
+      ms_params->var_params.subpel_search_type;
+
+  MACROBLOCKD *xd = &x->e_mbd;
+  const uint8_t *const src_address = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *const ref_address = xd->plane[0].pre[0].buf;
+
+  convert_fullmv_to_mv(&x->best_mv);
+  MV *bestmv = &x->best_mv.as_mv;
+
+  SubpelMvLimits mv_limits;
+  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, ref_mv);
+
+  int hstep = INIT_SUBPEL_STEP_SIZE;
+  int iter, round = FULL_PEL - forced_stop;
+  const MV *search_step = search_step_table;
+  unsigned int cost_array[5];
+  unsigned int besterr = INT_MAX;
+
+  if (!allow_hp)
+    if (round == 3) round = 2;
+
+  if (subpel_search_type != USE_2_TAPS_ORIG) {
+    besterr = upsampled_setup_center_error(
+        xd, cm, bestmv, src_address, src_stride, ref_address, ref_stride,
+        var_params, mv_cost_params, sse1, distortion);
+  } else {
+    besterr = setup_center_error(xd, bestmv, src_address, src_stride,
+                                 ref_address, ref_stride, var_params,
+                                 mv_cost_params, sse1, distortion);
+  }
+
+  if (do_reset_fractional_mv) {
+    av1_set_fractional_mv(x->fractional_best_mv);
+  }
+
+  MV iter_center_mv = *bestmv;
+  for (iter = 0; iter < round; ++iter) {
+    if (x->fractional_best_mv[iter].as_mv.row == iter_center_mv.row &&
+        x->fractional_best_mv[iter].as_mv.col == iter_center_mv.col)
+      return INT_MAX;
+
+    x->fractional_best_mv[iter].as_mv = iter_center_mv;
+
+    MV best_iter_mv = iter_center_mv;
+    int iter_best_idx = -1;
+
+    // Check vertical and horizontal sub-pixel positions.
+    for (int idx = 0; idx < 4; ++idx) {
+      const MV this_mv = { iter_center_mv.row + search_step[idx].row,
+                           iter_center_mv.col + search_step[idx].col };
+
+      int has_better_mv = 0;
+      if (subpel_search_type != USE_2_TAPS_ORIG) {
+        cost_array[idx] = check_better(
+            xd, cm, &this_mv, &best_iter_mv, &mv_limits, src_address,
+            src_stride, ref_address, ref_stride, var_params, mv_cost_params,
+            &besterr, sse1, distortion, &has_better_mv);
+      } else {
+        cost_array[idx] = check_better_fast(
+            &this_mv, &best_iter_mv, &mv_limits, src_address, src_stride,
+            ref_address, ref_stride, var_params, mv_cost_params, &besterr, sse1,
+            distortion, &has_better_mv);
+      }
+      if (has_better_mv) {
+        iter_best_idx = idx;
+      }
+    }
+
+    // Check diagonal sub-pixel position
+    const MV diag_step = { (cost_array[2] <= cost_array[3] ? -hstep : hstep),
+                           (cost_array[0] <= cost_array[1] ? -hstep : hstep) };
+    const MV diag_mv = { iter_center_mv.row + diag_step.row,
+                         iter_center_mv.col + diag_step.col };
+    int has_better_mv = 0;
+    if (subpel_search_type != USE_2_TAPS_ORIG) {
+      cost_array[4] = check_better(xd, cm, &diag_mv, &best_iter_mv, &mv_limits,
+                                   src_address, src_stride, ref_address,
+                                   ref_stride, var_params, mv_cost_params,
+                                   &besterr, sse1, distortion, &has_better_mv);
+    } else {
+      cost_array[4] = check_better_fast(
+          &diag_mv, &best_iter_mv, &mv_limits, src_address, src_stride,
+          ref_address, ref_stride, var_params, mv_cost_params, &besterr, sse1,
+          distortion, &has_better_mv);
+    }
+    if (has_better_mv) {
+      iter_best_idx = 4;
+    }
+
+    if (iter_best_idx != -1) {
+      iter_center_mv = best_iter_mv;
+
+      if (iters_per_step > 1) {
+        second_level_check_v2(xd, cm, &diag_mv, &iter_center_mv, diag_step.row,
+                              diag_step.col, &mv_limits, src_address,
+                              src_stride, ref_address, ref_stride, var_params,
+                              mv_cost_params, &besterr, sse1, distortion);
+      }
+    }
+
+    search_step += 4;
+    hstep >>= 1;
+  }
+
+  *bestmv = iter_center_mv;
+
+  return besterr;
+}
+
 // Note(yunqingwang): The following 2 functions are only used in the motion
 // vector unit test, which return extreme motion vectors allowed by the MV
 // limits.
-// Return the maximum MV.
+
+// Returns the maximum MV.
 int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm,
                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
                                 int *distortion, unsigned int *sse1) {
@@ -3282,6 +2842,7 @@
   lower_mv_precision(bestmv, allow_hp, 0);
   return besterr;
 }
+
 // Return the minimum MV.
 int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm,
                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
@@ -3309,113 +2870,491 @@
   return besterr;
 }
 
-void av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
-                              int mi_col, BLOCK_SIZE bsize, int ref,
-                              FULLPEL_MV start_mv, int num_planes,
-                              int use_subpixel) {
-  assert(num_planes == 1 &&
-         "Currently simple_motion_search only supports luma plane");
-  assert(!frame_is_intra_only(&cpi->common) &&
-         "Simple motion search only enabled for non-key frames");
-  AV1_COMMON *const cm = &cpi->common;
+// Refine MV in a small range
+unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
+                                  BLOCK_SIZE bsize, int *pts0, int *pts_inref0,
+                                  int total_samples) {
+  const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-
-  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
-
   MB_MODE_INFO *mbmi = xd->mi[0];
-  mbmi->sb_type = bsize;
-  mbmi->ref_frame[0] = ref;
-  mbmi->ref_frame[1] = NONE_FRAME;
-  mbmi->motion_mode = SIMPLE_TRANSLATION;
-  mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+  const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 },
+                            { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } };
+  const int_mv ref_mv = av1_get_ref_mv(x, 0);
+  int16_t br = mbmi->mv[0].as_mv.row;
+  int16_t bc = mbmi->mv[0].as_mv.col;
+  int16_t *tr = &mbmi->mv[0].as_mv.row;
+  int16_t *tc = &mbmi->mv[0].as_mv.col;
+  WarpedMotionParams best_wm_params = mbmi->wm_params;
+  int best_num_proj_ref = mbmi->num_proj_ref;
+  unsigned int bestmse;
+  SubpelMvLimits mv_limits;
 
-  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
-  const YV12_BUFFER_CONFIG *scaled_ref_frame =
-      av1_get_scaled_ref_frame(cpi, ref);
-  struct buf_2d backup_yv12;
-  // ref_mv is used to calculate the cost of the motion vector
-  const MV ref_mv = kZeroMv;
-  const int step_param = cpi->mv_step_param;
-  const FullMvLimits tmp_mv_limits = x->mv_limits;
-  const SEARCH_METHODS search_methods = cpi->sf.mv_sf.search_method;
-  const int do_mesh_search = 0;
-  const int sadpb = x->sadperbit16;
-  int cost_list[5];
-  const int ref_idx = 0;
-  int var;
+  const int start = cm->allow_high_precision_mv ? 0 : 4;
+  int ite;
 
-  av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
-                       get_ref_scale_factors(cm, ref), num_planes);
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-  if (scaled_ref_frame) {
-    backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
-    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
-                         num_planes);
+  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, &ref_mv.as_mv);
+
+  // Calculate the center position's error
+  assert(av1_is_subpelmv_in_range(&mv_limits, mbmi->mv[0].as_mv));
+  bestmse = av1_compute_motion_cost(cpi, x, bsize, &mbmi->mv[0].as_mv);
+
+  // MV search
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  for (ite = 0; ite < 2; ++ite) {
+    int best_idx = -1;
+    int idx;
+
+    for (idx = start; idx < start + 4; ++idx) {
+      unsigned int thismse;
+
+      *tr = br + neighbors[idx].row;
+      *tc = bc + neighbors[idx].col;
+
+      MV this_mv = { *tr, *tc };
+      if (av1_is_subpelmv_in_range(&mv_limits, this_mv)) {
+        int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+
+        memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+        memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+        if (total_samples > 1)
+          mbmi->num_proj_ref =
+              av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
+
+        if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, *tr,
+                                 *tc, &mbmi->wm_params, mi_row, mi_col)) {
+          thismse = av1_compute_motion_cost(cpi, x, bsize, &this_mv);
+
+          if (thismse < bestmse) {
+            best_idx = idx;
+            best_wm_params = mbmi->wm_params;
+            best_num_proj_ref = mbmi->num_proj_ref;
+            bestmse = thismse;
+          }
+        }
+      }
+    }
+
+    if (best_idx == -1) break;
+
+    if (best_idx >= 0) {
+      br += neighbors[best_idx].row;
+      bc += neighbors[best_idx].col;
+    }
   }
 
-  // This overwrites the mv_limits so we will need to restore it later.
-  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
-  var = av1_full_pixel_search(
-      cpi, x, bsize, &start_mv, step_param, search_methods, do_mesh_search,
-      sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
-      mi_col * MI_SIZE, mi_row * MI_SIZE, 0, &cpi->ss_cfg[SS_CFG_SRC], 0);
-  // Restore
-  x->mv_limits = tmp_mv_limits;
-
-  const int use_subpel_search =
-      var < INT_MAX && !cpi->common.cur_frame_force_integer_mv && use_subpixel;
-  if (scaled_ref_frame) {
-    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
-  }
-  if (use_subpel_search) {
-    int not_used = 0;
-
-    const uint8_t *second_pred = NULL;
-    const uint8_t *mask = NULL;
-    const int mask_stride = 0;
-    const int invert_mask = 0;
-    const int reset_fractional_mv = 1;
-    SUBPEL_MOTION_SEARCH_PARAMS ms_params;
-    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
-                                      cost_list, second_pred, mask, mask_stride,
-                                      invert_mask, reset_fractional_mv);
-
-    cpi->find_fractional_mv_step(x, cm, &ms_params, &not_used,
-                                 &x->pred_sse[ref]);
-  } else {
-    // Manually convert from units of pixel to 1/8-pixels if we are not doing
-    // subpel search
-    x->best_mv.as_mv = get_mv_from_fullmv(&x->best_mv.as_fullmv);
-  }
-
-  mbmi->mv[0] = x->best_mv;
-
-  // Get a copy of the prediction output
-  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
-                                AOM_PLANE_Y, AOM_PLANE_Y);
-
-  aom_clear_system_state();
-
-  if (scaled_ref_frame) {
-    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
-  }
+  *tr = br;
+  *tc = bc;
+  mbmi->wm_params = best_wm_params;
+  mbmi->num_proj_ref = best_num_proj_ref;
+  return bestmse;
 }
 
-void av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
-                               int mi_col, BLOCK_SIZE bsize,
-                               const FULLPEL_MV start_mv, int use_subpixel,
-                               unsigned int *sse, unsigned int *var) {
+// =============================================================================
+//  Subpixel Motion Search: OBMC
+// =============================================================================
+/* returns subpixel variance error function */
+#define DIST(r, c)                                                             \
+  vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), src_address, mask, \
+            &sse)
+
+/* checks if (r, c) has better score than previous best */
+#define MVC(diff_mv)                                                          \
+  (unsigned int)(mvcost                                                       \
+                     ? (mv_cost((diff_mv), mvjcost, mvcost) * error_per_bit + \
+                        4096) >>                                              \
+                           13                                                 \
+                     : 0)
+
+#define CHECK_BETTER(v, r, c)                                  \
+  {                                                            \
+    const MV this_mv = { r, c };                               \
+    if (av1_is_subpelmv_in_range(&mv_limits, this_mv)) {       \
+      const MV diff_mv = { r - ref_mv->row, c - ref_mv->col }; \
+      thismse = (DIST(r, c));                                  \
+      if ((v = MVC(&diff_mv) + thismse) < besterr) {           \
+        besterr = v;                                           \
+        br = r;                                                \
+        bc = c;                                                \
+        *distortion = thismse;                                 \
+        *sse1 = sse;                                           \
+      }                                                        \
+    } else {                                                   \
+      v = INT_MAX;                                             \
+    }                                                          \
+  }
+
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c)                                              \
+  {                                                                         \
+    const MV this_mv = { r, c };                                            \
+    if (av1_is_subpelmv_in_range(&mv_limits, this_mv)) {                    \
+      thismse = upsampled_obmc_pref_error(                                  \
+          xd, cm, &this_mv, mask, vfp, src_address, pre(y, y_stride, r, c), \
+          y_stride, sp(c), sp(r), w, h, &sse, subpel_search_type);          \
+      v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,     \
+                      mv_cost_type);                                        \
+      if ((v + thismse) < besterr) {                                        \
+        besterr = v + thismse;                                              \
+        br = r;                                                             \
+        bc = c;                                                             \
+        *distortion = thismse;                                              \
+        *sse1 = sse;                                                        \
+      }                                                                     \
+    } else {                                                                \
+      v = INT_MAX;                                                          \
+    }                                                                       \
+  }
+
+static unsigned int setup_obmc_center_error(
+    const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
+    const uint8_t *const y, int y_stride, const int *mvjcost,
+    const int *const mvcost[2], unsigned int *sse1, int *distortion,
+    MV_COST_TYPE mv_cost_type) {
+  unsigned int besterr;
+  besterr = vfp->ovf(y, y_stride, wsrc, mask, sse1);
+  *distortion = besterr;
+  besterr +=
+      mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit, mv_cost_type);
+  return besterr;
+}
+
+static int upsampled_obmc_pref_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *const mv,
+    const int32_t *mask, const aom_variance_fn_ptr_t *vfp,
+    const int32_t *const wsrc, const uint8_t *const y, int y_stride,
+    int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse,
+    int subpel_search) {
+  unsigned int besterr;
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
+                              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+                              subpel_search);
+    besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
+  } else {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
+                       subpel_y_q3, y, y_stride, subpel_search);
+
+    besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+  }
+#else
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
+                     subpel_y_q3, y, y_stride, subpel_search);
+
+  besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+#endif
+  return besterr;
+}
+// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of
+// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
+// later in the same way.
+#define SECOND_LEVEL_CHECKS_BEST(k)                \
+  {                                                \
+    unsigned int second;                           \
+    int br0 = br;                                  \
+    int bc0 = bc;                                  \
+    assert(tr == br || tc == bc);                  \
+    if (tr == br && tc != bc) {                    \
+      kc = bc - tc;                                \
+    } else if (tr != br && tc == bc) {             \
+      kr = br - tr;                                \
+    }                                              \
+    CHECK_BETTER##k(second, br0 + kr, bc0);        \
+    CHECK_BETTER##k(second, br0, bc0 + kc);        \
+    if (br0 != br || bc0 != bc) {                  \
+      CHECK_BETTER##k(second, br0 + kr, bc0 + kc); \
+    }                                              \
+    (void)second;                                  \
+  }
+
+#define UNPACK_OBMC_MS_PARAMS                                               \
+  const int allow_hp = ms_params->allow_hp;                                 \
+  const int forced_stop = ms_params->forced_stop;                           \
+  const int iters_per_step = ms_params->iters_per_step;                     \
+  const MV *ref_mv = ms_params->mv_cost_params.ref_mv;                      \
+  const int *mvjcost = ms_params->mv_cost_params.mvjcost;                   \
+  const int *const *mvcost = ms_params->mv_cost_params.mvcost;              \
+  const int error_per_bit = ms_params->mv_cost_params.error_per_bit;        \
+  const MV_COST_TYPE mv_cost_type = ms_params->mv_cost_params.mv_cost_type; \
+  const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp;             \
+  const SUBPEL_SEARCH_TYPE subpel_search_type =                             \
+      ms_params->var_params.subpel_search_type;                             \
+  const int w = ms_params->var_params.w;                                    \
+  const int h = ms_params->var_params.h;
+
+static unsigned int upsampled_setup_obmc_center_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const int32_t *mask,
+    const MV *bestmv, const MV *ref_mv, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
+    const uint8_t *const y, int y_stride, int w, int h, const int *mvjcost,
+    const int *const mvcost[2], unsigned int *sse1, int *distortion,
+    int subpel_search, MV_COST_TYPE mv_cost_type) {
+  unsigned int besterr =
+      upsampled_obmc_pref_error(xd, cm, bestmv, mask, vfp, wsrc, y, y_stride, 0,
+                                0, w, h, sse1, subpel_search);
+  *distortion = besterr;
+  besterr +=
+      mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit, mv_cost_type);
+  return besterr;
+}
+
+int av1_find_best_obmc_sub_pixel_tree_up(
+    MACROBLOCK *x, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, int *distortion,
+    unsigned int *sse1) {
+  UNPACK_OBMC_MS_PARAMS;
+  const int32_t *wsrc = x->wsrc_buf;
+  const int32_t *mask = x->mask_buf;
+
+  const int32_t *const src_address = wsrc;
   MACROBLOCKD *xd = &x->e_mbd;
-  const MV_REFERENCE_FRAME ref =
-      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  unsigned int thismse;
+  const int y_stride = pd->pre[0].stride;
+  const int offset = get_offset_from_mv(&x->best_mv.as_fullmv, y_stride);
+  const uint8_t *y = pd->pre[0].buf;
+  convert_fullmv_to_mv(&x->best_mv);
+  MV *bestmv = &x->best_mv.as_mv;
 
-  av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, start_mv, 1,
-                           use_subpixel);
+  int br = bestmv->row;
+  int bc = bestmv->col;
+  int hstep = INIT_SUBPEL_STEP_SIZE;
+  int iter, round = FULL_PEL - forced_stop;
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;
+  int idx, best_idx = -1;
+  unsigned int cost_array[5];
+  int kr, kc;
 
-  const uint8_t *src = x->plane[0].src.buf;
+  SubpelMvLimits mv_limits;
+
+  av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, ref_mv);
+
+  if (!allow_hp)
+    if (round == 3) round = 2;
+
+  if (subpel_search_type != USE_2_TAPS_ORIG)
+    besterr = upsampled_setup_obmc_center_error(
+        xd, cm, mask, bestmv, ref_mv, error_per_bit, vfp, src_address,
+        y + offset, y_stride, w, h, mvjcost, mvcost, sse1, distortion,
+        subpel_search_type, mv_cost_type);
+  else
+    besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
+                                      src_address, y, y_stride, mvjcost, mvcost,
+                                      sse1, distortion, mv_cost_type);
+
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      MV this_mv = { tr, tc };
+      if (av1_is_subpelmv_in_range(&mv_limits, this_mv)) {
+        if (subpel_search_type != USE_2_TAPS_ORIG) {
+          thismse = upsampled_obmc_pref_error(
+              xd, cm, &this_mv, mask, vfp, src_address,
+              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+              subpel_search_type);
+        } else {
+          thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
+                              sp(tr), src_address, mask, &sse);
+        }
+
+        cost_array[idx] =
+            thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+                                  error_per_bit, mv_cost_type);
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
+    }
+
+    // Check diagonal sub-pixel position
+    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+    tc = bc + kc;
+    tr = br + kr;
+    {
+      MV this_mv = { tr, tc };
+      if (av1_is_subpelmv_in_range(&mv_limits, this_mv)) {
+        if (subpel_search_type != USE_2_TAPS_ORIG) {
+          thismse = upsampled_obmc_pref_error(
+              xd, cm, &this_mv, mask, vfp, src_address,
+              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+              subpel_search_type);
+        } else {
+          thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
+                              sp(tr), src_address, mask, &sse);
+        }
+
+        cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+                                              error_per_bit, mv_cost_type);
+
+        if (cost_array[4] < besterr) {
+          best_idx = 4;
+          besterr = cost_array[4];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
+    }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;
+      bc = tc;
+    }
+
+    if (iters_per_step > 1 && best_idx != -1) {
+      if (subpel_search_type != USE_2_TAPS_ORIG) {
+        SECOND_LEVEL_CHECKS_BEST(1);
+      } else {
+        SECOND_LEVEL_CHECKS_BEST(0);
+      }
+    }
+
+    tr = br;
+    tc = bc;
+
+    search_step += 4;
+    hstep >>= 1;
+    best_idx = -1;
+  }
+
+  // These lines insure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void)tr;
+  (void)tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  return besterr;
+}
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
+// =============================================================================
+//  Public cost function: mv_cost + pred error
+// =============================================================================
+int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
+                       const MV *ref_mv, const aom_variance_fn_ptr_t *vfp) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const MV mv = get_mv_from_fullmv(best_mv);
+  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
+  unsigned int sse, var;
+
+  var = vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+                in_what->stride, &sse);
+  (void)var;
+
+  return sse + mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
+                           CONVERT_TO_CONST_MVCOST(x->mv_cost_stack),
+                           x->errorperbit, mv_cost_type);
+}
+
+int av1_get_mvpred_var(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
+                       const MV *ref_mv, const aom_variance_fn_ptr_t *vfp) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const MV mv = get_mv_from_fullmv(best_mv);
+  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
+  unsigned int sse, var;
+
+  var = vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+                in_what->stride, &sse);
+
+  return var + mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
+                           CONVERT_TO_CONST_MVCOST(x->mv_cost_stack),
+                           x->errorperbit, mv_cost_type);
+}
+
+int av1_get_mvpred_av_var(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
+                          const MV *ref_mv, const uint8_t *second_pred,
+                          const aom_variance_fn_ptr_t *vfp,
+                          const struct buf_2d *src, const struct buf_2d *pre) {
+  const struct buf_2d *const what = src;
+  const struct buf_2d *const in_what = pre;
+  const MV mv = get_mv_from_fullmv(best_mv);
+  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
+  unsigned int unused;
+
+  return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
+                   what->buf, what->stride, &unused, second_pred) +
+         mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
+                     CONVERT_TO_CONST_MVCOST(x->mv_cost_stack), x->errorperbit,
+                     mv_cost_type);
+}
+
+int av1_get_mvpred_mask_var(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
+                            const MV *ref_mv, const uint8_t *second_pred,
+                            const uint8_t *mask, int mask_stride,
+                            int invert_mask, const aom_variance_fn_ptr_t *vfp,
+                            const struct buf_2d *src,
+                            const struct buf_2d *pre) {
+  const struct buf_2d *const what = src;
+  const struct buf_2d *const in_what = pre;
+  const MV mv = get_mv_from_fullmv(best_mv);
+  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
+  unsigned int unused;
+
+  return vfp->msvf(what->buf, what->stride, 0, 0,
+                   get_buf_from_mv(in_what, best_mv), in_what->stride,
+                   second_pred, mask, mask_stride, invert_mask, &unused) +
+         mv_err_cost(&mv, ref_mv, x->nmv_vec_cost,
+                     CONVERT_TO_CONST_MVCOST(x->mv_cost_stack), x->errorperbit,
+                     mv_cost_type);
+}
+
+unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x,
+                                     BLOCK_SIZE bsize, const MV *this_mv) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const uint8_t *const src = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
-  const uint8_t *dst = xd->plane[0].dst.buf;
+  uint8_t *const dst = xd->plane[0].dst.buf;
   const int dst_stride = xd->plane[0].dst.stride;
+  const aom_variance_fn_ptr_t *vfp = &cpi->fn_ptr[bsize];
+  const int_mv ref_mv = av1_get_ref_mv(x, 0);
+  unsigned int mse;
+  unsigned int sse;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const MV_COST_TYPE mv_cost_type = x->mv_cost_type;
 
-  *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
+  mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
+  mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost,
+                     CONVERT_TO_CONST_MVCOST(x->mv_cost_stack), x->errorperbit,
+                     mv_cost_type);
+  return mse;
 }
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index dd88e11..79450bf 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -21,16 +21,6 @@
 extern "C" {
 #endif
 
-// In this file, the following variables always have the same meaning:
-// start_mv: the motion vector where we start the motion search
-// ref_mv: the motion vector with respect to which we calculate the mv_cost
-// best_mv: when it is not const, it is the destination where to store the
-//   best motion vector
-// full_*: a prefix of full indicates that the mv is a FULLPEL_MV
-//
-// When a mv needs to both act as a fullpel_mv and subpel_mv, it is stored as an
-// int_mv, which is a union of int, FULLPEL_MV, and MV
-
 // The maximum number of steps in a step search given the largest
 // allowed initial step
 #define MAX_MVSEARCH_STEPS 11
@@ -63,20 +53,27 @@
 } search_site_config;
 
 typedef struct {
-  MV coord;
+  FULLPEL_MV coord;
   int coord_offset;
 } search_neighbors;
 
-void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
-void av1_init_motion_fpf(search_site_config *cfg, int stride);
-void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+struct AV1_COMP;
+struct SPEED_FEATURES;
 
-void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
+// =============================================================================
+//  Cost functions
+// =============================================================================
+typedef struct {
+  const MV *ref_mv;
+  const int *mvjcost;
+  const int *mvcost[2];
+  int error_per_bit;
+  MV_COST_TYPE mv_cost_type;
+} MV_COST_PARAMS;
 
 int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
                     int *mvcost[2], int weight);
 
-// Utility to compute variance + MV rate cost for a given MV
 int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
                        const MV *ref_mv, const aom_variance_fn_ptr_t *vfp);
 int av1_get_mvpred_var(const MACROBLOCK *x, const FULLPEL_MV *best_mv,
@@ -91,8 +88,21 @@
                             int invert_mask, const aom_variance_fn_ptr_t *vfp,
                             const struct buf_2d *src, const struct buf_2d *pre);
 
-struct AV1_COMP;
-struct SPEED_FEATURES;
+unsigned int av1_compute_motion_cost(const struct AV1_COMP *cpi,
+                                     MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                     const MV *this_mv);
+
+// =============================================================================
+//  Fullpixel Motion Search
+// =============================================================================
+// Sets up configs for fullpixel diamond search
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
+// Sets up configs for firstpass motion search
+void av1_init_motion_fpf(search_site_config *cfg, int stride);
+// Sets up configs for all other types of motion search
+void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
 
 int av1_init_search_range(int size);
 
@@ -100,12 +110,54 @@
                                            MACROBLOCK *x, BLOCK_SIZE bsize,
                                            int mi_row, int mi_col,
                                            const MV *ref_mv);
-
 // Runs sequence of diamond searches in smaller steps for RD.
 int av1_hex_search(MACROBLOCK *x, FULLPEL_MV *start_mv, int search_param,
                    int sad_per_bit, int do_init_search, int *cost_list,
                    const aom_variance_fn_ptr_t *vfp, const MV *ref_mv);
 
+int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
+                             const aom_variance_fn_ptr_t *fn_ptr,
+                             const uint8_t *mask, int mask_stride,
+                             int invert_mask, const MV *ref_mv,
+                             const uint8_t *second_pred,
+                             const struct buf_2d *src,
+                             const struct buf_2d *pre);
+
+int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg,
+                             FULLPEL_MV *start_mv, FULLPEL_MV *best_mv,
+                             int search_param, int sad_per_bit, int *num00,
+                             const aom_variance_fn_ptr_t *fn_ptr,
+                             const MV *ref_mv, uint8_t *second_pred,
+                             uint8_t *mask, int mask_stride, int inv_mask);
+
+int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize, FULLPEL_MV *start_mv,
+                          int step_param, int method, int run_mesh_search,
+                          int error_per_bit, int *cost_list, const MV *ref_mv,
+                          int var_max, int rd, int x_pos, int y_pos, int intra,
+                          const search_site_config *cfg,
+                          int use_intrabc_mesh_pattern);
+
+int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+                               FULLPEL_MV *start_mv, int step_param, int sadpb,
+                               int further_steps, int do_refine,
+                               const aom_variance_fn_ptr_t *fn_ptr,
+                               const MV *ref_mv, FULLPEL_MV *dst_mv,
+                               const search_site_config *cfg);
+
+unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
+                                  MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                  int *pts0, int *pts_inref0,
+                                  int total_samples);
+
+static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits,
+                                         FULLPEL_MV mv) {
+  return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) &&
+         (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
+}
+// =============================================================================
+//  Subpixel Motion Search
+// =============================================================================
 enum {
   EIGHTH_PEL,
   QUARTER_PEL,
@@ -114,14 +166,6 @@
 } UENUM1BYTE(SUBPEL_FORCE_STOP);
 
 typedef struct {
-  const MV *ref_mv;
-  const int *mvjcost;
-  const int *mvcost[2];
-  int error_per_bit;
-  MV_COST_TYPE mv_cost_type;
-} MV_COST_PARAMS;
-
-typedef struct {
   const aom_variance_fn_ptr_t *vfp;
   SUBPEL_SEARCH_TYPE subpel_search_type;
   const uint8_t *second_pred;
@@ -164,61 +208,8 @@
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore;
 extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
 extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
-
-int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
-                             const aom_variance_fn_ptr_t *fn_ptr,
-                             const uint8_t *mask, int mask_stride,
-                             int invert_mask, const MV *ref_mv,
-                             const uint8_t *second_pred,
-                             const struct buf_2d *src,
-                             const struct buf_2d *pre);
-
-int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg,
-                             FULLPEL_MV *start_mv, FULLPEL_MV *best_mv,
-                             int search_param, int sad_per_bit, int *num00,
-                             const aom_variance_fn_ptr_t *fn_ptr,
-                             const MV *ref_mv, uint8_t *second_pred,
-                             uint8_t *mask, int mask_stride, int inv_mask);
-
-int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                          BLOCK_SIZE bsize, FULLPEL_MV *start_mv,
-                          int step_param, int method, int run_mesh_search,
-                          int error_per_bit, int *cost_list, const MV *ref_mv,
-                          int var_max, int rd, int x_pos, int y_pos, int intra,
-                          const search_site_config *cfg,
-                          int use_intrabc_mesh_pattern);
-
-int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
-                               FULLPEL_MV *start_mv, int step_param, int sadpb,
-                               int further_steps, int do_refine,
-                               const aom_variance_fn_ptr_t *fn_ptr,
-                               const MV *ref_mv, FULLPEL_MV *dst_mv,
-                               const search_site_config *cfg);
-
 extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up;
 
-unsigned int av1_compute_motion_cost(const struct AV1_COMP *cpi,
-                                     MACROBLOCK *const x, BLOCK_SIZE bsize,
-                                     const MV *this_mv);
-unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
-                                  MACROBLOCK *const x, BLOCK_SIZE bsize,
-                                  int *pts0, int *pts_inref0,
-                                  int total_samples);
-
-// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
-// ref. Note that this sets the offset of mbmi, so we will need to reset it
-// after calling this function.
-void av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x,
-                              int mi_row, int mi_col, BLOCK_SIZE bsize, int ref,
-                              FULLPEL_MV start_mv, int num_planes,
-                              int use_subpixel);
-
-// Performs a simple motion search to calculate the sse and var of the residue
-void av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
-                               int mi_col, BLOCK_SIZE bsize,
-                               const FULLPEL_MV start_mv, int use_subpixel,
-                               unsigned int *sse, unsigned int *var);
-
 static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) {
   for (int z = 0; z < 3; z++) {
     fractional_best_mv[z].as_int = INVALID_MV;
@@ -244,12 +235,6 @@
   subpel_limits->row_max = AOMMIN(MV_UPP - 1, maxr);
 }
 
-static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits,
-                                         FULLPEL_MV mv) {
-  return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) &&
-         (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
-}
-
 static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits,
                                            MV mv) {
   return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) &&
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index a43d4cd..ba4ce6f 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -9,9 +9,13 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "aom_ports/system_state.h"
+
 #include "av1/common/reconinter.h"
+
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/reconinter_enc.h"
 
 void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -615,3 +619,114 @@
   }
   return tmp_rate_mv;
 }
+
+void av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, int ref,
+                              FULLPEL_MV start_mv, int num_planes,
+                              int use_subpixel) {
+  assert(num_planes == 1 &&
+         "Currently simple_motion_search only supports luma plane");
+  assert(!frame_is_intra_only(&cpi->common) &&
+         "Simple motion search only enabled for non-key frames");
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->sb_type = bsize;
+  mbmi->ref_frame[0] = ref;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref);
+  struct buf_2d backup_yv12;
+  // ref_mv is used to calculate the cost of the motion vector
+  const MV ref_mv = kZeroMv;
+  const int step_param = cpi->mv_step_param;
+  const FullMvLimits tmp_mv_limits = x->mv_limits;
+  const SEARCH_METHODS search_methods = cpi->sf.mv_sf.search_method;
+  const int do_mesh_search = 0;
+  const int sadpb = x->sadperbit16;
+  int cost_list[5];
+  const int ref_idx = 0;
+  int var;
+
+  av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
+                       get_ref_scale_factors(cm, ref), num_planes);
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  if (scaled_ref_frame) {
+    backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                         num_planes);
+  }
+
+  // This overwrites the mv_limits so we will need to restore it later.
+  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+  var = av1_full_pixel_search(
+      cpi, x, bsize, &start_mv, step_param, search_methods, do_mesh_search,
+      sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
+      mi_col * MI_SIZE, mi_row * MI_SIZE, 0, &cpi->ss_cfg[SS_CFG_SRC], 0);
+  // Restore
+  x->mv_limits = tmp_mv_limits;
+
+  const int use_subpel_search =
+      var < INT_MAX && !cpi->common.cur_frame_force_integer_mv && use_subpixel;
+  if (scaled_ref_frame) {
+    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+  }
+  if (use_subpel_search) {
+    int not_used = 0;
+
+    const uint8_t *second_pred = NULL;
+    const uint8_t *mask = NULL;
+    const int mask_stride = 0;
+    const int invert_mask = 0;
+    const int reset_fractional_mv = 1;
+    SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+                                      cost_list, second_pred, mask, mask_stride,
+                                      invert_mask, reset_fractional_mv);
+
+    cpi->find_fractional_mv_step(x, cm, &ms_params, &not_used,
+                                 &x->pred_sse[ref]);
+  } else {
+    // Manually convert from units of pixel to 1/8-pixels if we are not doing
+    // subpel search
+    x->best_mv.as_mv = get_mv_from_fullmv(&x->best_mv.as_fullmv);
+  }
+
+  mbmi->mv[0] = x->best_mv;
+
+  // Get a copy of the prediction output
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
+
+  aom_clear_system_state();
+
+  if (scaled_ref_frame) {
+    xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+  }
+}
+
+void av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                               int mi_col, BLOCK_SIZE bsize,
+                               const FULLPEL_MV start_mv, int use_subpixel,
+                               unsigned int *sse, unsigned int *var) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const MV_REFERENCE_FRAME ref =
+      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+
+  av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, start_mv, 1,
+                           use_subpixel);
+
+  const uint8_t *src = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *dst = xd->plane[0].dst.buf;
+  const int dst_stride = xd->plane[0].dst.stride;
+
+  *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
+}
diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h
index 4bfe06d..960df34 100644
--- a/av1/encoder/motion_search_facade.h
+++ b/av1/encoder/motion_search_facade.h
@@ -43,6 +43,20 @@
                                        const uint8_t *mask, int mask_stride,
                                        int *rate_mv, int ref_idx);
 
+// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
+// ref. Note that this sets the offset of mbmi, so we will need to reset it
+// after calling this function.
+void av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize, int ref,
+                              FULLPEL_MV start_mv, int num_planes,
+                              int use_subpixel);
+
+// Performs a simple motion search to calculate the sse and var of the residue
+void av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                               int mi_col, BLOCK_SIZE bsize,
+                               const FULLPEL_MV start_mv, int use_subpixel,
+                               unsigned int *sse, unsigned int *var);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index 8832944..883208b 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -25,6 +25,7 @@
 #endif
 #include "av1/encoder/encoder.h"
 
+#include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/rdopt.h"