encoder: Remove 64x upsampled reference buffers

The 64x upsampled reference buffers do not handle border extension
correctly (interpolation and border extension do not commute unless
you upsample into the border), nor do they handle crop dimensions
that are not a multiple of 8 (the upsampled version is not
sufficiently large). They also use massive amounts of memory and
are a criminal waste of cache (1 byte used for every 8 bytes
fetched).

This commit reimplements use_upsampled_references by computing the
subpixel samples on the fly. This implementation not only corrects
the border handling, but is also faster, while maintaining the
same quality.

HL AWCY (Are We Compressed Yet) results are basically noise:
    PSNR | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
  0.0188 |   0.0187 | 0.0045 |  0.0063 |     0.0228

Change-Id: I7527db9f83b87a7bb8b35342f7e6457cd0bef9cd
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 8b2036d..fd8d538 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -468,7 +468,6 @@
 
 static void dealloc_compressor_data(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  int i;
 
   aom_free(cpi->mbmi_ext_base);
   cpi->mbmi_ext_base = NULL;
@@ -514,10 +513,6 @@
   cpi->td.mb.mask_buf = NULL;
 #endif
 
-  // Free up-sampled reference buffers.
-  for (i = 0; i < (REF_FRAMES + 1); i++)
-    aom_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
-
   av1_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_LV_MAP
   av1_free_txb_buf(cpi);
@@ -530,8 +525,11 @@
   aom_free_frame_buffer(&cpi->last_frame_db);
   aom_free_frame_buffer(&cpi->trial_frame_rst);
   aom_free(cpi->extra_rstbuf);
-  for (i = 0; i < MAX_MB_PLANE; ++i)
-    av1_free_restoration_struct(&cpi->rst_search[i]);
+  {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; ++i)
+      av1_free_restoration_struct(&cpi->rst_search[i]);
+  }
 #endif  // CONFIG_LOOP_RESTORATION
   aom_free_frame_buffer(&cpi->scaled_source);
   aom_free_frame_buffer(&cpi->scaled_last_source);
@@ -2118,15 +2116,6 @@
 #endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
 }
 
-static INLINE void init_upsampled_ref_frame_bufs(AV1_COMP *cpi) {
-  int i;
-
-  for (i = 0; i < (REF_FRAMES + 1); ++i) {
-    cpi->upsampled_ref_bufs[i].ref_count = 0;
-    cpi->upsampled_ref_idx[i] = INVALID_IDX;
-  }
-}
-
 AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
                                 BufferPool *const pool) {
   unsigned int i;
@@ -2334,8 +2323,6 @@
 
 #endif
 
-  init_upsampled_ref_frame_bufs(cpi);
-
   av1_set_speed_features_framesize_independent(cpi);
   av1_set_speed_features_framesize_dependent(cpi);
 
@@ -3010,52 +2997,6 @@
   return force_recode;
 }
 
-static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) {
-  int i;
-
-  for (i = 0; i < (REF_FRAMES + 1); i++) {
-    if (!ubufs[i].ref_count) {
-      return i;
-    }
-  }
-  return INVALID_IDX;
-}
-
-// Up-sample 1 reference frame.
-static INLINE int upsample_ref_frame(AV1_COMP *cpi,
-                                     const YV12_BUFFER_CONFIG *const ref) {
-  AV1_COMMON *const cm = &cpi->common;
-  EncRefCntBuffer *ubufs = cpi->upsampled_ref_bufs;
-  int new_uidx = get_free_upsampled_ref_buf(ubufs);
-
-  if (new_uidx == INVALID_IDX) {
-    return INVALID_IDX;
-  } else {
-    YV12_BUFFER_CONFIG *upsampled_ref = &ubufs[new_uidx].buf;
-
-    // Can allocate buffer for Y plane only.
-    if (upsampled_ref->buffer_alloc_sz < (ref->buffer_alloc_sz << 6))
-      if (aom_realloc_frame_buffer(upsampled_ref, (cm->width << 3),
-                                   (cm->height << 3), cm->subsampling_x,
-                                   cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                                   cm->use_highbitdepth,
-#endif
-                                   (AOM_BORDER_IN_PIXELS << 3),
-                                   cm->byte_alignment, NULL, NULL, NULL))
-        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                           "Failed to allocate up-sampled frame buffer");
-
-// Currently, only Y plane is up-sampled, U, V are not used.
-#if CONFIG_HIGHBITDEPTH
-    scale_and_extend_frame(ref, upsampled_ref, 1, (int)cm->bit_depth);
-#else
-    scale_and_extend_frame(ref, upsampled_ref, 1);
-#endif
-    return new_uidx;
-  }
-}
-
 #define DUMP_REF_FRAME_IMAGES 0
 
 #if DUMP_REF_FRAME_IMAGES == 1
@@ -3168,31 +3109,11 @@
 void av1_update_reference_frames(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   BufferPool *const pool = cm->buffer_pool;
-  const int use_upsampled_ref = cpi->sf.use_upsampled_references;
-  int new_uidx = 0;
 
   // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
   //       for the purpose to verify no mismatch between encoder and decoder.
   if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
 
-  if (use_upsampled_ref) {
-#if CONFIG_EXT_REFS
-    if (cm->show_existing_frame) {
-      new_uidx = cpi->upsampled_ref_idx[cpi->existing_fb_idx_to_show];
-      // TODO(zoeliu): Once following is confirmed, remove it.
-      assert(cpi->upsampled_ref_bufs[new_uidx].ref_count > 0);
-    } else {
-#endif  // CONFIG_EXT_REFS
-      // Up-sample the current encoded frame.
-      RefCntBuffer *bufs = pool->frame_bufs;
-      const YV12_BUFFER_CONFIG *const ref = &bufs[cm->new_fb_idx].buf;
-
-      new_uidx = upsample_ref_frame(cpi, ref);
-#if CONFIG_EXT_REFS
-      assert(new_uidx != INVALID_IDX);
-    }
-#endif  // CONFIG_EXT_REFS
-  }
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
   if (cm->frame_type == KEY_FRAME) {
@@ -3204,17 +3125,6 @@
 #endif  // CONFIG_EXT_REFS
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
                cm->new_fb_idx);
-
-    if (use_upsampled_ref) {
-      uref_cnt_fb(cpi->upsampled_ref_bufs,
-                  &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
-#if CONFIG_EXT_REFS
-      uref_cnt_fb(cpi->upsampled_ref_bufs,
-                  &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx);
-#endif  // CONFIG_EXT_REFS
-      uref_cnt_fb(cpi->upsampled_ref_bufs,
-                  &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
-    }
   } else if (av1_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term in function
@@ -3228,10 +3138,6 @@
 
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
                cm->new_fb_idx);
-    if (use_upsampled_ref)
-      uref_cnt_fb(cpi->upsampled_ref_bufs,
-                  &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
-
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
@@ -3292,9 +3198,6 @@
       }
 #endif  // CONFIG_EXT_REFS
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
-      if (use_upsampled_ref)
-        uref_cnt_fb(cpi->upsampled_ref_bufs, &cpi->upsampled_ref_idx[arf_idx],
-                    new_uidx);
 
       memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
              cpi->interp_filter_selected[0],
@@ -3304,9 +3207,6 @@
     if (cpi->refresh_golden_frame) {
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
                  cm->new_fb_idx);
-      if (use_upsampled_ref)
-        uref_cnt_fb(cpi->upsampled_ref_bufs,
-                    &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
 
 #if !CONFIG_EXT_REFS
       if (!cpi->rc.is_src_frame_alt_ref)
@@ -3331,9 +3231,6 @@
 
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
                  cm->new_fb_idx);
-      if (use_upsampled_ref)
-        uref_cnt_fb(cpi->upsampled_ref_bufs,
-                    &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx);
 
       memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
              cpi->interp_filter_selected[0],
@@ -3390,11 +3287,6 @@
         ref_cnt_fb(pool->frame_bufs,
                    &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]],
                    cm->new_fb_idx);
-
-        if (use_upsampled_ref)
-          uref_cnt_fb(cpi->upsampled_ref_bufs,
-                      &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[ref_frame]],
-                      new_uidx);
       }
     } else {
       int tmp;
@@ -3403,12 +3295,6 @@
                  &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]],
                  cm->new_fb_idx);
 
-      if (use_upsampled_ref)
-        uref_cnt_fb(
-            cpi->upsampled_ref_bufs,
-            &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]],
-            new_uidx);
-
       tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
 
       shift_last_ref_frames(cpi);
@@ -3424,9 +3310,6 @@
 #else
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
                cm->new_fb_idx);
-    if (use_upsampled_ref)
-      uref_cnt_fb(cpi->upsampled_ref_bufs,
-                  &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
     if (!cpi->rc.is_src_frame_alt_ref) {
       memcpy(cpi->interp_filter_selected[LAST_FRAME],
              cpi->interp_filter_selected[0],
@@ -3531,31 +3414,6 @@
           alloc_frame_mvs(cm, new_fb);
         }
 #endif  // CONFIG_HIGHBITDEPTH
-
-        if (cpi->sf.use_upsampled_references &&
-            (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
-             new_fb_ptr->buf.y_crop_height != cm->height)) {
-          const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
-          EncRefCntBuffer *ubuf =
-              &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[map_idx]];
-
-          if (aom_realloc_frame_buffer(&ubuf->buf, (cm->width << 3),
-                                       (cm->height << 3), cm->subsampling_x,
-                                       cm->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                                       cm->use_highbitdepth,
-#endif
-                                       (AOM_BORDER_IN_PIXELS << 3),
-                                       cm->byte_alignment, NULL, NULL, NULL))
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
-                               "Failed to allocate up-sampled frame buffer");
-#if CONFIG_HIGHBITDEPTH
-          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1,
-                                 (int)cm->bit_depth);
-#else
-          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1);
-#endif
-        }
       } else {
         const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
         RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
@@ -4018,26 +3876,9 @@
   aom_extend_frame_borders(cm->frame_to_show);
 }
 
-static void reset_use_upsampled_references(AV1_COMP *cpi) {
-  MV_REFERENCE_FRAME ref_frame;
-
-  // reset up-sampled reference buffer structure.
-  init_upsampled_ref_frame_bufs(cpi);
-
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, ref_frame);
-    int new_uidx = upsample_ref_frame(cpi, ref);
-
-    // Update the up-sampled reference index.
-    cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)] = new_uidx;
-    cpi->upsampled_ref_bufs[new_uidx].ref_count++;
-  }
-}
-
 static void encode_without_recode_loop(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   int q = 0, bottom_index = 0, top_index = 0;  // Dummy variables.
-  const int use_upsampled_ref = cpi->sf.use_upsampled_references;
 
   aom_clear_system_state();
 
@@ -4048,13 +3889,6 @@
 
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
-  // cpi->sf.use_upsampled_references can be different from frame to frame.
-  // Every time when cpi->sf.use_upsampled_references is changed from 0 to 1.
-  // The reference frames for this frame have to be up-sampled before encoding.
-  if (!use_upsampled_ref && cpi->sf.use_upsampled_references &&
-      cm->frame_type != KEY_FRAME)
-    reset_use_upsampled_references(cpi);
-
   cpi->source =
       av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
   if (cpi->unscaled_last_source != NULL)
@@ -4110,7 +3944,6 @@
   int frame_over_shoot_limit;
   int frame_under_shoot_limit;
   int q = 0, q_low = 0, q_high = 0;
-  const int use_upsampled_ref = cpi->sf.use_upsampled_references;
 
   set_size_independent_vars(cpi);
 
@@ -4122,15 +3955,6 @@
     if (loop_count == 0) {
       set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
-      // cpi->sf.use_upsampled_references can be different from frame to frame.
-      // Every time when cpi->sf.use_upsampled_references is changed from 0 to
-      // 1.
-      // The reference frames for this frame have to be up-sampled before
-      // encoding.
-      if (!use_upsampled_ref && cpi->sf.use_upsampled_references &&
-          cm->frame_type != KEY_FRAME)
-        reset_use_upsampled_references(cpi);
-
       // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
       set_mv_search_params(cpi);
 
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 488df9e..1cc7fb2 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -397,13 +397,6 @@
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
-  // Up-sampled reference buffers
-  // NOTE(zoeliu): It is needed to allocate sufficient space to the up-sampled
-  // reference buffers, which should include the up-sampled version of all the
-  // possibly stored references plus the currently coded frame itself.
-  EncRefCntBuffer upsampled_ref_bufs[REF_FRAMES + 1];
-  int upsampled_ref_idx[REF_FRAMES + 1];
-
   // For a still frame, this flag is set to 1 to skip partition search.
   int partition_search_skippable_frame;
 
@@ -749,14 +742,6 @@
                                 : NULL;
 }
 
-static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(
-    const AV1_COMP *cpi, const MV_REFERENCE_FRAME ref_frame) {
-  // Use up-sampled reference frames.
-  const int buf_idx =
-      cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)];
-  return &cpi->upsampled_ref_bufs[buf_idx].buf;
-}
-
 #if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
 static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) {
   MV_REFERENCE_FRAME ref_frame;
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 79543c4..b49e080 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -228,49 +228,45 @@
 
 #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
 
-static INLINE const uint8_t *upre(const uint8_t *buf, int stride, int r,
-                                  int c) {
-  return &buf[(r)*stride + (c)];
-}
-
 /* checks if (r, c) has better score than previous best */
 #if CONFIG_EXT_INTER
-#define CHECK_BETTER1(v, r, c)                                               \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
-    MV this_mv = { r, c };                                                   \
-    thismse = upsampled_pref_error(                                          \
-        xd, vfp, src_address, src_stride, upre(y, y_stride, r, c), y_stride, \
-        second_pred, mask, mask_stride, invert_mask, w, h, &sse);            \
-    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);       \
-    v += thismse;                                                            \
-    if (v < besterr) {                                                       \
-      besterr = v;                                                           \
-      br = r;                                                                \
-      bc = c;                                                                \
-      *distortion = thismse;                                                 \
-      *sse1 = sse;                                                           \
-    }                                                                        \
-  } else {                                                                   \
-    v = INT_MAX;                                                             \
+#define CHECK_BETTER1(v, r, c)                                              \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
+    MV this_mv = { r, c };                                                  \
+    thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,        \
+                                   pre(y, y_stride, r, c), y_stride, sp(c), \
+                                   sp(r), second_pred, mask, mask_stride,   \
+                                   invert_mask, w, h, &sse);                \
+    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);      \
+    v += thismse;                                                           \
+    if (v < besterr) {                                                      \
+      besterr = v;                                                          \
+      br = r;                                                               \
+      bc = c;                                                               \
+      *distortion = thismse;                                                \
+      *sse1 = sse;                                                          \
+    }                                                                       \
+  } else {                                                                  \
+    v = INT_MAX;                                                            \
   }
 #else
-#define CHECK_BETTER1(v, r, c)                                         \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
-    MV this_mv = { r, c };                                             \
-    thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,   \
-                                   upre(y, y_stride, r, c), y_stride,  \
-                                   second_pred, w, h, &sse);           \
-    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
-    v += thismse;                                                      \
-    if (v < besterr) {                                                 \
-      besterr = v;                                                     \
-      br = r;                                                          \
-      bc = c;                                                          \
-      *distortion = thismse;                                           \
-      *sse1 = sse;                                                     \
-    }                                                                  \
-  } else {                                                             \
-    v = INT_MAX;                                                       \
+#define CHECK_BETTER1(v, r, c)                                              \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
+    MV this_mv = { r, c };                                                  \
+    thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,        \
+                                   pre(y, y_stride, r, c), y_stride, sp(c), \
+                                   sp(r), second_pred, w, h, &sse);         \
+    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);      \
+    v += thismse;                                                           \
+    if (v < besterr) {                                                      \
+      besterr = v;                                                          \
+      br = r;                                                               \
+      bc = c;                                                               \
+      *distortion = thismse;                                                \
+      *sse1 = sse;                                                          \
+    }                                                                       \
+  } else {                                                                  \
+    v = INT_MAX;                                                            \
   }
 #endif  // CONFIG_EXT_INTER
 
@@ -700,16 +696,14 @@
 };
 /* clang-format on */
 
-static int upsampled_pref_error(const MACROBLOCKD *xd,
-                                const aom_variance_fn_ptr_t *vfp,
-                                const uint8_t *const src, const int src_stride,
-                                const uint8_t *const y, int y_stride,
-                                const uint8_t *second_pred,
+static int upsampled_pref_error(
+    const MACROBLOCKD *xd, const aom_variance_fn_ptr_t *vfp,
+    const uint8_t *const src, const int src_stride, const uint8_t *const y,
+    int y_stride, int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred,
 #if CONFIG_EXT_INTER
-                                const uint8_t *mask, int mask_stride,
-                                int invert_mask,
+    const uint8_t *mask, int mask_stride, int invert_mask,
 #endif
-                                int w, int h, unsigned int *sse) {
+    int w, int h, unsigned int *sse) {
   unsigned int besterr;
 #if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -717,15 +711,17 @@
     if (second_pred != NULL) {
 #if CONFIG_EXT_INTER
       if (mask)
-        aom_highbd_comp_mask_upsampled_pred(pred16, second_pred, w, h, y,
-                                            y_stride, mask, mask_stride,
-                                            invert_mask);
+        aom_highbd_comp_mask_upsampled_pred(
+            pred16, second_pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride,
+            mask, mask_stride, invert_mask, xd->cur_buf->bit_depth);
       else
 #endif
-        aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y,
-                                           y_stride);
+        aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h,
+                                           subpel_x_q3, subpel_y_q3, y,
+                                           y_stride, xd->cur_buf->bit_depth);
     } else {
-      aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+      aom_highbd_upsampled_pred(pred16, w, h, subpel_x_q3, subpel_y_q3, y,
+                                y_stride, xd->cur_buf->bit_depth);
     }
 
     besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
@@ -738,13 +734,15 @@
     if (second_pred != NULL) {
 #if CONFIG_EXT_INTER
       if (mask)
-        aom_comp_mask_upsampled_pred(pred, second_pred, w, h, y, y_stride, mask,
+        aom_comp_mask_upsampled_pred(pred, second_pred, w, h, subpel_x_q3,
+                                     subpel_y_q3, y, y_stride, mask,
                                      mask_stride, invert_mask);
       else
 #endif
-        aom_comp_avg_upsampled_pred(pred, second_pred, w, h, y, y_stride);
+        aom_comp_avg_upsampled_pred(pred, second_pred, w, h, subpel_x_q3,
+                                    subpel_y_q3, y, y_stride);
     } else {
-      aom_upsampled_pred(pred, w, h, y, y_stride);
+      aom_upsampled_pred(pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride);
     }
 
     besterr = vfp->vf(pred, w, src, src_stride, sse);
@@ -764,12 +762,12 @@
 #endif
     int w, int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
     int *distortion) {
-  unsigned int besterr = upsampled_pref_error(xd, vfp, src, src_stride,
-                                              y + offset, y_stride, second_pred,
+  unsigned int besterr = upsampled_pref_error(
+      xd, vfp, src, src_stride, y + offset, y_stride, 0, 0, second_pred,
 #if CONFIG_EXT_INTER
-                                              mask, mask_stride, invert_mask,
+      mask, mask_stride, invert_mask,
 #endif
-                                              w, h, sse1);
+      w, h, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
   return besterr;
@@ -824,7 +822,7 @@
 #if CONFIG_EXT_INTER
         mask, mask_stride, invert_mask,
 #endif
-        w, h, (offset * 8), mvjcost, mvcost, sse1, distortion);
+        w, h, offset, mvjcost, mvcost, sse1, distortion);
   else
     besterr =
         setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address,
@@ -845,17 +843,15 @@
         MV this_mv = { tr, tc };
 
         if (use_upsampled_ref) {
-          const uint8_t *const pre_address = y + tr * y_stride + tc;
-
           thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
-                                         pre_address, y_stride, second_pred,
+                                         pre(y, y_stride, tr, tc), y_stride,
+                                         sp(tc), sp(tr), second_pred,
 #if CONFIG_EXT_INTER
                                          mask, mask_stride, invert_mask,
 #endif
                                          w, h, &sse);
         } else {
-          const uint8_t *const pre_address =
-              y + (tr >> 3) * y_stride + (tc >> 3);
+          const uint8_t *const pre_address = pre(y, y_stride, tr, tc);
           if (second_pred == NULL)
             thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
                                src_address, src_stride, &sse);
@@ -894,16 +890,15 @@
       MV this_mv = { tr, tc };
 
       if (use_upsampled_ref) {
-        const uint8_t *const pre_address = y + tr * y_stride + tc;
-
         thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
-                                       pre_address, y_stride, second_pred,
+                                       pre(y, y_stride, tr, tc), y_stride,
+                                       sp(tc), sp(tr), second_pred,
 #if CONFIG_EXT_INTER
                                        mask, mask_stride, invert_mask,
 #endif
                                        w, h, &sse);
       } else {
-        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+        const uint8_t *const pre_address = pre(y, y_stride, tr, tc);
 
         if (second_pred == NULL)
           thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
@@ -2653,19 +2648,20 @@
 #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
 
 #undef CHECK_BETTER1
-#define CHECK_BETTER1(v, r, c)                                            \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                 \
-    thismse = upsampled_obmc_pref_error(                                  \
-        xd, mask, vfp, z, upre(y, y_stride, r, c), y_stride, w, h, &sse); \
-    if ((v = MVC(r, c) + thismse) < besterr) {                            \
-      besterr = v;                                                        \
-      br = r;                                                             \
-      bc = c;                                                             \
-      *distortion = thismse;                                              \
-      *sse1 = sse;                                                        \
-    }                                                                     \
-  } else {                                                                \
-    v = INT_MAX;                                                          \
+#define CHECK_BETTER1(v, r, c)                                              \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
+    thismse =                                                               \
+        upsampled_obmc_pref_error(xd, mask, vfp, z, pre(y, y_stride, r, c), \
+                                  y_stride, sp(c), sp(r), w, h, &sse);      \
+    if ((v = MVC(r, c) + thismse) < besterr) {                              \
+      besterr = v;                                                          \
+      br = r;                                                               \
+      bc = c;                                                               \
+      *distortion = thismse;                                                \
+      *sse1 = sse;                                                          \
+    }                                                                       \
+  } else {                                                                  \
+    v = INT_MAX;                                                            \
   }
 
 static unsigned int setup_obmc_center_error(
@@ -2684,12 +2680,14 @@
                                      const aom_variance_fn_ptr_t *vfp,
                                      const int32_t *const wsrc,
                                      const uint8_t *const y, int y_stride,
-                                     int w, int h, unsigned int *sse) {
+                                     int subpel_x_q3, int subpel_y_q3, int w,
+                                     int h, unsigned int *sse) {
   unsigned int besterr;
 #if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-    aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+    aom_highbd_upsampled_pred(pred16, w, h, subpel_x_q3, subpel_y_q3, y,
+                              y_stride, xd->cur_buf->bit_depth);
 
     besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse);
   } else {
@@ -2698,7 +2696,7 @@
   DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
   (void)xd;
 #endif  // CONFIG_HIGHBITDEPTH
-    aom_upsampled_pred(pred, w, h, y, y_stride);
+    aom_upsampled_pred(pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride);
 
     besterr = vfp->ovf(pred, w, wsrc, mask, sse);
 #if CONFIG_HIGHBITDEPTH
@@ -2714,18 +2712,17 @@
     int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
     int *distortion) {
   unsigned int besterr = upsampled_obmc_pref_error(
-      xd, mask, vfp, wsrc, y + offset, y_stride, w, h, sse1);
+      xd, mask, vfp, wsrc, y + offset, y_stride, 0, 0, w, h, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
   return besterr;
 }
 
 int av1_find_best_obmc_sub_pixel_tree_up(
-    const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, MV *bestmv,
-    const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
-    int is_second, int use_upsampled_ref) {
+    MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+    int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
+    int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion,
+    unsigned int *sse1, int is_second, int use_upsampled_ref) {
   const int32_t *wsrc = x->wsrc_buf;
   const int32_t *mask = x->mask_buf;
   const int *const z = wsrc;
@@ -2756,27 +2753,11 @@
   int y_stride;
   const uint8_t *y;
 
-  const struct buf_2d backup_pred = pd->pre[is_second];
   int minc, maxc, minr, maxr;
 
   av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
                                  ref_mv);
 
-  if (use_upsampled_ref) {
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-    int ref = has_second_ref(&xd->mi[0]->mbmi)
-                  ? xd->mi[0]->mbmi.ref_frame[is_second]
-                  : xd->mi[0]->mbmi.ref_frame[0];
-#else   // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF)
-    int ref = xd->mi[0]->mbmi.ref_frame[is_second];
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-    const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
-    setup_pred_plane(&pd->pre[is_second], mbmi->sb_type,
-                     upsampled_ref->y_buffer, upsampled_ref->y_crop_width,
-                     upsampled_ref->y_crop_height, upsampled_ref->y_stride,
-                     (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
-                     pd->subsampling_y);
-  }
   y = pd->pre[is_second].buf;
   y_stride = pd->pre[is_second].stride;
   offset = bestmv->row * y_stride + bestmv->col;
@@ -2790,7 +2771,7 @@
   if (use_upsampled_ref)
     besterr = upsampled_setup_obmc_center_error(
         xd, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, w, h,
-        (offset * 8), mvjcost, mvcost, sse1, distortion);
+        offset, mvjcost, mvcost, sse1, distortion);
   else
     besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
                                       z, y, y_stride, offset, mvjcost, mvcost,
@@ -2803,15 +2784,13 @@
       tc = bc + search_step[idx].col;
       if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
         MV this_mv = { tr, tc };
+        const uint8_t *const pre_address = pre(y, y_stride, tr, tc);
 
         if (use_upsampled_ref) {
-          const uint8_t *const pre_address = y + tr * y_stride + tc;
-
-          thismse = upsampled_obmc_pref_error(
-              xd, mask, vfp, src_address, pre_address, y_stride, w, h, &sse);
+          thismse =
+              upsampled_obmc_pref_error(xd, mask, vfp, src_address, pre_address,
+                                        y_stride, sp(tc), sp(tr), w, h, &sse);
         } else {
-          const uint8_t *const pre_address =
-              y + (tr >> 3) * y_stride + (tc >> 3);
           thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr),
                               src_address, mask, &sse);
         }
@@ -2839,15 +2818,12 @@
       MV this_mv = { tr, tc };
 
       if (use_upsampled_ref) {
-        const uint8_t *const pre_address = y + tr * y_stride + tc;
-
         thismse = upsampled_obmc_pref_error(xd, mask, vfp, src_address,
-                                            pre_address, y_stride, w, h, &sse);
+                                            pre(y, y_stride, tr, tc), y_stride,
+                                            sp(tc), sp(tr), w, h, &sse);
       } else {
-        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
-
-        thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr), src_address,
-                            mask, &sse);
+        thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr),
+                            src_address, mask, &sse);
       }
 
       cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
@@ -2895,10 +2871,6 @@
   bestmv->row = br;
   bestmv->col = bc;
 
-  if (use_upsampled_ref) {
-    pd->pre[is_second] = backup_pred;
-  }
-
   return besterr;
 }
 
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 7e8b4b2..e013500 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -143,11 +143,10 @@
                                 const aom_variance_fn_ptr_t *fn_ptr,
                                 const MV *ref_mv, MV *dst_mv, int is_second);
 int av1_find_best_obmc_sub_pixel_tree_up(
-    const struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
-    MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
-    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
-    int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
-    int is_second, int use_upsampled_ref);
+    MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+    int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
+    int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion,
+    unsigned int *sse1, int is_second, int use_upsampled_ref);
 #endif  // CONFIG_MOTION_VAR
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index a1deed6..6822b0d 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -5293,6 +5293,8 @@
   if (!has_second_ref(mbmi)) is_global[1] = is_global[0];
 #endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
 #endif  // CONFIG_GLOBAL_MOTION
+#else   // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+  (void)block;
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
 
   // Do joint motion search in compound mode to get more accurate mv.
@@ -5491,52 +5493,15 @@
     if (bestsme < INT_MAX) {
       int dis; /* TODO: use dis in distortion calculation later. */
       unsigned int sse;
-      if (cpi->sf.use_upsampled_references) {
-        // Use up-sampled reference frames.
-        struct buf_2d backup_pred = pd->pre[0];
-        const YV12_BUFFER_CONFIG *upsampled_ref =
-            get_upsampled_ref(cpi, refs[id]);
-
-        // Set pred for Y plane
-        setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer,
-                         upsampled_ref->y_crop_width,
-                         upsampled_ref->y_crop_height, upsampled_ref->y_stride,
-                         (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
-                         pd->subsampling_y);
-
-// If bsize < BLOCK_8X8, adjust pred pointer for this block
-#if !CONFIG_CB4X4
-        if (bsize < BLOCK_8X8)
-          pd->pre[0].buf =
-              &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block,
-                                                       pd->pre[0].stride))
-                              << 3];
-#endif  // !CONFIG_CB4X4
-
-        bestsme = cpi->find_fractional_mv_step(
-            x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
-            x->errorperbit, &cpi->fn_ptr[bsize], 0,
-            cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
-            &dis, &sse, second_pred,
+      bestsme = cpi->find_fractional_mv_step(
+          x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
+          x->errorperbit, &cpi->fn_ptr[bsize], 0,
+          cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
+          &dis, &sse, second_pred,
 #if CONFIG_EXT_INTER
-            mask, mask_stride, id,
+          mask, mask_stride, id,
 #endif
-            pw, ph, 1);
-
-        // Restore the reference frames.
-        pd->pre[0] = backup_pred;
-      } else {
-        (void)block;
-        bestsme = cpi->find_fractional_mv_step(
-            x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
-            x->errorperbit, &cpi->fn_ptr[bsize], 0,
-            cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
-            &dis, &sse, second_pred,
-#if CONFIG_EXT_INTER
-            mask, mask_stride, id,
-#endif
-            pw, ph, 0);
-      }
+          pw, ph, cpi->sf.use_upsampled_references);
     }
 
     // Restore the pointer to the first (possibly scaled) prediction buffer.
@@ -6113,17 +6078,6 @@
                                  x->second_best_mv.as_int != x->best_mv.as_int;
           const int pw = block_size_wide[bsize];
           const int ph = block_size_high[bsize];
-          // Use up-sampled reference frames.
-          struct macroblockd_plane *const pd = &xd->plane[0];
-          struct buf_2d backup_pred = pd->pre[ref_idx];
-          const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
-
-          // Set pred for Y plane
-          setup_pred_plane(
-              &pd->pre[ref_idx], bsize, upsampled_ref->y_buffer,
-              upsampled_ref->y_crop_width, upsampled_ref->y_crop_height,
-              upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL,
-              pd->subsampling_x, pd->subsampling_y);
 
           best_mv_var = cpi->find_fractional_mv_step(
               x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
@@ -6166,9 +6120,6 @@
               x->best_mv.as_mv = best_mv;
             }
           }
-
-          // Restore the reference frames.
-          pd->pre[ref_idx] = backup_pred;
         } else {
           cpi->find_fractional_mv_step(
               x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
@@ -6184,11 +6135,10 @@
         break;
       case OBMC_CAUSAL:
         av1_find_best_obmc_sub_pixel_tree_up(
-            cpi, x, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv,
-            cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
-            cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
-            x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0,
-            cpi->sf.use_upsampled_references);
+            x, &x->best_mv.as_mv, &ref_mv, cm->allow_high_precision_mv,
+            x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+            cpi->sf.mv.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis,
+            &x->pred_sse[ref], 0, cpi->sf.use_upsampled_references);
         break;
       default: assert("Invalid motion mode!\n");
     }
@@ -6332,10 +6282,12 @@
 
 // Search for the best mv for one component of a compound,
 // given that the other component is fixed.
-static void compound_single_motion_search(
-    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *this_mv,
-    int mi_row, int mi_col, const uint8_t *second_pred, const uint8_t *mask,
-    int mask_stride, int *rate_mv, const int block, int ref_idx) {
+static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                                          BLOCK_SIZE bsize, MV *this_mv,
+                                          int mi_row, int mi_col,
+                                          const uint8_t *second_pred,
+                                          const uint8_t *mask, int mask_stride,
+                                          int *rate_mv, int ref_idx) {
   const int pw = block_size_wide[bsize];
   const int ph = block_size_high[bsize];
   MACROBLOCKD *xd = &x->e_mbd;
@@ -6423,43 +6375,11 @@
   if (bestsme < INT_MAX) {
     int dis; /* TODO: use dis in distortion calculation later. */
     unsigned int sse;
-    if (cpi->sf.use_upsampled_references) {
-      // Use up-sampled reference frames.
-      struct buf_2d backup_pred = pd->pre[0];
-      const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
-
-      // Set pred for Y plane
-      setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer,
-                       upsampled_ref->y_crop_width,
-                       upsampled_ref->y_crop_height, upsampled_ref->y_stride,
-                       (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
-                       pd->subsampling_y);
-
-// If bsize < BLOCK_8X8, adjust pred pointer for this block
-#if !CONFIG_CB4X4
-      if (bsize < BLOCK_8X8)
-        pd->pre[0].buf =
-            &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block,
-                                                     pd->pre[0].stride))
-                            << 3];
-#endif  // !CONFIG_CB4X4
-
-      bestsme = cpi->find_fractional_mv_step(
-          x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
-          &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
-          x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask,
-          mask_stride, ref_idx, pw, ph, 1);
-
-      // Restore the reference frames.
-      pd->pre[0] = backup_pred;
-    } else {
-      (void)block;
-      bestsme = cpi->find_fractional_mv_step(
-          x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
-          &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
-          x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask,
-          mask_stride, ref_idx, pw, ph, 0);
-    }
+    bestsme = cpi->find_fractional_mv_step(
+        x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+        &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
+        x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, mask_stride,
+        ref_idx, pw, ph, cpi->sf.use_upsampled_references);
   }
 
   // Restore the pointer to the first (possibly scaled) prediction buffer.
@@ -6539,7 +6459,7 @@
                           ref_idx, second_pred);
 
   compound_single_motion_search(cpi, x, bsize, this_mv, mi_row, mi_col,
-                                second_pred, mask, mask_stride, rate_mv, block,
+                                second_pred, mask, mask_stride, rate_mv,
                                 ref_idx);
 }
 
@@ -8471,7 +8391,7 @@
           tmp_mv.as_int = x->mbmi_ext->ref_mvs[refs[0]][0].as_int;
           compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
                                         mi_col, intrapred, mask, bw,
-                                        &tmp_rate_mv, 0, 0);
+                                        &tmp_rate_mv, 0);
           mbmi->mv[0].as_int = tmp_mv.as_int;
           av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst,
                                          bsize);