Do sub-pixel motion search in up-sampled reference frames. Up-sample the reference frames by a factor of 8 in each dimension using the 8-tap interpolation filter. During sub-pixel motion search, the up-sampled reference frames are used to find the best matching blocks. This substantially improves motion search precision and, in turn, compression quality. There is no change on the decoder side. Borg test and speed test results: 1. On the derflr set, overall PSNR gain: 1.306%, and SSIM gain: 1.512%. Average speed loss on the derf set was 6.0%. 2. On the stdhd set, overall PSNR gain: 0.754%, and SSIM gain: 0.814%. On the hevchd set, overall PSNR gain: 0.465%, and SSIM gain: 0.527%. Speed loss on HD clips was 3.5%. Change-Id: I300ebaafff57e88914f3dedc8784cb21d316b04f

diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index f6ae6c0..fc9e2e9 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c

@@ -410,6 +410,15 @@
   vpx_free(cpi->active_map.map);
   cpi->active_map.map = NULL;
 
+#if CONFIG_AFFINE_MOTION
+  {
+    // Free up-sampled reference buffers.
+    int i;
+    for (i = 0; i < MAX_REF_FRAMES; i++)
+      vpx_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
+  }
+#endif
+
   vp10_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
   vp10_free_postproc_buffers(cm);
@@ -744,6 +753,26 @@
                                NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
+
+#if CONFIG_AFFINE_MOTION
+  {
+    // Allocate up-sampled reference buffers.
+    int i;
+
+    for (i = 0; i < MAX_REF_FRAMES; i++)
+      if (vpx_realloc_frame_buffer(&cpi->upsampled_ref_bufs[i].buf,
+                                   (cm->width << 3), (cm->height << 3),
+                                   cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                   cm->use_highbitdepth,
+#endif
+                                   (VP9_ENC_BORDER_IN_PIXELS << 3),
+                                   cm->byte_alignment,
+                                   NULL, NULL, NULL))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+            "Failed to allocate up-sampled reference frame buffer");
+  }
+#endif
 }
 
 
@@ -2353,10 +2382,11 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                   YV12_BUFFER_CONFIG *dst, int bd) {
+                                   YV12_BUFFER_CONFIG *dst, int planes,
+                                   int bd) {
 #else
 static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                   YV12_BUFFER_CONFIG *dst) {
+                                   YV12_BUFFER_CONFIG *dst, int planes) {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   const int src_w = src->y_crop_width;
   const int src_h = src->y_crop_height;
@@ -2374,7 +2404,7 @@
 
   for (y = 0; y < dst_h; y += 16) {
     for (x = 0; x < dst_w; x += 16) {
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
+      for (i = 0; i < planes; ++i) {
         const int factor = (i == 0 || i == 3 ? 1 : 2);
         const int x_q4 = x * (16 / factor) * src_w / dst_w;
         const int y_q4 = y * (16 / factor) * src_h / dst_h;
@@ -2391,13 +2421,13 @@
                                &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
                                16 / factor, 16 / factor, bd);
         } else {
-          vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
                         &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
                         &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
                         16 / factor, 16 / factor);
         }
 #else
-        vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+        vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
                       &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
                       &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
                       16 / factor, 16 / factor);
@@ -2406,7 +2436,10 @@
     }
   }
 
-  vpx_extend_frame_borders(dst);
+  if (planes == 1)
+    vpx_extend_frame_borders_y(dst);
+  else
+    vpx_extend_frame_borders(dst);
 }
 
 static int scale_down(VP10_COMP *cpi, int q) {
@@ -2462,6 +2495,45 @@
   return force_recode;
 }
 
+#if CONFIG_AFFINE_MOTION
+// Returns the index of the first ubufs entry (of MAX_REF_FRAMES) whose
+// ref_count is zero, or INVALID_IDX when every entry is still referenced.
+static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) {
+  int idx = 0;
+
+  while (idx < MAX_REF_FRAMES && ubufs[idx].ref_count != 0) ++idx;
+
+  if (idx == MAX_REF_FRAMES) return INVALID_IDX;
+  return idx;
+}
+
+// Up-samples the Y plane of bufs[new_idx].buf into the first unreferenced
+// entry of ubufs and returns that entry's index. When no entry is free,
+// nothing is scaled and INVALID_IDX is returned. U and V are not used.
+static INLINE int upsample_ref_frame(RefCntBuffer *bufs,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                     EncRefCntBuffer *ubufs, int new_idx,
+                                     int bit_depth) {
+#else
+                                     EncRefCntBuffer *ubufs, int new_idx) {
+#endif
+  const int free_uidx = get_free_upsampled_ref_buf(ubufs);
+
+  if (free_uidx != INVALID_IDX) {
+    const YV12_BUFFER_CONFIG *const src_frame = &bufs[new_idx].buf;
+    YV12_BUFFER_CONFIG *const dst_frame = &ubufs[free_uidx].buf;
+
+    // planes == 1: scale and border-extend the first (Y) plane only.
+#if CONFIG_VP9_HIGHBITDEPTH
+    scale_and_extend_frame(src_frame, dst_frame, 1, bit_depth);
+#else
+    scale_and_extend_frame(src_frame, dst_frame, 1);
+#endif
+  }
+  return free_uidx;
+}
+#endif
+
 void vp10_update_reference_frames(VP10_COMP *cpi) {
   VP10_COMMON * const cm = &cpi->common;
   BufferPool *const pool = cm->buffer_pool;
@@ -2469,6 +2541,17 @@
   int ref_frame;
 #endif  // CONFIG_EXT_REFS
 
+#if CONFIG_AFFINE_MOTION
+  // Always up-sample the current encoded frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+  int new_uidx = upsample_ref_frame(pool->frame_bufs, cpi->upsampled_ref_bufs,
+                                    cm->new_fb_idx, (int)cm->bit_depth);
+#else
+  int new_uidx = upsample_ref_frame(pool->frame_bufs, cpi->upsampled_ref_bufs,
+                                    cm->new_fb_idx);
+#endif
+#endif
+
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
   if (cm->frame_type == KEY_FRAME) {
@@ -2476,6 +2559,13 @@
                &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+
+#if CONFIG_AFFINE_MOTION
+    uref_cnt_fb(cpi->upsampled_ref_bufs,
+                &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+    uref_cnt_fb(cpi->upsampled_ref_bufs,
+                &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+#endif
   } else if (vp10_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term in function
@@ -2489,7 +2579,10 @@
 
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
-
+#if CONFIG_AFFINE_MOTION
+    uref_cnt_fb(cpi->upsampled_ref_bufs,
+                &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+#endif
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
@@ -2503,6 +2596,10 @@
 
       ref_cnt_fb(pool->frame_bufs,
                  &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+#if CONFIG_AFFINE_MOTION  // Mirror the ref_frame_map[arf_idx] update above.
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[arf_idx], new_uidx);
+#endif
       memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
@@ -2511,6 +2608,10 @@
     if (cpi->refresh_golden_frame) {
       ref_cnt_fb(pool->frame_bufs,
                  &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+#if CONFIG_AFFINE_MOTION
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+#endif
       if (!cpi->rc.is_src_frame_alt_ref)
         memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
                cpi->interp_filter_selected[0],
@@ -2545,6 +2646,10 @@
   if (cpi->refresh_last_frame) {
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
+#if CONFIG_AFFINE_MOTION
+    uref_cnt_fb(cpi->upsampled_ref_bufs,
+                &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
+#endif
     if (!cpi->rc.is_src_frame_alt_ref) {
       memcpy(cpi->interp_filter_selected[LAST_FRAME],
              cpi->interp_filter_selected[0],
@@ -2678,7 +2783,8 @@
                                        cm->byte_alignment, NULL, NULL, NULL))
             vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
-          scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE,
+                                 (int)cm->bit_depth);
           cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
         }
@@ -2703,11 +2809,39 @@
                                        cm->byte_alignment, NULL, NULL, NULL))
             vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
-          scale_and_extend_frame(ref, &new_fb_ptr->buf);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE);
           cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_AFFINE_MOTION
+        {
+          const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+          EncRefCntBuffer *ubuf =
+              &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[map_idx]];
+
+          if (vpx_realloc_frame_buffer(&ubuf->buf,
+                                       (cm->width << 3), (cm->height << 3),
+                                       cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                       cm->use_highbitdepth,
+#endif
+                                       (VP9_ENC_BORDER_IN_PIXELS << 3),
+                                       cm->byte_alignment,
+                                       NULL, NULL, NULL))
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate up-sampled frame buffer");
+#if CONFIG_VP9_HIGHBITDEPTH
+          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, MAX_MB_PLANE,
+                                 (int)cm->bit_depth);
+#else
+          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, MAX_MB_PLANE);
+#endif
+          cpi->scaled_ref_idx[ref_frame - LAST_FRAME] = new_fb;
+          alloc_frame_mvs(cm, new_fb);
+        }
+#endif
       } else {
         const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
         RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
@@ -3787,6 +3921,17 @@
   }
 }
 
+#if CONFIG_AFFINE_MOTION
+// Resets all up-sampled reference slots: no references, no mapped index.
+static INLINE void init_upsampled_ref_frame_bufs(VP10_COMP *cpi) {
+  int slot;
+  for (slot = MAX_REF_FRAMES - 1; slot >= 0; --slot) {
+    cpi->upsampled_ref_idx[slot] = INVALID_IDX;
+    cpi->upsampled_ref_bufs[slot].ref_count = 0;
+  }
+}
+#endif
+
 static void check_initial_width(VP10_COMP *cpi,
 #if CONFIG_VP9_HIGHBITDEPTH
                                 int use_highbitdepth,
@@ -3809,7 +3954,9 @@
     alloc_raw_frame_buffers(cpi);
     init_ref_frame_bufs(cm);
     alloc_util_frame_buffers(cpi);
-
+#if CONFIG_AFFINE_MOTION
+    init_upsampled_ref_frame_bufs(cpi);
+#endif
     init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
 
     cpi->initial_width = cm->width;

diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 292494c..2c158a4 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h

@@ -286,6 +286,13 @@
   double worst;
 } ImageStat;
 
+#if CONFIG_AFFINE_MOTION
+typedef struct {  // Ref-counted buffer holding one up-sampled reference.
+  int ref_count;  // Adjusted by uref_cnt_fb(); 0 means free for reuse.
+  YV12_BUFFER_CONFIG buf;  // Frame storage for the up-sampled picture.
+} EncRefCntBuffer;
+#endif
+
 typedef struct VP10_COMP {
   QUANTS quants;
   ThreadData td;
@@ -304,6 +311,12 @@
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
+#if CONFIG_AFFINE_MOTION
+  // Up-sampled reference buffers
+  EncRefCntBuffer upsampled_ref_bufs[MAX_REF_FRAMES];
+  int upsampled_ref_idx[MAX_REF_FRAMES];
+#endif
+
   TileDataEnc *tile_data;
   int allocated_tiles;  // Keep track of memory allocated for tiles.
 
@@ -692,4 +705,18 @@
 }  // extern "C"
 #endif
 
+#if CONFIG_AFFINE_MOTION
+// Points *uidx at ubufs[new_uidx] and drops one reference from the buffer
+// *uidx named before (if that index was valid and still referenced). A
+// negative new_uidx (no free buffer found) is stored without a ref_count bump.
+static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx,
+                               int new_uidx) {
+  const int ref_index = *uidx;
+  if (ref_index >= 0 && ubufs[ref_index].ref_count > 0)
+    ubufs[ref_index].ref_count--;
+  *uidx = new_uidx;
+  if (new_uidx >= 0) ubufs[new_uidx].ref_count++;  // never index ubufs[-1]
+}
+#endif
+
 #endif  // VP10_ENCODER_ENCODER_H_

diff --git a/vp10/encoder/mbgraph.c b/vp10/encoder/mbgraph.c
index 2d3a33e..1f467b8 100644
--- a/vp10/encoder/mbgraph.c
+++ b/vp10/encoder/mbgraph.c

@@ -64,7 +64,11 @@
         &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
         cond_cost_list(cpi, cost_list),
         NULL, NULL,
+#if CONFIG_AFFINE_MOTION
+        &distortion, &sse, NULL, 0, 0, 0);
+#else
         &distortion, &sse, NULL, 0, 0);
+#endif
   }
 
 #if CONFIG_EXT_INTER

diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 6e3b06a..8949f76 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c

@@ -208,6 +208,32 @@
     v = INT_MAX;                                                       \
   }
 
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+#if CONFIG_AFFINE_MOTION
+static INLINE const uint8_t *upre(const uint8_t *buf, int stride,
+                                  int r, int c) {
+  return buf + (r * stride + c);  // address of sample (r, c) within buf
+}
+
+/* Scores (r, c) with upsampled_pref_error(); keeps it if it beats besterr. */
+#define CHECK_BETTER1(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = upsampled_pref_error(xd, vfp, z, src_stride,             \
+                                   upre(y, y_stride, r, c), y_stride,  \
+                                   second_pred, w, h, &sse);           \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+#endif
+
 #define FIRST_LEVEL_CHECKS                              \
   {                                                     \
     unsigned int left, right, up, down, diag;           \
@@ -276,7 +302,7 @@
 // TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of
 // SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
 // later in the same way.
-#define SECOND_LEVEL_CHECKS_BEST                        \
+#define SECOND_LEVEL_CHECKS_BEST(k)                     \
   {                                                     \
     unsigned int second;                                \
     int br0 = br;                                       \
@@ -287,10 +313,10 @@
     } else if (tr != br && tc == bc) {                  \
       kr = br - tr;                                     \
     }                                                   \
-    CHECK_BETTER(second, br0 + kr, bc0);                \
-    CHECK_BETTER(second, br0, bc0 + kc);                \
+    CHECK_BETTER##k(second, br0 + kr, bc0);             \
+    CHECK_BETTER##k(second, br0, bc0 + kc);             \
     if (br0 != br || bc0 != bc) {                       \
-      CHECK_BETTER(second, br0 + kr, bc0 + kc);         \
+      CHECK_BETTER##k(second, br0 + kr, bc0 + kc);      \
     }                                                   \
   }
 
@@ -412,7 +438,11 @@
     int *distortion,
     unsigned int *sse1,
     const uint8_t *second_pred,
+#if CONFIG_AFFINE_MOTION
+    int w, int h, int use_upsampled_ref) {
+#else
     int w, int h) {
+#endif
   SETUP_SUBPEL_SEARCH;
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                z, src_stride, y, y_stride, second_pred,
@@ -425,6 +455,9 @@
   (void) allow_hp;
   (void) forced_stop;
   (void) hstep;
+#if CONFIG_AFFINE_MOTION
+  (void) use_upsampled_ref;
+#endif
 
   if (cost_list &&
       cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
@@ -491,8 +524,17 @@
                                              int *distortion,
                                              unsigned int *sse1,
                                              const uint8_t *second_pred,
+#if CONFIG_AFFINE_MOTION
+                                             int w, int h,
+                                             int use_upsampled_ref) {
+#else
                                              int w, int h) {
+#endif
   SETUP_SUBPEL_SEARCH;
+#if CONFIG_AFFINE_MOTION
+  (void) use_upsampled_ref;
+#endif
+
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                z, src_stride, y, y_stride, second_pred,
                                w, h, offset, mvjcost, mvcost,
@@ -565,8 +607,16 @@
                                         int *distortion,
                                         unsigned int *sse1,
                                         const uint8_t *second_pred,
+#if CONFIG_AFFINE_MOTION
+                                        int w, int h, int use_upsampled_ref) {
+#else
                                         int w, int h) {
+#endif
   SETUP_SUBPEL_SEARCH;
+#if CONFIG_AFFINE_MOTION
+  (void) use_upsampled_ref;
+#endif
+
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                z, src_stride, y, y_stride, second_pred,
                                w, h, offset, mvjcost, mvcost,
@@ -655,6 +705,101 @@
     {0, -1}, {0, 1}, {-1, 0}, {1, 0}
 };
 
+
+#if CONFIG_AFFINE_MOTION
+#if CONFIG_VP9_HIGHBITDEPTH
+// Writes to comp_pred (width x height, packed) the rounded average of pred8
+// and every 8th sample of ref8; ref8 rows are ref_stride << 3 samples apart.
+static void highbd_comp_avg_upsampled_pred(uint16_t *comp_pred,
+                                           const uint8_t *pred8,
+                                           int width, int height,
+                                           const uint8_t *ref8, int ref_stride) {
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
+  int row, col;
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      const int sum = pred_row[col] + ref_row[col << 3];
+      comp_pred[col] = ROUND_POWER_OF_TWO(sum, 1);
+    }
+    comp_pred += width;
+    pred_row += width;
+    ref_row += ref_stride << 3;
+  }
+}
+
+// Fills comp_pred (width x height, packed) with every 8th sample of ref8,
+// advancing ref_stride << 3 samples between rows.
+static void highbd_upsampled_pred(uint16_t *comp_pred,
+                                  int width, int height,
+                                  const uint8_t *ref8,
+                                  int ref_stride) {
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
+  int rows_left, col;
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    for (col = 0; col < width; ++col)
+      comp_pred[col] = ref_row[col << 3];
+    comp_pred += width;
+    ref_row += ref_stride << 3;
+  }
+}
+#endif
+
+// Measures how well the up-sampled reference at y predicts the source block.
+// A w x h prediction is built in a local 64 * 64 buffer from y (averaged with
+// second_pred when it is non-NULL), then compared against src with vfp->vf();
+// sse is forwarded to vf and vf's result is returned. With
+// CONFIG_VP9_HIGHBITDEPTH, frames flagged YV12_FLAG_HIGHBITDEPTH take the
+// 16-bit path instead.
+// NOTE(review): w * h is assumed to be <= 64 * 64 -- confirm at call sites.
+static int upsampled_pref_error(const MACROBLOCKD *xd,
+                                const vp9_variance_fn_ptr_t *vfp,
+                                const uint8_t *const src, const int src_stride,
+                                const uint8_t *const y, int y_stride,
+                                const uint8_t *second_pred,
+                                int w, int h, unsigned int *sse) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]);
+
+    if (second_pred == NULL)
+      highbd_upsampled_pred(pred16, w, h, y, y_stride);
+    else
+      highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y, y_stride);
+    return vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
+  }
+#else
+  (void) xd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  {
+    DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+
+    if (second_pred == NULL)
+      vpx_upsampled_pred(pred, w, h, y, y_stride);
+    else
+      vpx_comp_avg_upsampled_pred(pred, second_pred, w, h, y, y_stride);
+    return vfp->vf(pred, w, src, src_stride, sse);
+  }
+}
+
+// Centre cost on the up-sampled ref: error (-> *distortion) + mv_err_cost().
+static unsigned int upsampled_setup_center_error(
+    const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
+    int error_per_bit, const vp9_variance_fn_ptr_t *vfp,
+    const uint8_t *const src, const int src_stride,
+    const uint8_t *const y, int y_stride, const uint8_t *second_pred,
+    int w, int h, int offset, int *mvjcost, int *mvcost[2],
+    unsigned int *sse1, int *distortion) {
+  const unsigned int center_err = upsampled_pref_error(
+      xd, vfp, src, src_stride, y + offset, y_stride, second_pred, w, h, sse1);
+  *distortion = center_err;
+  return center_err +
+         mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+}
+#endif
+
 int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  MV *bestmv, const MV *ref_mv,
                                  int allow_hp,
@@ -667,14 +812,18 @@
                                  int *distortion,
                                  unsigned int *sse1,
                                  const uint8_t *second_pred,
+#if CONFIG_AFFINE_MOTION
+                                 int w, int h, int use_upsampled_ref) {
+#else
                                  int w, int h) {
+#endif
   const uint8_t *const z = x->plane[0].src.buf;
   const uint8_t *const src_address = z;
   const int src_stride = x->plane[0].src.stride;
   const MACROBLOCKD *xd = &x->e_mbd;
   unsigned int besterr = INT_MAX;
   unsigned int sse;
-  int thismse;
+  unsigned int thismse;
   const int y_stride = xd->plane[0].pre[0].stride;
   const int offset = bestmv->row * y_stride + bestmv->col;
   const uint8_t *const y = xd->plane[0].pre[0].buf;
@@ -703,10 +852,19 @@
   bestmv->row *= 8;
   bestmv->col *= 8;
 
-  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
-                               z, src_stride, y, y_stride, second_pred,
-                               w, h, offset, mvjcost, mvcost,
-                               sse1, distortion);
+#if CONFIG_AFFINE_MOTION
+  // use_upsampled_ref can be 0 or 1
+  if (use_upsampled_ref)
+    besterr = upsampled_setup_center_error(xd, bestmv, ref_mv, error_per_bit,
+                                           vfp, z, src_stride, y, y_stride,
+                                           second_pred, w, h, (offset << 3),
+                                           mvjcost, mvcost, sse1, distortion);
+  else
+#endif
+    besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                                 z, src_stride, y, y_stride, second_pred,
+                                 w, h, offset, mvjcost, mvcost,
+                                 sse1, distortion);
 
   (void) cost_list;  // to silence compiler warning
 
@@ -716,16 +874,29 @@
       tr = br + search_step[idx].row;
       tc = bc + search_step[idx].col;
       if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
-        MV this_mv;
-        this_mv.row = tr;
-        this_mv.col = tc;
-        if (second_pred == NULL)
-          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
-                             src_address, src_stride, &sse);
-        else
-          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
-                              src_address, src_stride, &sse, second_pred);
+        MV this_mv = {tr, tc};
+
+#if CONFIG_AFFINE_MOTION
+        if (use_upsampled_ref) {
+          const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+          thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
+                                         pre_address, y_stride, second_pred,
+                                         w, h, &sse);
+        } else {
+#endif
+          const uint8_t *const pre_address = y + (tr >> 3) * y_stride +
+              (tc >> 3);
+          if (second_pred == NULL)
+            thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                               src_address, src_stride, &sse);
+          else
+            thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                                src_address, src_stride, &sse, second_pred);
+#if CONFIG_AFFINE_MOTION
+        }
+#endif
+
         cost_array[idx] = thismse +
             mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
 
@@ -747,14 +918,29 @@
     tc = bc + kc;
     tr = br + kr;
     if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-      const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
       MV this_mv = {tr, tc};
-      if (second_pred == NULL)
-        thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
-                           src_address, src_stride, &sse);
-      else
-        thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
-                            src_address, src_stride, &sse, second_pred);
+
+#if CONFIG_AFFINE_MOTION
+      if (use_upsampled_ref) {
+        const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+        thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
+                                       pre_address, y_stride, second_pred,
+                                       w, h, &sse);
+      } else {
+#endif
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+        if (second_pred == NULL)
+          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                             src_address, src_stride, &sse);
+        else
+          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride, &sse, second_pred);
+#if CONFIG_AFFINE_MOTION
+      }
+#endif
+
       cost_array[4] = thismse +
           mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
 
@@ -776,8 +962,17 @@
       bc = tc;
     }
 
-    if (iters_per_step > 1 && best_idx != -1)
-      SECOND_LEVEL_CHECKS_BEST;
+    if (iters_per_step > 1 && best_idx != -1) {
+#if CONFIG_AFFINE_MOTION
+      if (use_upsampled_ref) {
+        SECOND_LEVEL_CHECKS_BEST(1);
+      } else {
+#endif
+        SECOND_LEVEL_CHECKS_BEST(0);
+#if CONFIG_AFFINE_MOTION
+      }
+#endif
+    }
 
     tr = br;
     tc = bc;

diff --git a/vp10/encoder/mcomp.h b/vp10/encoder/mcomp.h
index 9d1ab2a..3063b99 100644
--- a/vp10/encoder/mcomp.h
+++ b/vp10/encoder/mcomp.h

@@ -116,7 +116,11 @@
     int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1,
     const uint8_t *second_pred,
+#if CONFIG_AFFINE_MOTION
+    int w, int h, int use_upsampled_ref);
+#else
     int w, int h);
+#endif
 
 extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned;

diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 03aa9f0..5c74d32 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c

@@ -3929,7 +3929,8 @@
                                 int_mv* ref_mv_sub8x8[2],
 #endif
                                 int_mv single_newmv[MAX_REF_FRAMES],
-                                int *rate_mv) {
+                                int *rate_mv,
+                                const int block) {
   const VP10_COMMON *const cm = &cpi->common;
   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
@@ -4076,6 +4077,40 @@
     if (bestsme < INT_MAX) {
       int dis; /* TODO: use dis in distortion calculation later. */
       unsigned int sse;
+#if CONFIG_AFFINE_MOTION
+      // Use up-sampled reference frames.
+      struct macroblockd_plane *const pd = &xd->plane[0];
+      struct buf_2d backup_pred = pd->pre[0];
+      const YV12_BUFFER_CONFIG *upsampled_ref =
+          get_upsampled_ref(cpi, refs[id]);
+
+      // Set pred for Y plane
+      setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
+                       upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                       NULL, pd->subsampling_x, pd->subsampling_y);
+
+      // If bsize < BLOCK_8X8, adjust pred pointer for this block
+      if (bsize < BLOCK_8X8)
+        pd->pre[0].buf =
+            &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, block,
+            pd->pre[0].stride)) << 3];
+
+      bestsme = cpi->find_fractional_mv_step(
+          x, &tmp_mv,
+          &ref_mv[id].as_mv,
+          cpi->common.allow_high_precision_mv,
+          x->errorperbit,
+          &cpi->fn_ptr[bsize],
+          0, cpi->sf.mv.subpel_iters_per_step,
+          NULL,
+          x->nmvjointcost, x->mvcost,
+          &dis, &sse, second_pred,
+          pw, ph, 1);
+
+      // Restore the reference frames.
+      pd->pre[0] = backup_pred;
+#else
+      (void) block;
       bestsme = cpi->find_fractional_mv_step(
           x, &tmp_mv,
           &ref_mv[id].as_mv,
@@ -4087,6 +4122,7 @@
           x->nmvjointcost, x->mvcost,
           &dis, &sse, second_pred,
           pw, ph);
+#endif
     }
 
     // Restore the pointer to the first (possibly scaled) prediction buffer.
@@ -4367,6 +4403,43 @@
 
           if (bestsme < INT_MAX) {
             int distortion;
+#if CONFIG_AFFINE_MOTION
+            const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+            const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+            // Use up-sampled reference frames.
+            struct macroblockd_plane *const pd = &xd->plane[0];
+            struct buf_2d backup_pred = pd->pre[0];
+            const YV12_BUFFER_CONFIG *upsampled_ref =
+                get_upsampled_ref(cpi, mbmi->ref_frame[0]);
+
+            // Set pred for Y plane
+            setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
+                             upsampled_ref->y_stride,
+                             (mi_row << 3), (mi_col << 3),
+                             NULL, pd->subsampling_x, pd->subsampling_y);
+
+            // adjust pred pointer for this block
+            pd->pre[0].buf =
+                &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, i,
+                pd->pre[0].stride)) << 3];
+
+            cpi->find_fractional_mv_step(
+                x,
+                new_mv,
+                &bsi->ref_mv[0]->as_mv,
+                cm->allow_high_precision_mv,
+                x->errorperbit, &cpi->fn_ptr[bsize],
+                cpi->sf.mv.subpel_force_stop,
+                cpi->sf.mv.subpel_iters_per_step,
+                cond_cost_list(cpi, cost_list),
+                x->nmvjointcost, x->mvcost,
+                &distortion,
+                &x->pred_sse[mbmi->ref_frame[0]],
+                NULL, pw, ph, 1);
+
+            // Restore the reference frames.
+            pd->pre[0] = backup_pred;
+#else
             cpi->find_fractional_mv_step(
                 x,
                 new_mv,
@@ -4380,6 +4453,7 @@
                 &distortion,
                 &x->pred_sse[mbmi->ref_frame[0]],
                 NULL, 0, 0);
+#endif
 
             // save motion search result for use in compound prediction
 #if CONFIG_EXT_INTER
@@ -4426,7 +4500,7 @@
 #else
                                 seg_mvs[i],
 #endif  // CONFIG_EXT_INTER
-                                &rate_mv);
+                                &rate_mv, i);
 #if CONFIG_EXT_INTER
             compound_seg_newmvs[i][0].as_int =
                 frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
@@ -4975,6 +5049,33 @@
 
   if (bestsme < INT_MAX) {
     int dis;  /* TODO: use dis in distortion calculation later. */
+#if CONFIG_AFFINE_MOTION
+    const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+    const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+    // Use up-sampled reference frames.
+    struct macroblockd_plane *const pd = &xd->plane[0];
+    struct buf_2d backup_pred = pd->pre[0];
+    const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+
+    // Set pred for Y plane
+    setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
+                     upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+
+    bestsme = cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+                                           cm->allow_high_precision_mv,
+                                           x->errorperbit,
+                                           &cpi->fn_ptr[bsize],
+                                           cpi->sf.mv.subpel_force_stop,
+                                           cpi->sf.mv.subpel_iters_per_step,
+                                           cond_cost_list(cpi, cost_list),
+                                           x->nmvjointcost, x->mvcost,
+                                           &dis, &x->pred_sse[ref], NULL,
+                                           pw, ph, 1);
+
+    // Restore the reference frames.
+    pd->pre[0] = backup_pred;
+#else
     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
                                  cm->allow_high_precision_mv,
                                  x->errorperbit,
@@ -4984,6 +5085,7 @@
                                  cond_cost_list(cpi, cost_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
+#endif
   }
   *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
@@ -5328,7 +5430,7 @@
 
         if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
           joint_motion_search(cpi, x, bsize, frame_mv,
-                              mi_row, mi_col, NULL, single_newmv, &rate_mv);
+                              mi_row, mi_col, NULL, single_newmv, &rate_mv, 0);
         } else {
           rate_mv  = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                       &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
@@ -5358,7 +5460,7 @@
       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
         joint_motion_search(cpi, x, bsize, frame_mv,
                             mi_row, mi_col,
-                            single_newmv, &rate_mv);
+                            single_newmv, &rate_mv, 0);
       } else {
         rate_mv  = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                    &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,

diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h
index 066bf69..74702a9 100644
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h

@@ -106,4 +106,20 @@
 }  // extern "C"
 #endif
 
+#if CONFIG_AFFINE_MOTION
+static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(VP10_COMP *cpi,
+                                                          const int ref) {
+  // Pick the frame-buffer slot that belongs to ref; any ref other than
+  // LAST_FRAME, GOLDEN_FRAME or ALTREF_FRAME falls back to slot 0. That
+  // slot is then translated through upsampled_ref_idx[] to select the
+  // matching entry of upsampled_ref_bufs[].
+  const int fb_slot = (ref == LAST_FRAME)   ? cpi->lst_fb_idx
+                    : (ref == GOLDEN_FRAME) ? cpi->gld_fb_idx
+                    : (ref == ALTREF_FRAME) ? cpi->alt_fb_idx
+                                            : 0;
+
+  return &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[fb_slot]].buf;
+}
+#endif
+
 #endif  // VP10_ENCODER_RDOPT_H_

diff --git a/vp10/encoder/temporal_filter.c b/vp10/encoder/temporal_filter.c
index d16e4a4..3e1246a 100644
--- a/vp10/encoder/temporal_filter.c
+++ b/vp10/encoder/temporal_filter.c

@@ -320,7 +320,11 @@
                                          0, mv_sf->subpel_iters_per_step,
                                          cond_cost_list(cpi, cost_list),
                                          NULL, NULL,
+#if CONFIG_AFFINE_MOTION
+                                         &distortion, &sse, NULL, 0, 0, 0);
+#else
                                          &distortion, &sse, NULL, 0, 0);
+#endif
 
   // Restore input state
   x->plane[0].src = src;