Add high-precision blending for OBMC

Use high-precision intermediate single-reference predictions when blending
OBMC predictors, as is already done for the other compound modes.
This feature is guarded by the CONFIG_OBMC_HIGH_PREC_BLENDING flag, which is
off by default. To turn it on, configure the build with:
-DCONFIG_OBMC_HIGH_PREC_BLENDING=1

Preliminary results (20 frames, speed 1, ext_partition and
ext_partition_types disabled):
-0.068% lowres, -0.053% midres, -0.08% AWCY

BUG=aomedia:1378

Change-Id: I234d6efa8bcd71cd1f0af3aaa1bf682c47ae75b9
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index 99023d3..6500efb 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -117,7 +117,8 @@
 }
 
 static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
-                                                      int plane, int32_t *dst,
+                                                      int plane,
+                                                      CONV_BUF_TYPE *dst,
                                                       int dst_stride,
                                                       int is_compound, int bd) {
   ConvolveParams conv_params;
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 87b859a..18e316a 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -645,7 +645,6 @@
       //   if sign requested is 1, we need to return the complement index [1]
       //   instead.
       wedge_params.signflip[w] = (avg < 32);
-      // printf("%d[%d] = %d\n", sb_type, w, wedge_params.signflip[w]);
     }
   }
 }
@@ -1316,6 +1315,33 @@
     av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
 }
 
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+static void setup_dst_plane(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
+                            const YV12_BUFFER_CONFIG *src, int mi_row,
+                            int mi_col, int plane) {  // single-plane dst setup
+  struct macroblockd_plane *const pd = &planes[plane];  // only this plane
+  const int is_uv = plane > 0;  // index for crop size and stride
+  setup_pred_plane(&pd->dst, bsize, src->buffers[plane],
+                   src->crop_widths[is_uv], src->crop_heights[is_uv],
+                   src->strides[is_uv], mi_row, mi_col, NULL, pd->subsampling_x,
+                   pd->subsampling_y);
+}
+
+static void setup_pre_plane(MACROBLOCKD *xd, int idx,
+                            const YV12_BUFFER_CONFIG *src, int mi_row,
+                            int mi_col, const struct scale_factors *sf,
+                            int plane) {  // single-plane pre[idx] setup
+  if (src != NULL) {  // a NULL src leaves pd->pre[idx] untouched
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int is_uv = plane > 0;  // index for crop size and stride
+    setup_pred_plane(&pd->pre[idx], xd->mi[0]->mbmi.sb_type,
+                     src->buffers[plane], src->crop_widths[is_uv],
+                     src->crop_heights[is_uv], src->strides[is_uv], mi_row,
+                     mi_col, sf, pd->subsampling_x, pd->subsampling_y);
+  }
+}
+#endif
+
 void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const int num_planes) {
@@ -1389,6 +1415,41 @@
   }
 }
 
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+// Fills a w x h mask. dir = 0: vertical filter, dir = 1: horizontal filter
+void av1_get_obmc_mask2d(uint8_t *mask, const int mask_stride, int w, int h,
+                         int dir) {
+  const uint8_t *mask1d;
+  // The 1-D table is selected by h when dir is 0 and by w otherwise.
+  switch (dir ? w : h) {
+    case 1: mask1d = obmc_mask_1; break;
+    case 2: mask1d = obmc_mask_2; break;
+    case 4: mask1d = obmc_mask_4; break;
+    case 8: mask1d = obmc_mask_8; break;
+    case 16: mask1d = obmc_mask_16; break;
+    case 32: mask1d = obmc_mask_32; break;
+#if CONFIG_EXT_PARTITION
+    case 64: mask1d = obmc_mask_64; break;
+#endif  // CONFIG_EXT_PARTITION
+    default: assert(0); return;  // unsupported length: mask is left unwritten
+  }
+
+  if (dir == 0) {  // Vertical filter: row r is filled with mask1d[r]
+    for (int r = 0; r < h; ++r) {
+      memset(mask, mask1d[r], w * sizeof(mask[0]));
+      mask += mask_stride;
+    }
+  } else if (dir == 1) {  // Horizontal filter: each row copies mask1d[0..w)
+    for (int r = 0; r < h; ++r) {
+      memcpy(mask, mask1d, w * sizeof(mask[0]));
+      mask += mask_stride;
+    }
+  } else {
+    assert(0);
+  }
+}
+#endif
+
 static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_rc,
                                      uint8_t mi_hw, MODE_INFO *mi,
                                      void *fun_ctxt, const int num_planes) {
@@ -1440,6 +1501,381 @@
   }
 }
 
+void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {  // edits *mbmi
+  if (is_interintra_pred(mbmi)) {
+    mbmi->ref_frame[1] = NONE_FRAME;  // clear the second reference slot
+  } else if (has_second_ref(mbmi) &&
+             is_masked_compound_type(mbmi->interinter_compound_type)) {
+    mbmi->interinter_compound_type = COMPOUND_AVERAGE;  // drop masked type
+    mbmi->ref_frame[1] = NONE_FRAME;
+  }
+  if (has_second_ref(mbmi)) mbmi->ref_frame[1] = NONE_FRAME;  // other cases
+  return;
+}
+
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+struct obmc_inter_pred_plane_no_round_ctxt {
+  int32_t *base;  // current block's prediction; blended in place
+  int base_stride;
+  int32_t *adjacent;  // prediction built from the above or left neighbors
+  int adjacent_stride;
+  int plane;  // plane index used to look up xd->plane[] in the callbacks
+};
+
+static INLINE void build_obmc_inter_pred_plane_above_no_round(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+    MODE_INFO *above_mi, void *fun_ctxt, int num_planes) {
+  (void)above_mi;  // unused; invoked via foreach_overlappable_nb_above
+  struct obmc_inter_pred_plane_no_round_ctxt *ctxt =
+      (struct obmc_inter_pred_plane_no_round_ctxt *)fun_ctxt;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const int overlap =  // half the block height, capped via BLOCK_64X64
+      AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+  const int plane = ctxt->plane;
+  (void)num_planes;
+  // Size and column offset of the overlap region in this plane's samples.
+  const struct macroblockd_plane *pd = &xd->plane[plane];
+  const int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+  const int bh = overlap >> pd->subsampling_y;
+  const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x;
+
+  if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) return;
+
+  const int dst_stride = ctxt->base_stride;
+  int32_t *const dst = &ctxt->base[plane_col];
+  const int tmp_stride = ctxt->adjacent_stride;
+  const int32_t *const tmp = &ctxt->adjacent[plane_col];
+  uint8_t mask[MAX_SB_SQUARE];
+  const int mask_stride = bw;
+  // dst is passed twice (presumably output and first source): in-place blend.
+  av1_get_obmc_mask2d(mask, mask_stride, bw, bh, 0);
+  aom_blend_a64_d32_mask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
+                         mask, mask_stride, bh, bw, 0, 0);
+}
+
+static INLINE void build_obmc_inter_pred_plane_left_no_round(
+    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, MODE_INFO *left_mi,
+    void *fun_ctxt, int num_planes) {
+  (void)left_mi;  // unused; invoked via foreach_overlappable_nb_left
+  struct obmc_inter_pred_plane_no_round_ctxt *ctxt =
+      (struct obmc_inter_pred_plane_no_round_ctxt *)fun_ctxt;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const int overlap =  // half the block width, capped via BLOCK_64X64
+      AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+  const int plane = ctxt->plane;
+  (void)num_planes;
+  // Size and row offset of the overlap region in this plane's samples.
+  const struct macroblockd_plane *pd = &xd->plane[plane];
+  const int bw = overlap >> pd->subsampling_x;
+  const int bh = (left_mi_height * MI_SIZE) >> pd->subsampling_y;
+  const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y;
+
+  if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) return;
+
+  const int dst_stride = ctxt->base_stride;
+  int32_t *const dst = &ctxt->base[plane_row * dst_stride];
+  const int tmp_stride = ctxt->adjacent_stride;
+  const int32_t *const tmp = &ctxt->adjacent[plane_row * tmp_stride];
+  uint8_t mask[MAX_SB_SQUARE];
+  const int mask_stride = bw;
+  // dst is passed twice (presumably output and first source): in-place blend.
+  av1_get_obmc_mask2d(mask, mask_stride, bw, bh, 1);
+  aom_blend_a64_d32_mask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
+                         mask, mask_stride, bh, bw, 0, 0);
+}
+
+void av1_build_obmc_inter_prediction_plane(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, int mi_row, int mi_col,
+    CONV_BUF_TYPE *base, int base_stride, CONV_BUF_TYPE *above,
+    int above_stride, CONV_BUF_TYPE *left, int left_stride) {
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  // Blends `above`, then `left`, into `base`, then rounds base into pd->dst.
+  // handle above row: blend the above-neighbor prediction into base
+  struct obmc_inter_pred_plane_no_round_ctxt ctxt_above = {
+    base, base_stride, above, above_stride, plane
+  };
+  foreach_overlappable_nb_above(
+      cm, xd, mi_col, max_neighbor_obmc[b_width_log2_lookup[bsize]],
+      build_obmc_inter_pred_plane_above_no_round, &ctxt_above);
+
+  // handle left column: blend the left-neighbor prediction into base
+  struct obmc_inter_pred_plane_no_round_ctxt ctxt_left = {
+    base, base_stride, left, left_stride, plane
+  };
+  foreach_overlappable_nb_left(
+      cm, xd, mi_row, max_neighbor_obmc[b_height_log2_lookup[bsize]],
+      build_obmc_inter_pred_plane_left_no_round, &ctxt_left);
+  // Point this plane's dst at the new frame buffer for the final output.
+  setup_dst_plane(xd->plane, xd->mi[0]->mbmi.sb_type, get_frame_new_buffer(cm),
+                  mi_row, mi_col, plane);
+  // conv_params is used here only for its round_0/round_1 values.
+  ConvolveParams conv_params =
+      get_conv_params_no_round(0, 0, plane, NULL, MAX_SB_SIZE, 1, xd->bd);
+  const int convolve_rounding_bits =
+      FILTER_BITS * 2 - conv_params.round_0 - conv_params.round_1;
+  const struct macroblockd_plane *pd = &xd->plane[plane];
+  int bw = pd->width;
+  int bh = pd->height;
+  // Convert the blended high-precision buffer into pd->dst.
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    av1_highbd_convolve_rounding(base, base_stride, pd->dst.buf, pd->dst.stride,
+                                 bw, bh, convolve_rounding_bits, xd->bd);
+  else
+    av1_convolve_rounding(base, base_stride, pd->dst.buf, pd->dst.stride, bw,
+                          bh, convolve_rounding_bits);
+}
+
+struct build_prediction_plane_hp_ctxt {
+  const AV1_COMMON *cm;
+  int mi_row;
+  int mi_col;
+  CONV_BUF_TYPE *tmp_buf;  // destination for the neighbor-based prediction
+  int tmp_stride;
+  int mb_to_far_edge;  // right edge (above pass) or bottom edge (left pass)
+  int plane;
+};
+
+static INLINE void build_prediction_plane_by_above_pred_hp(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+    MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) {
+  MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
+  const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
+  struct build_prediction_plane_hp_ctxt *ctxt =
+      (struct build_prediction_plane_hp_ctxt *)fun_ctxt;
+  const int above_mi_col = ctxt->mi_col + rel_mi_col;
+  int32_t *dst[MAX_MB_PLANE];  // only dst[plane] is used
+  const int plane = ctxt->plane;
+  (void)num_planes;
+  // Temporarily rewrite the neighbor's mode info; restored before returning.
+  MB_MODE_INFO backup_mbmi = *above_mbmi;
+  modify_neighbor_predictor_for_obmc(above_mbmi);
+
+  const int num_refs = 1 + has_second_ref(above_mbmi);
+
+  assert(num_refs == 1);
+
+  for (int ref = 0; ref < num_refs; ++ref) {
+    const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
+
+    const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
+
+    xd->block_refs[ref] = ref_buf;
+    if ((!av1_is_valid_scale(&ref_buf->sf)))
+      aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Reference frame has invalid dimensions");
+    setup_pre_plane(xd, ref, ref_buf->buf, ctxt->mi_row, above_mi_col,
+                    &ref_buf->sf, plane);
+  }
+  // Recompute the left/right edge distances for the neighbor's column span.
+  xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
+  xd->mb_to_right_edge = ctxt->mb_to_far_edge +
+                         (xd->n8_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
+
+  int mi_x = above_mi_col << MI_SIZE_LOG2;
+  int mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  // do/while(0) lets `continue` skip the predictor but still restore mbmi.
+  do {
+    const struct macroblockd_plane *pd = &xd->plane[plane];
+    int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+    int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+                   block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+
+    setup_pred_plane_hp(&dst[plane], a_bsize, ctxt->tmp_buf, ctxt->tmp_stride,
+                        0, rel_mi_col, NULL, pd->subsampling_x,
+                        pd->subsampling_y);
+    if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+    av1_build_inter_predictor_hp_sr(xd, plane, above_mi, 1, bw, bh, 0, 0, bw,
+                                    bh, mi_x, mi_y, 0, dst[plane],
+                                    ctxt->tmp_stride);
+  } while (0);
+  *above_mbmi = backup_mbmi;
+}
+
+void av1_build_prediction_plane_by_above_preds_hp(const AV1_COMMON *cm,
+                                                  MACROBLOCKD *xd, int plane,
+                                                  int mi_row, int mi_col,
+                                                  int32_t *tmp_buf,
+                                                  int tmp_stride) {
+  if (!xd->up_available) return;
+
+  // Adjust mb_to_bottom_edge to have the correct value for the OBMC
+  // prediction block. This is half the height of the original block,
+  // except for 128-high blocks, where we only use a height of 32.
+  int this_height = xd->n8_h * MI_SIZE;
+  int pred_height = AOMMIN(this_height / 2, 32);
+  xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
+
+  struct build_prediction_plane_hp_ctxt ctxt = {
+    cm, mi_row, mi_col, tmp_buf, tmp_stride, xd->mb_to_right_edge, plane
+  };
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  foreach_overlappable_nb_above(cm, xd, mi_col,
+                                max_neighbor_obmc[b_width_log2_lookup[bsize]],
+                                build_prediction_plane_by_above_pred_hp, &ctxt);
+  // Restore the edge fields changed above and by the per-neighbor callback.
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ctxt.mb_to_far_edge;
+  xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
+}
+
+static INLINE void build_prediction_plane_by_left_pred_hp(
+    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, MODE_INFO *left_mi,
+    void *fun_ctxt, const int num_planes) {
+  MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
+  const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
+  struct build_prediction_plane_hp_ctxt *ctxt =
+      (struct build_prediction_plane_hp_ctxt *)fun_ctxt;
+  const int left_mi_row = ctxt->mi_row + rel_mi_row;
+  int32_t *dst[MAX_MB_PLANE];  // only dst[plane] is used
+  const int plane = ctxt->plane;
+  (void)num_planes;
+  // Temporarily rewrite the neighbor's mode info; restored before returning.
+  MB_MODE_INFO backup_mbmi = *left_mbmi;
+  modify_neighbor_predictor_for_obmc(left_mbmi);
+
+  const int num_refs = 1 + has_second_ref(left_mbmi);
+  // NOTE(review): the above-neighbor variant asserts num_refs == 1 here.
+  for (int ref = 0; ref < num_refs; ++ref) {
+    const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
+
+    const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
+
+    xd->block_refs[ref] = ref_buf;
+    if ((!av1_is_valid_scale(&ref_buf->sf)))
+      aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Reference frame has invalid dimensions");
+    setup_pre_plane(xd, ref, ref_buf->buf, left_mi_row, ctxt->mi_col,
+                    &ref_buf->sf, plane);
+  }
+  // Recompute the top/bottom edge distances for the neighbor's row span.
+  xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
+  xd->mb_to_bottom_edge =
+      ctxt->mb_to_far_edge +
+      (xd->n8_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
+
+  int mi_x = ctxt->mi_col << MI_SIZE_LOG2;
+  int mi_y = left_mi_row << MI_SIZE_LOG2;
+
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  // do/while(0) lets `continue` skip the predictor but still restore mbmi.
+  do {
+    const struct macroblockd_plane *pd = &xd->plane[plane];
+    int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+                   block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+    int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+    setup_pred_plane_hp(&dst[plane], l_bsize, ctxt->tmp_buf, ctxt->tmp_stride,
+                        rel_mi_row, 0, NULL, pd->subsampling_x,
+                        pd->subsampling_y);
+    if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+    av1_build_inter_predictor_hp_sr(xd, plane, left_mi, 1, bw, bh, 0, 0, bw, bh,
+                                    mi_x, mi_y, 0, dst[plane],
+                                    ctxt->tmp_stride);
+  } while (0);
+  *left_mbmi = backup_mbmi;
+}
+
+void av1_build_prediction_plane_by_left_preds_hp(const AV1_COMMON *cm,
+                                                 MACROBLOCKD *xd, int plane,
+                                                 int mi_row, int mi_col,
+                                                 int32_t *tmp_buf,
+                                                 int tmp_stride) {
+  if (!xd->left_available) return;
+
+  // Adjust mb_to_right_edge to have the correct value for the OBMC
+  // prediction block. This is half the width of the original block,
+  // except for 128-wide blocks, where we only use a width of 32.
+  int this_width = xd->n8_w * MI_SIZE;
+  int pred_width = AOMMIN(this_width / 2, 32);
+  xd->mb_to_right_edge += (this_width - pred_width) * 8;
+
+  struct build_prediction_plane_hp_ctxt ctxt = {
+    cm, mi_row, mi_col, tmp_buf, tmp_stride, xd->mb_to_bottom_edge, plane,
+  };
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  foreach_overlappable_nb_left(cm, xd, mi_row,
+                               max_neighbor_obmc[b_height_log2_lookup[bsize]],
+                               build_prediction_plane_by_left_pred_hp, &ctxt);
+  // Restore the edge fields changed above and by the per-neighbor callback.
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_right_edge -= (this_width - pred_width) * 8;
+  xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
+}
+
+void av1_build_obmc_inter_prediction(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+    CONV_BUF_TYPE *base[MAX_MB_PLANE], int base_stride[MAX_MB_PLANE],
+    CONV_BUF_TYPE *above[MAX_MB_PLANE], int above_stride[MAX_MB_PLANE],
+    CONV_BUF_TYPE *left[MAX_MB_PLANE], int left_stride[MAX_MB_PLANE]) {
+  const int num_planes = av1_num_planes(cm);
+  // Run the per-plane blend for every plane, using that plane's buffers.
+  for (int plane = 0; plane < num_planes; ++plane) {
+    av1_build_obmc_inter_prediction_plane(
+        cm, xd, plane, mi_row, mi_col, base[plane], base_stride[plane],
+        above[plane], above_stride[plane], left[plane], left_stride[plane]);
+  }
+}
+
+void av1_build_prediction_by_above_preds_hp(const AV1_COMMON *cm,
+                                            MACROBLOCKD *xd, int mi_row,
+                                            int mi_col,
+                                            int32_t *tmp_buf[MAX_MB_PLANE],
+                                            int tmp_stride[MAX_MB_PLANE]) {
+  const int num_planes = av1_num_planes(cm);
+  if (!xd->up_available) return;  // the per-plane call repeats this check
+  // Build the above-neighbor prediction one plane at a time.
+  for (int plane = 0; plane < num_planes; ++plane) {
+    av1_build_prediction_plane_by_above_preds_hp(
+        cm, xd, plane, mi_row, mi_col, tmp_buf[plane], tmp_stride[plane]);
+  }
+}
+
+void av1_build_prediction_by_left_preds_hp(const AV1_COMMON *cm,
+                                           MACROBLOCKD *xd, int mi_row,
+                                           int mi_col,
+                                           int32_t *tmp_buf[MAX_MB_PLANE],
+                                           int tmp_stride[MAX_MB_PLANE]) {
+  const int num_planes = av1_num_planes(cm);
+  if (!xd->left_available) return;  // the per-plane call repeats this check
+  // Build the left-neighbor prediction one plane at a time.
+  for (int plane = 0; plane < num_planes; ++plane) {
+    av1_build_prediction_plane_by_left_preds_hp(
+        cm, xd, plane, mi_row, mi_col, tmp_buf[plane], tmp_stride[plane]);
+  }
+}
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        int mi_row, int mi_col) {
+  const int num_planes = av1_num_planes(cm);
+  DECLARE_ALIGNED(16, CONV_BUF_TYPE, dst_buf0[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, CONV_BUF_TYPE, dst_buf1[MAX_SB_SQUARE >> 1]);
+  DECLARE_ALIGNED(16, CONV_BUF_TYPE, dst_buf2[MAX_SB_SQUARE >> 1]);
+  // dst_buf0: current block; dst_buf1: above preds; dst_buf2: left preds.
+  int dst_stride0 = MAX_SB_SIZE;
+  int dst_stride1 = MAX_SB_SIZE;  // dst_buf1 holds MAX_SB_SIZE / 2 rows
+  int dst_stride2 = MAX_SB_SIZE >> 1;  // dst_buf2 holds MAX_SB_SIZE rows
+
+  for (int j = 0; j < num_planes; ++j) {  // the buffers are reused per plane
+    const struct macroblockd_plane *pd = &xd->plane[j];
+    int bw = pd->width;
+    int bh = pd->height;
+    const int mi_x = mi_col * MI_SIZE;
+    const int mi_y = mi_row * MI_SIZE;
+    // Own prediction, then neighbor predictions, then blend and round.
+    av1_build_inter_predictor_hp_sr(xd, j, xd->mi[0], 0, bw, bh, 0, 0, bw, bh,
+                                    mi_x, mi_y, 0, dst_buf0, dst_stride0);
+    av1_build_prediction_plane_by_above_preds_hp(cm, xd, j, mi_row, mi_col,
+                                                 dst_buf1, dst_stride1);
+    av1_build_prediction_plane_by_left_preds_hp(cm, xd, j, mi_row, mi_col,
+                                                dst_buf2, dst_stride2);
+    av1_build_obmc_inter_prediction_plane(cm, xd, j, mi_row, mi_col, dst_buf0,
+                                          dst_stride0, dst_buf1, dst_stride1,
+                                          dst_buf2, dst_stride2);
+  }
+}
+#else
 struct obmc_inter_pred_ctxt {
   uint8_t **adjacent;
   int *adjacent_stride;
@@ -1540,18 +1976,6 @@
                                build_obmc_inter_pred_left, &ctxt_left);
 }
 
-void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
-  if (is_interintra_pred(mbmi)) {
-    mbmi->ref_frame[1] = NONE_FRAME;
-  } else if (has_second_ref(mbmi) &&
-             is_masked_compound_type(mbmi->interinter_compound_type)) {
-    mbmi->interinter_compound_type = COMPOUND_AVERAGE;
-    mbmi->ref_frame[1] = NONE_FRAME;
-  }
-  if (has_second_ref(mbmi)) mbmi->ref_frame[1] = NONE_FRAME;
-  return;
-}
-
 struct build_prediction_ctxt {
   const AV1_COMMON *cm;
   int mi_row;
@@ -1770,6 +2194,7 @@
   av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
                                   dst_buf2, dst_stride2);
 }
+#endif
 
 /* clang-format off */
 #if CONFIG_EXT_PARTITION
@@ -2088,6 +2513,74 @@
   }
 }
 
+// Builds a high-precision single-reference predictor for `plane` into ext_dst.
+// The un-rounded result feeds the high-precision blending used by compound
+// modes and by the OBMC code in this file.
+void av1_build_inter_predictor_hp_sr(MACROBLOCKD *xd, int plane,
+                                     const MODE_INFO *mi, int build_for_obmc,
+                                     int bw, int bh, int x, int y, int w, int h,
+                                     int mi_x, int mi_y, int ref,
+                                     CONV_BUF_TYPE *const ext_dst,
+                                     int ext_dst_stride) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+  struct buf_2d *const pre_buf = &pd->pre[ref];
+  const MV mv = mi->mbmi.mv[ref].as_mv;
+  uint8_t *pre;
+  int xs, ys, subpel_x, subpel_y;
+  const int is_scaled = av1_is_scaled(sf);
+  // Set is_compound as 1 to enable high precision output
+  ConvolveParams conv_params = get_conv_params_no_round(
+      ref, 0, plane, ext_dst, ext_dst_stride, 1, xd->bd);
+  WarpTypesAllowed warp_types;
+  const WarpedMotionParams *const wm =
+      &xd->global_motion[mi->mbmi.ref_frame[ref]];
+  warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
+  warp_types.local_warp_allowed = mi->mbmi.motion_mode == WARPED_CAUSAL;
+  // Locate the reference block start, sub-pel phase and step.
+  if (is_scaled) {
+    int ssx = pd->subsampling_x;
+    int ssy = pd->subsampling_y;
+    int orig_pos_y = (mi_y << (SUBPEL_BITS - ssy)) + (y << SUBPEL_BITS);
+    orig_pos_y += mv.row * (1 << (1 - ssy));
+    int orig_pos_x = (mi_x << (SUBPEL_BITS - ssx)) + (x << SUBPEL_BITS);
+    orig_pos_x += mv.col * (1 << (1 - ssx));
+    int pos_y = sf->scale_value_y(orig_pos_y, sf);
+    int pos_x = sf->scale_value_x(orig_pos_x, sf);
+    pos_x += SCALE_EXTRA_OFF;
+    pos_y += SCALE_EXTRA_OFF;
+    // Clamp the scaled position to the limits computed below.
+    const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+    const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+    const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                       << SCALE_SUBPEL_BITS;
+    const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+    pos_y = clamp(pos_y, top, bottom);
+    pos_x = clamp(pos_x, left, right);
+
+    pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+    subpel_x = pos_x & SCALE_SUBPEL_MASK;
+    subpel_y = pos_y & SCALE_SUBPEL_MASK;
+    xs = sf->x_step_q4;
+    ys = sf->y_step_q4;
+  } else {  // unscaled reference: use the border-clamped MV directly
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(
+        xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+    xs = ys = SCALE_SUBPEL_SHIFTS;
+    subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+    subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+    pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
+          (x + (mv_q4.col >> SUBPEL_BITS));
+  }
+  // No pixel dst is passed; conv_params carries ext_dst and its stride.
+  av1_make_inter_predictor(pre, pre_buf->stride, NULL, 0, subpel_x, subpel_y,
+                           sf, w, h, &conv_params, mi->mbmi.interp_filters,
+                           &warp_types, (mi_x >> pd->subsampling_x) + x,
+                           (mi_y >> pd->subsampling_y) + y, plane, ref, mi,
+                           build_for_obmc, xs, ys, xd);
+}
+
 static void build_wedge_inter_predictor_from_buf(
     MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
     int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index f527b6a..dbeb1d5 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -417,15 +417,45 @@
   return 0;
 }
 
-const uint8_t *av1_get_obmc_mask(int length);
-void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                      int mi_row, int mi_col);
-void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col,
-                                     uint8_t *above[MAX_MB_PLANE],
-                                     int above_stride[MAX_MB_PLANE],
-                                     uint8_t *left[MAX_MB_PLANE],
-                                     int left_stride[MAX_MB_PLANE]);
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+static INLINE void setup_pred_plane_hp(int32_t **dst_pt, BLOCK_SIZE bsize,
+                                       int32_t *dst0, int stride0, int mi_row,
+                                       int mi_col,
+                                       const struct scale_factors *scale,
+                                       int subsampling_x, int subsampling_y) {
+  // On a subsampled axis, a 1-mi block at an odd position steps back by one.
+  if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
+    mi_row -= 1;
+  if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
+    mi_col -= 1;
+  // Convert to sample units and store dst0 plus that offset in *dst_pt.
+  const int x = (MI_SIZE * mi_col) >> subsampling_x;
+  const int y = (MI_SIZE * mi_row) >> subsampling_y;
+  *dst_pt = dst0 + scaled_buffer_offset(x, y, stride0, scale);
+}
+
+void av1_build_inter_predictor_hp_sr(MACROBLOCKD *xd, int plane,
+                                     const MODE_INFO *mi, int build_for_obmc,
+                                     int bw, int bh, int x, int y, int w, int h,
+                                     int mi_x, int mi_y, int ref,
+                                     CONV_BUF_TYPE *const ext_dst,  // output
+                                     int ext_dst_stride);
+void av1_build_prediction_by_above_preds_hp(const AV1_COMMON *cm,
+                                            MACROBLOCKD *xd, int mi_row,
+                                            int mi_col,
+                                            int32_t *tmp_buf[MAX_MB_PLANE],
+                                            int tmp_stride[MAX_MB_PLANE]);
+void av1_build_prediction_by_left_preds_hp(const AV1_COMMON *cm,
+                                           MACROBLOCKD *xd, int mi_row,
+                                           int mi_col,
+                                           int32_t *tmp_buf[MAX_MB_PLANE],
+                                           int tmp_stride[MAX_MB_PLANE]);
+void av1_build_obmc_inter_prediction(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+    CONV_BUF_TYPE *base[MAX_MB_PLANE], int base_stride[MAX_MB_PLANE],
+    CONV_BUF_TYPE *above[MAX_MB_PLANE], int above_stride[MAX_MB_PLANE],
+    CONV_BUF_TYPE *left[MAX_MB_PLANE], int left_stride[MAX_MB_PLANE]);
+#else
 void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          int mi_row, int mi_col,
                                          uint8_t *tmp_buf[MAX_MB_PLANE],
@@ -438,6 +468,17 @@
                                         int tmp_width[MAX_MB_PLANE],
                                         int tmp_height[MAX_MB_PLANE],
                                         int tmp_stride[MAX_MB_PLANE]);
+void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                     int mi_row, int mi_col,
+                                     uint8_t *above[MAX_MB_PLANE],
+                                     int above_stride[MAX_MB_PLANE],
+                                     uint8_t *left[MAX_MB_PLANE],
+                                     int left_stride[MAX_MB_PLANE]);
+#endif
+
+const uint8_t *av1_get_obmc_mask(int length);
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                      int mi_row, int mi_col);
 void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                         int mi_row, int mi_col);
 
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 687144a..b884465 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -422,11 +422,17 @@
       }
     }
 
-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
-
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
     if (mbmi->motion_mode == OBMC_CAUSAL) {
       av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+    } else {
+      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
     }
+#else
+    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+    if (mbmi->motion_mode == OBMC_CAUSAL)
+      av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+#endif
 
 #if CONFIG_MISMATCH_DEBUG
     for (int plane = 0; plane < num_planes; ++plane) {
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 3cbb8ad..7579213 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -224,8 +224,13 @@
 
   int32_t *wsrc_buf;
   int32_t *mask_buf;
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+  CONV_BUF_TYPE *above_pred_hp_buf;
+  CONV_BUF_TYPE *left_pred_hp_buf;
+#else
   uint8_t *above_pred_buf;
   uint8_t *left_pred_buf;
+#endif
 
   PALETTE_BUFFER *palette_buffer;
 
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index f039d8a..2c7b87f 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4951,11 +4951,17 @@
                            &xd->block_refs[ref]->sf, num_planes);
     }
 
-    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
-
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
     if (mbmi->motion_mode == OBMC_CAUSAL) {
       av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+    } else {
+      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
     }
+#else
+    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+    if (mbmi->motion_mode == OBMC_CAUSAL)
+      av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+#endif
 
 #if CONFIG_MISMATCH_DEBUG
     if (dry_run == OUTPUT_ENABLED) {
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index cc3d679..2ab88ad 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -527,11 +527,19 @@
   aom_free(cpi->active_map.map);
   cpi->active_map.map = NULL;
 
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+  aom_free(cpi->td.mb.above_pred_hp_buf);
+  cpi->td.mb.above_pred_hp_buf = NULL;
+
+  aom_free(cpi->td.mb.left_pred_hp_buf);
+  cpi->td.mb.left_pred_hp_buf = NULL;
+#else
   aom_free(cpi->td.mb.above_pred_buf);
   cpi->td.mb.above_pred_buf = NULL;
 
   aom_free(cpi->td.mb.left_pred_buf);
   cpi->td.mb.left_pred_buf = NULL;
+#endif
 
   aom_free(cpi->td.mb.wsrc_buf);
   cpi->td.mb.wsrc_buf = NULL;
@@ -3419,7 +3427,18 @@
     av1_init_second_pass(cpi);
   }
 
-  int buf_scaler = 2;
+  int buf_scaler = 1;
+
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+  CHECK_MEM_ERROR(cm, cpi->td.mb.above_pred_hp_buf,
+                  (CONV_BUF_TYPE *)aom_memalign(
+                      16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                              sizeof(*cpi->td.mb.above_pred_hp_buf)));
+  CHECK_MEM_ERROR(cm, cpi->td.mb.left_pred_hp_buf,
+                  (CONV_BUF_TYPE *)aom_memalign(
+                      16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                              sizeof(*cpi->td.mb.left_pred_hp_buf)));
+#else
   CHECK_MEM_ERROR(
       cm, cpi->td.mb.above_pred_buf,
       (uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
@@ -3428,6 +3447,7 @@
       cm, cpi->td.mb.left_pred_buf,
       (uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
                                       sizeof(*cpi->td.mb.left_pred_buf)));
+#endif
 
   CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
                   (int32_t *)aom_memalign(
@@ -3945,8 +3965,13 @@
     // Deallocate allocated thread data.
     if (t < cpi->num_workers - 1) {
       aom_free(thread_data->td->palette_buffer);
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+      aom_free(thread_data->td->above_pred_hp_buf);
+      aom_free(thread_data->td->left_pred_hp_buf);
+#else
       aom_free(thread_data->td->above_pred_buf);
       aom_free(thread_data->td->left_pred_buf);
+#endif
       aom_free(thread_data->td->wsrc_buf);
       aom_free(thread_data->td->mask_buf);
       aom_free(thread_data->td->counts);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 0c5af2b..92782a1 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -364,8 +364,13 @@
   PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
   int32_t *wsrc_buf;
   int32_t *mask_buf;
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+  CONV_BUF_TYPE *above_pred_hp_buf;  // Matches the ethread.c alloc cast.
+  CONV_BUF_TYPE *left_pred_hp_buf;
+#else
   uint8_t *above_pred_buf;
   uint8_t *left_pred_buf;
+#endif
   PALETTE_BUFFER *palette_buffer;
 #if CONFIG_INTRABC
   int intrabc_used_this_tile;
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index cf7b465..5dbccc7 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -86,6 +86,18 @@
         av1_setup_pc_tree(cm, thread_data->td);
 
         int buf_scaler = 2;
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+        CHECK_MEM_ERROR(
+            cm, thread_data->td->above_pred_hp_buf,
+            (CONV_BUF_TYPE *)aom_memalign(
+                16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                        sizeof(*thread_data->td->above_pred_hp_buf)));
+        CHECK_MEM_ERROR(
+            cm, thread_data->td->left_pred_hp_buf,
+            (CONV_BUF_TYPE *)aom_memalign(
+                16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                        sizeof(*thread_data->td->left_pred_hp_buf)));
+#else
         CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
                         (uint8_t *)aom_memalign(
                             16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
@@ -94,6 +106,8 @@
                         (uint8_t *)aom_memalign(
                             16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
                                     sizeof(*thread_data->td->left_pred_buf)));
+#endif
+
         CHECK_MEM_ERROR(
             cm, thread_data->td->wsrc_buf,
             (int32_t *)aom_memalign(
@@ -137,8 +151,14 @@
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
       thread_data->td->rd_counts = cpi->td.rd_counts;
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+      thread_data->td->mb.above_pred_hp_buf =
+          thread_data->td->above_pred_hp_buf;
+      thread_data->td->mb.left_pred_hp_buf = thread_data->td->left_pred_hp_buf;
+#else
       thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
       thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
+#endif
       thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
       thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
     }
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 3dabf35..5b9c392 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -7224,11 +7224,18 @@
 }
 
 typedef struct {
-  // Inter prediction buffers and respective strides
+// OBMC secondary prediction buffers and respective strides
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+  CONV_BUF_TYPE *above_pred_hp_buf[MAX_MB_PLANE];
+  int above_pred_hp_stride[MAX_MB_PLANE];
+  CONV_BUF_TYPE *left_pred_hp_buf[MAX_MB_PLANE];
+  int left_pred_hp_stride[MAX_MB_PLANE];
+#else
   uint8_t *above_pred_buf[MAX_MB_PLANE];
   int above_pred_stride[MAX_MB_PLANE];
   uint8_t *left_pred_buf[MAX_MB_PLANE];
   int left_pred_stride[MAX_MB_PLANE];
+#endif
   int_mv *single_newmv;
   // Pointer to array of motion vectors to use for each ref and their rates
   // Should point to first of 2 arrays in 2D array
@@ -7582,13 +7589,36 @@
         tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
         mbmi->interp_filters =
             condition_interp_filters_on_mv(mbmi->interp_filters, xd);
-        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
-      } else {
-        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
       }
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+      DECLARE_ALIGNED(16, CONV_BUF_TYPE, tmp_buf[MAX_MB_PLANE * MAX_SB_SQUARE]);
+      CONV_BUF_TYPE *dst_buf[MAX_MB_PLANE];
+      int dst_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+      dst_buf[0] = tmp_buf;
+      dst_buf[1] = tmp_buf + MAX_SB_SQUARE;
+      dst_buf[2] = tmp_buf + MAX_SB_SQUARE * 2;
+
+      for (int j = 0; j < num_planes; ++j) {
+        const struct macroblockd_plane *pd = &xd->plane[j];
+        int bw = pd->width;
+        int bh = pd->height;
+        const int mi_x = mi_col * MI_SIZE;
+        const int mi_y = mi_row * MI_SIZE;
+
+        av1_build_inter_predictor_hp_sr(xd, j, xd->mi[0], 0, bw, bh, 0, 0, bw,
+                                        bh, mi_x, mi_y, 0, dst_buf[j],
+                                        dst_stride[j]);
+      }
+      av1_build_obmc_inter_prediction(
+          cm, xd, mi_row, mi_col, dst_buf, dst_stride, args->above_pred_hp_buf,
+          args->above_pred_hp_stride, args->left_pred_hp_buf,
+          args->left_pred_hp_stride);
+#else
+      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
       av1_build_obmc_inter_prediction(
           cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
           args->left_pred_buf, args->left_pred_stride);
+#endif
     }
 
     // Local warped motion mode
@@ -9005,11 +9035,20 @@
                            plane_block_height);
 }
 
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+                                      const MACROBLOCKD *xd, int mi_row,
+                                      int mi_col, const CONV_BUF_TYPE *above,
+                                      int above_stride,
+                                      const CONV_BUF_TYPE *left,
+                                      int left_stride);
+#else
 static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
                                       const MACROBLOCKD *xd, int mi_row,
                                       int mi_col, const uint8_t *above,
                                       int above_stride, const uint8_t *left,
                                       int left_stride);
+#endif
 
 #if CONFIG_EXT_SKIP
 static void estimate_skip_mode_rdcost(
@@ -9212,6 +9251,8 @@
   int *mode_map = tile_data->mode_map[bsize];
   const int mode_search_skip_flags = sf->mode_search_skip_flags;
   int skip_intra_modes = 0;
+  const int rows = block_size_high[bsize];
+  const int cols = block_size_wide[bsize];
 
   HandleInterModeArgs args = {
     { NULL },  { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
@@ -9221,19 +9262,19 @@
     { { 0 } },
   };
 
-  const int rows = block_size_high[bsize];
-  const int cols = block_size_wide[bsize];
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+  args.above_pred_hp_buf[0] = x->above_pred_hp_buf;
+  args.above_pred_hp_buf[1] = x->above_pred_hp_buf + MAX_SB_SQUARE;
+  args.above_pred_hp_buf[2] = x->above_pred_hp_buf + 2 * MAX_SB_SQUARE;
+  args.left_pred_hp_buf[0] = x->left_pred_hp_buf;
+  args.left_pred_hp_buf[1] = x->left_pred_hp_buf + MAX_SB_SQUARE;
+  args.left_pred_hp_buf[2] = x->left_pred_hp_buf + 2 * MAX_SB_SQUARE;
+#else
   int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
 
-  int64_t dist_refs[TOTAL_REFS_PER_FRAME];
-  int dist_order_refs[TOTAL_REFS_PER_FRAME];
-  int num_available_refs = 0;
-  memset(dist_refs, -1, sizeof(dist_refs));
-  memset(dist_order_refs, -1, sizeof(dist_order_refs));
-
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     int len = sizeof(uint16_t);
     args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
@@ -9254,6 +9295,13 @@
     args.left_pred_buf[1] = x->left_pred_buf + MAX_SB_SQUARE;
     args.left_pred_buf[2] = x->left_pred_buf + 2 * MAX_SB_SQUARE;
   }
+#endif
+
+  int64_t dist_refs[TOTAL_REFS_PER_FRAME];
+  int dist_order_refs[TOTAL_REFS_PER_FRAME];
+  int num_available_refs = 0;
+  memset(dist_refs, -1, sizeof(dist_refs));
+  memset(dist_order_refs, -1, sizeof(dist_order_refs));
 
   av1_zero(best_mbmode);
   av1_zero(pmi_uv);
@@ -9321,6 +9369,20 @@
 
   if (check_num_overlappable_neighbors(mbmi) &&
       is_motion_variation_allowed_bsize(bsize)) {
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+    av1_build_prediction_by_above_preds_hp(cm, xd, mi_row, mi_col,
+                                           args.above_pred_hp_buf,
+                                           args.above_pred_hp_stride);
+    av1_build_prediction_by_left_preds_hp(cm, xd, mi_row, mi_col,
+                                          args.left_pred_hp_buf,
+                                          args.left_pred_hp_stride);
+    av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+                         mi_col, num_planes);
+    calc_target_weighted_pred(
+        cm, x, xd, mi_row, mi_col, args.above_pred_hp_buf[0],
+        args.above_pred_hp_stride[0], args.left_pred_hp_buf[0],
+        args.left_pred_hp_stride[0]);
+#else
     av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
                                         args.above_pred_buf, dst_width1,
                                         dst_height1, args.above_pred_stride);
@@ -9332,6 +9394,7 @@
     calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0],
                               args.above_pred_stride[0], args.left_pred_buf[0],
                               args.left_pred_stride[0]);
+#endif
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
@@ -10299,12 +10362,17 @@
     }
 
     if (is_inter_mode(mbmi->mode)) {
-      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
       if (mbmi->motion_mode == OBMC_CAUSAL) {
-        av1_build_obmc_inter_prediction(
-            cm, xd, mi_row, mi_col, args.above_pred_buf, args.above_pred_stride,
-            args.left_pred_buf, args.left_pred_stride);
+        av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+      } else {
+        av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
       }
+#else
+      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+      if (mbmi->motion_mode == OBMC_CAUSAL)
+        av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+#endif
       av1_subtract_plane(x, bsize, 0);
       if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
         // av1_rd_pick_inter_mode_sb
@@ -10849,7 +10917,11 @@
 
 struct calc_target_weighted_pred_ctxt {
   const MACROBLOCK *x;
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+  const CONV_BUF_TYPE *tmp;
+#else
   const uint8_t *tmp;
+#endif
   int tmp_stride;
   int overlap;
 };
@@ -10863,16 +10935,21 @@
   struct calc_target_weighted_pred_ctxt *ctxt =
       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
 
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-
   const int bw = xd->n8_w << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
   int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE);
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+  const CONV_BUF_TYPE *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
+#else
   const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif
 
+#if !CONFIG_OBMC_HIGH_PREC_BLENDING
   if (!is_hbd) {
+#endif
     for (int row = 0; row < ctxt->overlap; ++row) {
       const uint8_t m0 = mask1d[row];
       const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
@@ -10884,6 +10961,7 @@
       mask += bw;
       tmp += ctxt->tmp_stride;
     }
+#if !CONFIG_OBMC_HIGH_PREC_BLENDING
   } else {
     const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
 
@@ -10899,6 +10977,7 @@
       tmp16 += ctxt->tmp_stride;
     }
   }
+#endif
 }
 
 static INLINE void calc_target_weighted_pred_left(
@@ -10910,16 +10989,22 @@
   struct calc_target_weighted_pred_ctxt *ctxt =
       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
 
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-
   const int bw = xd->n8_w << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
   int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+  const CONV_BUF_TYPE *tmp =
+      ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+#else
   const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif
 
+#if !CONFIG_OBMC_HIGH_PREC_BLENDING
   if (!is_hbd) {
+#endif
     for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
       for (int col = 0; col < ctxt->overlap; ++col) {
         const uint8_t m0 = mask1d[col];
@@ -10932,6 +11017,7 @@
       mask += bw;
       tmp += ctxt->tmp_stride;
     }
+#if !CONFIG_OBMC_HIGH_PREC_BLENDING
   } else {
     const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
 
@@ -10948,6 +11034,7 @@
       tmp16 += ctxt->tmp_stride;
     }
   }
+#endif
 }
 
 // This function has a structure similar to av1_build_obmc_inter_prediction
@@ -10988,19 +11075,37 @@
 //  error(x, y) =
 //    wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
 //
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+                                      const MACROBLOCKD *xd, int mi_row,
+                                      int mi_col, const CONV_BUF_TYPE *above,
+                                      int above_stride,
+                                      const CONV_BUF_TYPE *left,
+                                      int left_stride) {
+#else
 static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
                                       const MACROBLOCKD *xd, int mi_row,
                                       int mi_col, const uint8_t *above,
                                       int above_stride, const uint8_t *left,
                                       int left_stride) {
+#endif
   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const int bw = xd->n8_w << MI_SIZE_LOG2;
   const int bh = xd->n8_h << MI_SIZE_LOG2;
   int32_t *mask_buf = x->mask_buf;
   int32_t *wsrc_buf = x->wsrc_buf;
 
-  const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
   const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+  ConvolveParams conv_params =
+      get_conv_params_no_round(0, 0, 0, NULL, MAX_SB_SIZE, 1, xd->bd);
+  const int convolve_rounding_bits =
+      FILTER_BITS * 2 - conv_params.round_0 - conv_params.round_1;
+  const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA *
+                        (1 << convolve_rounding_bits);
+#else
+  const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+#endif
 
   // plane 0 should not be subsampled
   assert(xd->plane[0].subsampling_x == 0);
@@ -11041,7 +11146,12 @@
 
     for (int row = 0; row < bh; ++row) {
       for (int col = 0; col < bw; ++col) {
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+        wsrc_buf[col] = ROUND_POWER_OF_TWO_SIGNED(
+            src[col] * src_scale - wsrc_buf[col], convolve_rounding_bits);
+#else
         wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+#endif
       }
       wsrc_buf += bw;
       src += x->plane[0].src.stride;
@@ -11051,7 +11161,12 @@
 
     for (int row = 0; row < bh; ++row) {
       for (int col = 0; col < bw; ++col) {
+#if CONFIG_OBMC_HIGH_PREC_BLENDING
+        wsrc_buf[col] = ROUND_POWER_OF_TWO_SIGNED(
+            src[col] * src_scale - wsrc_buf[col], convolve_rounding_bits);
+#else
         wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+#endif
       }
       wsrc_buf += bw;
       src += x->plane[0].src.stride;
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 6eb7240..1de2529 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -152,3 +152,4 @@
 set(CONFIG_TMV 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_TXK_SEL 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_FILEOPTIONS 1 CACHE NUMBER "AV1 config option flag.")
+set(CONFIG_OBMC_HIGH_PREC_BLENDING 0 CACHE NUMBER "AV1 config option flag.")