Refactor iteration over neighbours for OBMC

There are six pieces of code in reconinter.c and two in rdopt.c which
iterate over the blocks along the top or left edge of the current
block for OBMC. Before this patch, each bit of code had its own
implementation of the iteration, which is reasonably finicky to get
right.

This patch factors out that logic into a pair of helpers
(foreach_overlappable_nb_above and foreach_overlappable_nb_left). The
functions take a "fun" parameter, which contains the loop body. Note
that the iteration is too complicated for us to be able to define a
macro that could be used like

  FOREACH_NB_ABOVE(rel_pos, nb_size, nb_mi) { ... }

While C's syntax doesn't seem to let you do that, once the compiler's
optimisation pass is done inlining everything, the results are
essentially the same.

The iteration logic is also slightly generalised: the old code checked
whether a block was shorter or narrower than 8 pixels by comparing a
block size with BLOCK_8X8. This doesn't work when you have a 4x16 or
16x4 block because e.g. BLOCK_16X4 is not less than BLOCK_8X8. This
generalisation is (unsurprisingly) needed in order to support 16x4
or 4x16 blocks.

This patch doesn't address the CONFIG_NCOBMC functions in reconinter.c
that do prediction from right and bottom edges.

This patch shouldn't affect the generated bitstream in any way: the
code is supposed to be equivalent.

Change-Id: I9e5a116b012c18645604a7d98fb98be99697d363
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 1b44434..560bfd4 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -118,6 +118,10 @@
 AV1_COMMON_SRCS-yes += common/cfl.c
 endif
 
+ifeq ($(CONFIG_MOTION_VAR),yes)
+AV1_COMMON_SRCS-yes += common/obmc.h
+endif
+
 ifeq ($(CONFIG_PVQ),yes)
 # PVQ from daala
 AV1_COMMON_SRCS-yes += common/pvq.c
diff --git a/av1/common/obmc.h b/av1/common/obmc.h
new file mode 100644
index 0000000..68d43c9
--- /dev/null
+++ b/av1/common/obmc.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_COMMON_OBMC_H_
+#define AV1_COMMON_OBMC_H_
+
+#if CONFIG_MOTION_VAR
+typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos,
+                                          uint8_t nb_mi_size, MODE_INFO *nb_mi,
+                                          void *fun_ctxt);
+
+static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
+                                                 MACROBLOCKD *xd, int mi_col,
+                                                 int nb_max,
+                                                 overlappable_nb_visitor_t fun,
+                                                 void *fun_ctxt) {
+  if (!xd->up_available) return;
+
+  int nb_count = 0;
+
+  // prev_row_mi points into the mi array, starting at the beginning of the
+  // previous row.
+  MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
+  const int end_col = AOMMIN(mi_col + xd->n8_w, cm->mi_cols);
+  uint8_t mi_step;
+  for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
+       above_mi_col += mi_step) {
+    MODE_INFO **above_mi = prev_row_mi + above_mi_col;
+    mi_step = mi_size_wide[above_mi[0]->mbmi.sb_type];
+#if CONFIG_CHROMA_SUB8X8
+    // If we're considering a block with width 4, it should be treated as
+    // half of a pair of blocks with chroma information in the second. Move
+    // above_mi_col back to the start of the pair if needed, set above_mbmi
+    // to point at the block with chroma information, and set mi_step to 2 to
+    // step over the entire pair at the end of the iteration.
+    if (mi_step == 1) {
+      above_mi_col &= ~1;
+      above_mi = prev_row_mi + above_mi_col + 1;
+      mi_step = 2;
+    }
+#endif  // CONFIG_CHROMA_SUB8X8
+    MB_MODE_INFO *above_mbmi = &above_mi[0]->mbmi;
+    if (is_neighbor_overlappable(above_mbmi)) {
+      ++nb_count;
+      fun(xd, above_mi_col - mi_col, AOMMIN(xd->n8_w, mi_step), *above_mi,
+          fun_ctxt);
+    }
+  }
+}
+
+static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
+                                                MACROBLOCKD *xd, int mi_row,
+                                                int nb_max,
+                                                overlappable_nb_visitor_t fun,
+                                                void *fun_ctxt) {
+  if (!xd->left_available) return;
+
+  int nb_count = 0;
+
+  // prev_col_mi points into the mi array, starting at the top of the
+  // previous column
+  MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
+  const int end_row = AOMMIN(mi_row + xd->n8_h, cm->mi_rows);
+  uint8_t mi_step;
+  for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
+       left_mi_row += mi_step) {
+    MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
+    mi_step = mi_size_high[left_mi[0]->mbmi.sb_type];
+#if CONFIG_CHROMA_SUB8X8
+    if (mi_step == 1) {
+      left_mi_row &= ~1;
+      left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
+      mi_step = 2;
+    }
+#endif  // CONFIG_CHROMA_SUB8X8
+    MB_MODE_INFO *left_mbmi = &left_mi[0]->mbmi;
+    if (is_neighbor_overlappable(left_mbmi)) {
+      ++nb_count;
+      fun(xd, left_mi_row - mi_row, AOMMIN(xd->n8_h, mi_step), *left_mi,
+          fun_ctxt);
+    }
+  }
+}
+
+#endif  // CONFIG_MOTION_VAR
+#endif  // AV1_COMMON_OBMC_H_
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 7cdaff2..ccc3dd2 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -11,6 +11,7 @@
 
 #include <assert.h>
 #include <stdio.h>
+#include <limits.h>
 
 #include "./aom_scale_rtcd.h"
 #include "./aom_dsp_rtcd.h"
@@ -24,6 +25,7 @@
 #include "av1/common/reconintra.h"
 #if CONFIG_MOTION_VAR
 #include "av1/common/onyxc_int.h"
+#include "av1/common/obmc.h"
 #endif  // CONFIG_MOTION_VAR
 
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -2104,62 +2106,29 @@
 }
 #endif  // CONFIG_NCOBMC
 
+static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_rc,
+                                     uint8_t mi_hw, MODE_INFO *mi,
+                                     void *fun_ctxt) {
+  (void)xd;
+  (void)rel_mi_rc;
+  (void)mi_hw;
+  (void)mi;
+  ++*(int *)fun_ctxt;
+}
+
 void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                       int mi_row, int mi_col) {
-  int i, mi_step;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
 
-  xd->mi[0]->mbmi.overlappable_neighbors[0] = 0;
-  xd->mi[0]->mbmi.overlappable_neighbors[1] = 0;
+  mbmi->overlappable_neighbors[0] = 0;
+  mbmi->overlappable_neighbors[1] = 0;
 
   if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return;
 
-  if (xd->up_available) {
-    const int ilimit = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
-    for (i = 0; i < ilimit; i += mi_step) {
-      int mi_row_offset = -1;
-      int mi_col_offset = i;
-      MODE_INFO *above_mi =
-          xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-      MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
-#if CONFIG_CHROMA_SUB8X8
-      if (above_mbmi->sb_type < BLOCK_8X8) {
-        ++mi_col_offset;
-        above_mbmi =
-            &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
-      }
-#endif
-      BLOCK_SIZE above_bsize = AOMMAX(above_mbmi->sb_type, BLOCK_8X8);
-      mi_step = AOMMIN(xd->n8_w, mi_size_wide[above_bsize]);
-
-      if (is_neighbor_overlappable(above_mbmi))
-        xd->mi[0]->mbmi.overlappable_neighbors[0]++;
-    }
-  }
-
-  if (xd->left_available) {
-    const int ilimit = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
-    for (i = 0; i < ilimit; i += mi_step) {
-      int mi_row_offset = i;
-      int mi_col_offset = -1;
-      MODE_INFO *left_mi =
-          xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-      MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
-
-#if CONFIG_CHROMA_SUB8X8
-      if (left_mbmi->sb_type < BLOCK_8X8) {
-        ++mi_row_offset;
-        left_mbmi =
-            &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
-      }
-#endif
-      BLOCK_SIZE left_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8);
-      mi_step = AOMMIN(xd->n8_h, mi_size_high[left_bsize]);
-
-      if (is_neighbor_overlappable(left_mbmi))
-        xd->mi[0]->mbmi.overlappable_neighbors[1]++;
-    }
-  }
+  foreach_overlappable_nb_above(cm, xd, mi_col, INT_MAX, increment_int_ptr,
+                                &mbmi->overlappable_neighbors[0]);
+  foreach_overlappable_nb_left(cm, xd, mi_row, INT_MAX, increment_int_ptr,
+                               &mbmi->overlappable_neighbors[1]);
 }
 
 // HW does not support < 4x4 prediction. To limit the bandwidth requirement, for
@@ -2191,7 +2160,88 @@
   }
 }
 
-// This function combines motion compensated predictions that is generated by
+struct obmc_inter_pred_ctxt {
+  uint8_t **adjacent;
+  int *adjacent_stride;
+};
+
+static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
+                                               uint8_t above_mi_width,
+                                               MODE_INFO *above_mi,
+                                               void *fun_ctxt) {
+  (void)above_mi;
+  struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+#if CONFIG_HIGHBITDEPTH
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif  // CONFIG_HIGHBITDEPTH
+  const int overlap =
+      AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *pd = &xd->plane[plane];
+    const int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+    const int bh = overlap >> pd->subsampling_y;
+    const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x;
+
+    if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+
+    const int dst_stride = pd->dst.stride;
+    uint8_t *const dst = &pd->dst.buf[plane_col];
+    const int tmp_stride = ctxt->adjacent_stride[plane];
+    const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col];
+    const uint8_t *const mask = av1_get_obmc_mask(bh);
+
+#if CONFIG_HIGHBITDEPTH
+    if (is_hbd)
+      aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
+                                 tmp_stride, mask, bh, bw, xd->bd);
+    else
+#endif  // CONFIG_HIGHBITDEPTH
+      aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
+                          mask, bh, bw);
+  }
+}
+
+static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
+                                              uint8_t left_mi_height,
+                                              MODE_INFO *left_mi,
+                                              void *fun_ctxt) {
+  (void)left_mi;
+  struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const int overlap =
+      AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+#if CONFIG_HIGHBITDEPTH
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif  // CONFIG_HIGHBITDEPTH
+
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *pd = &xd->plane[plane];
+    const int bw = overlap >> pd->subsampling_x;
+    const int bh = (left_mi_height * MI_SIZE) >> pd->subsampling_y;
+    const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y;
+
+    if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+
+    const int dst_stride = pd->dst.stride;
+    uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride];
+    const int tmp_stride = ctxt->adjacent_stride[plane];
+    const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride];
+    const uint8_t *const mask = av1_get_obmc_mask(bw);
+
+#if CONFIG_HIGHBITDEPTH
+    if (is_hbd)
+      aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
+                                 tmp_stride, mask, bh, bw, xd->bd);
+    else
+#endif  // CONFIG_HIGHBITDEPTH
+      aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
+                          mask, bh, bw);
+  }
+}
+
+// This function combines motion compensated predictions that are generated by
 // top/left neighboring blocks' inter predictors with the regular inter
 // prediction. We assume the original prediction (bmc) is stored in
 // xd->plane[].dst.buf
@@ -2202,131 +2252,18 @@
                                      uint8_t *left[MAX_MB_PLANE],
                                      int left_stride[MAX_MB_PLANE]) {
   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int plane, i;
-#if CONFIG_HIGHBITDEPTH
-  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-#endif  // CONFIG_HIGHBITDEPTH
 
   // handle above row
-  if (xd->up_available) {
-    const int overlap =
-        AOMMIN(block_size_high[bsize] >> 1, block_size_high[BLOCK_64X64] >> 1);
-    const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
-    const int mi_row_offset = -1;
-    const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
-    int neighbor_count = 0;
-
-    assert(miw > 0);
-
-    i = 0;
-    do {  // for each mi in the above row
-      int mi_col_offset = i;
-      MB_MODE_INFO *above_mbmi =
-          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
-#if CONFIG_CHROMA_SUB8X8
-      if (above_mbmi->sb_type < BLOCK_8X8) {
-        ++mi_col_offset;
-        above_mbmi =
-            &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
-      }
-#endif
-
-      const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
-      const int above_step =
-          AOMMIN(mi_size_wide[a_bsize], mi_size_wide[BLOCK_64X64]);
-      const int mi_step = AOMMIN(xd->n8_w, above_step);
-
-      if (is_neighbor_overlappable(above_mbmi)) {
-        neighbor_count++;
-        if (neighbor_count > neighbor_limit) break;
-        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-          const struct macroblockd_plane *pd = &xd->plane[plane];
-          const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
-          const int bh = overlap >> pd->subsampling_y;
-
-          if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
-
-          const int dst_stride = pd->dst.stride;
-          uint8_t *const dst = &pd->dst.buf[(i * MI_SIZE) >> pd->subsampling_x];
-          const int tmp_stride = above_stride[plane];
-          const uint8_t *const tmp =
-              &above[plane][(i * MI_SIZE) >> pd->subsampling_x];
-          const uint8_t *const mask = av1_get_obmc_mask(bh);
-
-#if CONFIG_HIGHBITDEPTH
-          if (is_hbd)
-            aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
-                                       tmp_stride, mask, bh, bw, xd->bd);
-          else
-#endif  // CONFIG_HIGHBITDEPTH
-            aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
-                                tmp_stride, mask, bh, bw);
-        }
-      }
-      i += mi_step;
-    } while (i < miw);
-  }
+  struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride };
+  foreach_overlappable_nb_above(cm, xd, mi_col,
+                                max_neighbor_obmc[b_width_log2_lookup[bsize]],
+                                build_obmc_inter_pred_above, &ctxt_above);
 
   // handle left column
-  if (xd->left_available) {
-    const int overlap =
-        AOMMIN(block_size_wide[bsize] >> 1, block_size_wide[BLOCK_64X64] >> 1);
-    const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
-    const int mi_col_offset = -1;
-    const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
-    int neighbor_count = 0;
-
-    assert(mih > 0);
-
-    i = 0;
-    do {  // for each mi in the left column
-      int mi_row_offset = i;
-      MB_MODE_INFO *left_mbmi =
-          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
-#if CONFIG_CHROMA_SUB8X8
-      if (left_mbmi->sb_type < BLOCK_8X8) {
-        ++mi_row_offset;
-        left_mbmi =
-            &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
-      }
-#endif
-
-      const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
-      const int left_step =
-          AOMMIN(mi_size_high[l_bsize], mi_size_high[BLOCK_64X64]);
-      const int mi_step = AOMMIN(xd->n8_h, left_step);
-
-      if (is_neighbor_overlappable(left_mbmi)) {
-        neighbor_count++;
-        if (neighbor_count > neighbor_limit) break;
-        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-          const struct macroblockd_plane *pd = &xd->plane[plane];
-          const int bw = overlap >> pd->subsampling_x;
-          const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
-
-          if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
-
-          const int dst_stride = pd->dst.stride;
-          uint8_t *const dst =
-              &pd->dst.buf[(i * MI_SIZE * dst_stride) >> pd->subsampling_y];
-          const int tmp_stride = left_stride[plane];
-          const uint8_t *const tmp =
-              &left[plane][(i * MI_SIZE * tmp_stride) >> pd->subsampling_y];
-          const uint8_t *const mask = av1_get_obmc_mask(bw);
-
-#if CONFIG_HIGHBITDEPTH
-          if (is_hbd)
-            aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
-                                       tmp_stride, mask, bh, bw, xd->bd);
-          else
-#endif  // CONFIG_HIGHBITDEPTH
-            aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
-                                tmp_stride, mask, bh, bw);
-        }
-      }
-      i += mi_step;
-    } while (i < mih);
-  }
+  struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride };
+  foreach_overlappable_nb_left(cm, xd, mi_row,
+                               max_neighbor_obmc[b_height_log2_lookup[bsize]],
+                               build_obmc_inter_pred_left, &ctxt_left);
 }
 
 void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
@@ -2351,109 +2288,181 @@
   return;
 }
 
+struct build_prediction_ctxt {
+  const AV1_COMMON *cm;
+  int mi_row;
+  int mi_col;
+  uint8_t **tmp_buf;
+  int *tmp_width;
+  int *tmp_height;
+  int *tmp_stride;
+  int mb_to_far_edge;
+};
+
+static INLINE void build_prediction_by_above_pred(MACROBLOCKD *xd,
+                                                  int rel_mi_col,
+                                                  uint8_t above_mi_width,
+                                                  MODE_INFO *above_mi,
+                                                  void *fun_ctxt) {
+  MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
+  const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
+  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+  const int above_mi_col = ctxt->mi_col + rel_mi_col;
+
+  MB_MODE_INFO backup_mbmi = *above_mbmi;
+  modify_neighbor_predictor_for_obmc(above_mbmi);
+
+  for (int j = 0; j < MAX_MB_PLANE; ++j) {
+    struct macroblockd_plane *const pd = &xd->plane[j];
+    setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
+                     ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col,
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+  const int num_refs = 1 + is_inter_anyref_comp_mode(above_mbmi->mode);
+#else
+  const int num_refs = 1 + has_second_ref(above_mbmi);
+#endif
+
+  for (int ref = 0; ref < num_refs; ++ref) {
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+    const MV_REFERENCE_FRAME frame = has_second_ref(above_mbmi)
+                                         ? above_mbmi->ref_frame[ref]
+                                         : above_mbmi->ref_frame[0];
+#else
+    const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
+#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+
+    const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
+
+    xd->block_refs[ref] = ref_buf;
+    if ((!av1_is_valid_scale(&ref_buf->sf)))
+      aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Reference frame has invalid dimensions");
+    av1_setup_pre_planes(xd, ref, ref_buf->buf, ctxt->mi_row, above_mi_col,
+                         &ref_buf->sf);
+  }
+
+  xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
+  xd->mb_to_right_edge = ctxt->mb_to_far_edge +
+                         (xd->n8_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
+
+  int mi_x = above_mi_col << MI_SIZE_LOG2;
+  int mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+
+  for (int j = 0; j < MAX_MB_PLANE; ++j) {
+    const struct macroblockd_plane *pd = &xd->plane[j];
+    int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+    int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+                   block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+
+    if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+    build_inter_predictors(ctxt->cm, xd, j, above_mi, 1, 0, bw, bh, 0, 0, bw,
+                           bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                           0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                           mi_x, mi_y);
+  }
+  *above_mbmi = backup_mbmi;
+}
+
 void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                          int mi_row, int mi_col,
                                          uint8_t *tmp_buf[MAX_MB_PLANE],
                                          int tmp_width[MAX_MB_PLANE],
                                          int tmp_height[MAX_MB_PLANE],
                                          int tmp_stride[MAX_MB_PLANE]) {
-  const TileInfo *const tile = &xd->tile;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int i, j, mi_step, ref;
-  const int ilimit = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
-  int mb_to_right_edge_base = xd->mb_to_right_edge;
-  const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
-  int neighbor_count = 0;
-
-  if (mi_row <= tile->mi_row_start) return;
+  if (mi_row <= xd->tile.mi_row_start) return;
 
   xd->mb_to_bottom_edge += xd->n8_h / 2 * MI_SIZE * 8;
-  for (i = 0; i < ilimit; i += mi_step) {
-    int mi_row_offset = -1;
-    int mi_col_offset = i;
-    int mi_x, mi_y, bw, bh;
-    MODE_INFO *above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
 
-#if CONFIG_CHROMA_SUB8X8
-    if (above_mbmi->sb_type < BLOCK_8X8) {
-      ++mi_col_offset;
-      above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-      above_mbmi = &above_mi->mbmi;
-    }
+  struct build_prediction_ctxt ctxt = { cm,         mi_row,
+                                        mi_col,     tmp_buf,
+                                        tmp_width,  tmp_height,
+                                        tmp_stride, xd->mb_to_right_edge };
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  foreach_overlappable_nb_above(cm, xd, mi_col,
+                                max_neighbor_obmc[b_width_log2_lookup[bsize]],
+                                build_prediction_by_above_pred, &ctxt);
+
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ctxt.mb_to_far_edge;
+  xd->mb_to_bottom_edge -= xd->n8_h / 2 * MI_SIZE * 8;
+}
+
+static INLINE void build_prediction_by_left_pred(MACROBLOCKD *xd,
+                                                 int rel_mi_row,
+                                                 uint8_t left_mi_height,
+                                                 MODE_INFO *left_mi,
+                                                 void *fun_ctxt) {
+  MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
+  const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
+  struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+  const int left_mi_row = ctxt->mi_row + rel_mi_row;
+
+  MB_MODE_INFO backup_mbmi = *left_mbmi;
+  modify_neighbor_predictor_for_obmc(left_mbmi);
+
+  for (int j = 0; j < MAX_MB_PLANE; ++j) {
+    struct macroblockd_plane *const pd = &xd->plane[j];
+    setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
+                     ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0,
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+  const int num_refs = 1 + is_inter_anyref_comp_mode(left_mbmi->mode);
+#else
+  const int num_refs = 1 + has_second_ref(left_mbmi);
 #endif
 
-    const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
-    MB_MODE_INFO backup_mbmi;
-
-    const int above_step =
-        AOMMIN(mi_size_wide[a_bsize], mi_size_wide[BLOCK_64X64]);
-    mi_step = AOMMIN(xd->n8_w, above_step);
-
-    if (!is_neighbor_overlappable(above_mbmi)) continue;
-
-    neighbor_count++;
-    if (neighbor_count > neighbor_limit) break;
-
-    backup_mbmi = *above_mbmi;
-    modify_neighbor_predictor_for_obmc(above_mbmi);
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst, a_bsize, tmp_buf[j], tmp_width[j],
-                       tmp_height[j], tmp_stride[j], 0, i, NULL,
-                       pd->subsampling_x, pd->subsampling_y);
-    }
+  for (int ref = 0; ref < num_refs; ++ref) {
 #if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-    for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(above_mbmi->mode));
-         ++ref)
-#else   // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF)
-    for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref)
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-    {
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-      const MV_REFERENCE_FRAME frame = has_second_ref(above_mbmi)
-                                           ? above_mbmi->ref_frame[ref]
-                                           : above_mbmi->ref_frame[0];
+    const MV_REFERENCE_FRAME frame = has_second_ref(left_mbmi)
+                                         ? left_mbmi->ref_frame[ref]
+                                         : left_mbmi->ref_frame[0];
 #else
-      const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
+    const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
 #endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
 
-      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+    const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
 
-      xd->block_refs[ref] = ref_buf;
-      if ((!av1_is_valid_scale(&ref_buf->sf)))
-        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col + i,
-                           &ref_buf->sf);
-    }
-
-    xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8);
-    xd->mb_to_right_edge =
-        mb_to_right_edge_base + (xd->n8_w - i - mi_step) * MI_SIZE * 8;
-    mi_x = (mi_col + i) << MI_SIZE_LOG2;
-    mi_y = mi_row << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-      bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
-      bh = AOMMAX((num_4x4_blocks_high_lookup[bsize] * 2) >> pd->subsampling_y,
-                  4);
-      bh = AOMMIN(bh, block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
-
-      if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
-      build_inter_predictors(cm, xd, j, above_mi, 1, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX && CONFIG_EXT_INTER
-                             0, 0,
-#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
-                             mi_x, mi_y);
-    }
-    *above_mbmi = backup_mbmi;
+    xd->block_refs[ref] = ref_buf;
+    if ((!av1_is_valid_scale(&ref_buf->sf)))
+      aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Reference frame has invalid dimensions");
+    av1_setup_pre_planes(xd, ref, ref_buf->buf, left_mi_row, ctxt->mi_col,
+                         &ref_buf->sf);
   }
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = mb_to_right_edge_base;
-  xd->mb_to_bottom_edge -= xd->n8_h / 2 * MI_SIZE * 8;
+
+  xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
+  xd->mb_to_bottom_edge =
+      ctxt->mb_to_far_edge +
+      (xd->n8_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
+
+  int mi_x = ctxt->mi_col << MI_SIZE_LOG2;
+  int mi_y = left_mi_row << MI_SIZE_LOG2;
+
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+
+  for (int j = 0; j < MAX_MB_PLANE; ++j) {
+    const struct macroblockd_plane *pd = &xd->plane[j];
+    int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+                   block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+    int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+    if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+    build_inter_predictors(ctxt->cm, xd, j, left_mi, 1, 0, bw, bh, 0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                           0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                           mi_x, mi_y);
+  }
+  *left_mbmi = backup_mbmi;
 }
 
 void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -2462,100 +2471,22 @@
                                         int tmp_width[MAX_MB_PLANE],
                                         int tmp_height[MAX_MB_PLANE],
                                         int tmp_stride[MAX_MB_PLANE]) {
-  const TileInfo *const tile = &xd->tile;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int i, j, mi_step, ref;
-  const int ilimit = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
-  int mb_to_bottom_edge_base = xd->mb_to_bottom_edge;
-  const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
-  int neighbor_count = 0;
-
-  if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start)) return;
+  if (mi_col <= xd->tile.mi_col_start) return;
 
   xd->mb_to_right_edge += xd->n8_w / 2 * MI_SIZE * 8;
-  for (i = 0; i < ilimit; i += mi_step) {
-    int mi_row_offset = i;
-    int mi_col_offset = -1;
-    int mi_x, mi_y, bw, bh;
-    MODE_INFO *left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
 
-#if CONFIG_CHROMA_SUB8X8
-    if (left_mbmi->sb_type < BLOCK_8X8) {
-      ++mi_row_offset;
-      left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
-      left_mbmi = &left_mi->mbmi;
-    }
-#endif
+  struct build_prediction_ctxt ctxt = { cm,         mi_row,
+                                        mi_col,     tmp_buf,
+                                        tmp_width,  tmp_height,
+                                        tmp_stride, xd->mb_to_bottom_edge };
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  foreach_overlappable_nb_left(cm, xd, mi_row,
+                               max_neighbor_obmc[b_height_log2_lookup[bsize]],
+                               build_prediction_by_left_pred, &ctxt);
 
-    const BLOCK_SIZE l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8);
-    MB_MODE_INFO backup_mbmi;
-    const int left_step =
-        AOMMIN(mi_size_high[l_bsize], mi_size_high[BLOCK_64X64]);
-    mi_step = AOMMIN(xd->n8_h, left_step);
-
-    if (!is_neighbor_overlappable(left_mbmi)) continue;
-
-    neighbor_count++;
-    if (neighbor_count > neighbor_limit) break;
-
-    backup_mbmi = *left_mbmi;
-    modify_neighbor_predictor_for_obmc(left_mbmi);
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst, l_bsize, tmp_buf[j], tmp_width[j],
-                       tmp_height[j], tmp_stride[j], i, 0, NULL,
-                       pd->subsampling_x, pd->subsampling_y);
-    }
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-    for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(left_mbmi->mode)); ++ref)
-#else   // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF)
-    for (ref = 0; ref < 1 + has_second_ref(left_mbmi); ++ref)
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-    {
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-      const MV_REFERENCE_FRAME frame = has_second_ref(left_mbmi)
-                                           ? left_mbmi->ref_frame[ref]
-                                           : left_mbmi->ref_frame[0];
-#else
-      const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-      const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-      xd->block_refs[ref] = ref_buf;
-      if ((!av1_is_valid_scale(&ref_buf->sf)))
-        aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, mi_col,
-                           &ref_buf->sf);
-    }
-
-    xd->mb_to_top_edge = -(((mi_row + i) * MI_SIZE) * 8);
-    xd->mb_to_bottom_edge =
-        mb_to_bottom_edge_base + (xd->n8_h - i - mi_step) * MI_SIZE * 8;
-    mi_x = mi_col << MI_SIZE_LOG2;
-    mi_y = (mi_row + i) << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-      bw = AOMMAX((num_4x4_blocks_wide_lookup[bsize] * 2) >> pd->subsampling_x,
-                  4);
-      bw = AOMMIN(bw, block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
-      bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-      if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
-      build_inter_predictors(cm, xd, j, left_mi, 1, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX && CONFIG_EXT_INTER
-                             0, 0,
-#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
-                             mi_x, mi_y);
-    }
-    *left_mbmi = backup_mbmi;
-  }
   xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
   xd->mb_to_right_edge -= xd->n8_w / 2 * MI_SIZE * 8;
+  xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
 }
 
 void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 272ac57..b63c24c 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -27,6 +27,7 @@
 #include "av1/common/entropymode.h"
 #include "av1/common/idct.h"
 #include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
 #include "av1/common/pred_common.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"
@@ -12286,6 +12287,124 @@
 }
 
 #if CONFIG_MOTION_VAR
+// State handed to the calc_target_weighted_pred_{above,left} callbacks.
+struct calc_target_weighted_pred_ctxt {
+  const MACROBLOCK *x;  // owner of the wsrc_buf / mask_buf being filled
+  const uint8_t *tmp;   // above or left source buffer supplied by the caller
+  int tmp_stride;       // stride of tmp, counted in samples
+  int overlap;          // rows (above) or columns (left) to process
+};
+// OBMC callback for one above neighbour: initialises wsrc/mask in its columns.
+static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd,
+                                                   int rel_mi_col,
+                                                   uint8_t nb_mi_width,
+                                                   MODE_INFO *nb_mi,
+                                                   void *fun_ctxt) {
+  (void)nb_mi;  // neighbour mode info is not needed here
+  // rel_mi_col / nb_mi_width: neighbour offset and width, in MI units.
+  struct calc_target_weighted_pred_ctxt *ctxt =
+      (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+  // High-bitdepth input is read as uint16_t via CONVERT_TO_SHORTPTR below.
+#if CONFIG_HIGHBITDEPTH
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#else
+  const int is_hbd = 0;
+#endif  // CONFIG_HIGHBITDEPTH
+  // bw doubles as the row stride of wsrc_buf and mask_buf.
+  const int bw = xd->n8_w << MI_SIZE_LOG2;
+  const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+  // Start at this neighbour's first column in each buffer.
+  int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
+  int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE);
+  const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
+  // Per row: mask = m0 and wsrc = (MAX_ALPHA - m0) * source sample.
+  if (!is_hbd) {
+    for (int row = 0; row < ctxt->overlap; ++row) {
+      const uint8_t m0 = mask1d[row];  // one weight per overlap row
+      const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+      for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
+        wsrc[col] = m1 * tmp[col];
+        mask[col] = m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp += ctxt->tmp_stride;
+    }
+#if CONFIG_HIGHBITDEPTH
+  } else {
+    const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+    for (int row = 0; row < ctxt->overlap; ++row) {
+      const uint8_t m0 = mask1d[row];  // one weight per overlap row
+      const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+      for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
+        wsrc[col] = m1 * tmp16[col];
+        mask[col] = m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp16 += ctxt->tmp_stride;
+    }
+#endif  // CONFIG_HIGHBITDEPTH
+  }
+}
+// OBMC callback for one left neighbour: blends its rows into wsrc and mask.
+static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd,
+                                                  int rel_mi_row,
+                                                  uint8_t nb_mi_height,
+                                                  MODE_INFO *nb_mi,
+                                                  void *fun_ctxt) {
+  (void)nb_mi;  // neighbour mode info is not needed here
+  // rel_mi_row / nb_mi_height: neighbour offset and height, in MI units.
+  struct calc_target_weighted_pred_ctxt *ctxt =
+      (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+  // High-bitdepth input is read as uint16_t via CONVERT_TO_SHORTPTR below.
+#if CONFIG_HIGHBITDEPTH
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#else
+  const int is_hbd = 0;
+#endif  // CONFIG_HIGHBITDEPTH
+  // bw doubles as the row stride of wsrc_buf and mask_buf.
+  const int bw = xd->n8_w << MI_SIZE_LOG2;
+  const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+  // Start at this neighbour's first row in each buffer.
+  int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
+  int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
+  const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+  // Rescale existing wsrc/mask by m0 and mix tmp in with m1 = MAX_ALPHA - m0.
+  if (!is_hbd) {
+    for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
+      for (int col = 0; col < ctxt->overlap; ++col) {
+        const uint8_t m0 = mask1d[col];  // one weight per overlap column
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                    (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp += ctxt->tmp_stride;
+    }
+#if CONFIG_HIGHBITDEPTH
+  } else {
+    const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+    for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
+      for (int col = 0; col < ctxt->overlap; ++col) {
+        const uint8_t m0 = mask1d[col];  // one weight per overlap column
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                    (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp16 += ctxt->tmp_stride;
+    }
+#endif  // CONFIG_HIGHBITDEPTH
+  }
+}
+
 // This function has a structure similar to av1_build_obmc_inter_prediction
 //
 // The OBMC predictor is computed as:
@@ -12330,13 +12449,11 @@
                                       int above_stride, const uint8_t *left,
                                       int left_stride) {
   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int row, col, i;
   const int bw = xd->n8_w << MI_SIZE_LOG2;
   const int bh = xd->n8_h << MI_SIZE_LOG2;
   int32_t *mask_buf = x->mask_buf;
   int32_t *wsrc_buf = x->wsrc_buf;
-  const int wsrc_stride = bw;
-  const int mask_stride = bw;
+
   const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
 #if CONFIG_HIGHBITDEPTH
   const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
@@ -12349,86 +12466,20 @@
   assert(xd->plane[0].subsampling_y == 0);
 
   av1_zero_array(wsrc_buf, bw * bh);
-  for (i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
+  for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
 
   // handle above row
   if (xd->up_available) {
     const int overlap =
-        AOMMIN(block_size_high[bsize] >> 1, block_size_high[BLOCK_64X64] >> 1);
-    const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
-    const int mi_row_offset = -1;
-    const uint8_t *const mask1d = av1_get_obmc_mask(overlap);
-    const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
-    int neighbor_count = 0;
-
-    assert(miw > 0);
-
-    i = 0;
-    do {  // for each mi in the above row
-      const int mi_col_offset = i;
-      const MB_MODE_INFO *above_mbmi =
-          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
-#if CONFIG_CHROMA_SUB8X8
-      if (above_mbmi->sb_type < BLOCK_8X8)
-        above_mbmi =
-            &xd->mi[mi_col_offset + 1 + mi_row_offset * xd->mi_stride]->mbmi;
-#endif
-      const BLOCK_SIZE a_bsize = AOMMAX(above_mbmi->sb_type, BLOCK_8X8);
-      const int above_step =
-          AOMMIN(mi_size_wide[a_bsize], mi_size_wide[BLOCK_64X64]);
-      const int mi_step = AOMMIN(xd->n8_w, above_step);
-      const int neighbor_bw = mi_step * MI_SIZE;
-
-      if (is_neighbor_overlappable(above_mbmi)) {
-        if (!CONFIG_CB4X4 && (a_bsize == BLOCK_4X4 || a_bsize == BLOCK_4X8))
-          neighbor_count += 2;
-        else
-          neighbor_count++;
-        if (neighbor_count > neighbor_limit) break;
-
-        const int tmp_stride = above_stride;
-        int32_t *wsrc = wsrc_buf + (i * MI_SIZE);
-        int32_t *mask = mask_buf + (i * MI_SIZE);
-
-        if (!is_hbd) {
-          const uint8_t *tmp = above;
-
-          for (row = 0; row < overlap; ++row) {
-            const uint8_t m0 = mask1d[row];
-            const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-            for (col = 0; col < neighbor_bw; ++col) {
-              wsrc[col] = m1 * tmp[col];
-              mask[col] = m0;
-            }
-            wsrc += wsrc_stride;
-            mask += mask_stride;
-            tmp += tmp_stride;
-          }
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          const uint16_t *tmp = CONVERT_TO_SHORTPTR(above);
-
-          for (row = 0; row < overlap; ++row) {
-            const uint8_t m0 = mask1d[row];
-            const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-            for (col = 0; col < neighbor_bw; ++col) {
-              wsrc[col] = m1 * tmp[col];
-              mask[col] = m0;
-            }
-            wsrc += wsrc_stride;
-            mask += mask_stride;
-            tmp += tmp_stride;
-          }
-#endif  // CONFIG_HIGHBITDEPTH
-        }
-      }
-
-      above += neighbor_bw;
-      i += mi_step;
-    } while (i < miw);
+        AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
+                                                   overlap };
+    foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, mi_col,
+                                  max_neighbor_obmc[b_width_log2_lookup[bsize]],
+                                  calc_target_weighted_pred_above, &ctxt);
   }
 
-  for (i = 0; i < bw * bh; ++i) {
+  for (int i = 0; i < bw * bh; ++i) {
     wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
     mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
   }
@@ -12436,102 +12487,33 @@
   // handle left column
   if (xd->left_available) {
     const int overlap =
-        AOMMIN(block_size_wide[bsize] >> 1, block_size_wide[BLOCK_64X64] >> 1);
-    const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
-    const int mi_col_offset = -1;
-    const uint8_t *const mask1d = av1_get_obmc_mask(overlap);
-    const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
-    int neighbor_count = 0;
-
-    assert(mih > 0);
-
-    i = 0;
-    do {  // for each mi in the left column
-      const int mi_row_offset = i;
-      MB_MODE_INFO *left_mbmi =
-          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
-
-#if CONFIG_CHROMA_SUB8X8
-      if (left_mbmi->sb_type < BLOCK_8X8)
-        left_mbmi =
-            &xd->mi[mi_col_offset + (mi_row_offset + 1) * xd->mi_stride]->mbmi;
-#endif
-      const BLOCK_SIZE l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8);
-      const int left_step =
-          AOMMIN(mi_size_high[l_bsize], mi_size_high[BLOCK_64X64]);
-      const int mi_step = AOMMIN(xd->n8_h, left_step);
-      const int neighbor_bh = mi_step * MI_SIZE;
-
-      if (is_neighbor_overlappable(left_mbmi)) {
-        if (!CONFIG_CB4X4 && (l_bsize == BLOCK_4X4 || l_bsize == BLOCK_8X4))
-          neighbor_count += 2;
-        else
-          neighbor_count++;
-        if (neighbor_count > neighbor_limit) break;
-
-        const int tmp_stride = left_stride;
-        int32_t *wsrc = wsrc_buf + (i * MI_SIZE * wsrc_stride);
-        int32_t *mask = mask_buf + (i * MI_SIZE * mask_stride);
-
-        if (!is_hbd) {
-          const uint8_t *tmp = left;
-
-          for (row = 0; row < neighbor_bh; ++row) {
-            for (col = 0; col < overlap; ++col) {
-              const uint8_t m0 = mask1d[col];
-              const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-              wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
-                          (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
-              mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
-            }
-            wsrc += wsrc_stride;
-            mask += mask_stride;
-            tmp += tmp_stride;
-          }
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          const uint16_t *tmp = CONVERT_TO_SHORTPTR(left);
-
-          for (row = 0; row < neighbor_bh; ++row) {
-            for (col = 0; col < overlap; ++col) {
-              const uint8_t m0 = mask1d[col];
-              const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-              wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
-                          (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
-              mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
-            }
-            wsrc += wsrc_stride;
-            mask += mask_stride;
-            tmp += tmp_stride;
-          }
-#endif  // CONFIG_HIGHBITDEPTH
-        }
-      }
-
-      left += neighbor_bh * left_stride;
-      i += mi_step;
-    } while (i < mih);
+        AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
+                                                   overlap };
+    foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, mi_row,
+                                 max_neighbor_obmc[b_height_log2_lookup[bsize]],
+                                 calc_target_weighted_pred_left, &ctxt);
   }
 
   if (!is_hbd) {
     const uint8_t *src = x->plane[0].src.buf;
 
-    for (row = 0; row < bh; ++row) {
-      for (col = 0; col < bw; ++col) {
+    for (int row = 0; row < bh; ++row) {
+      for (int col = 0; col < bw; ++col) {
         wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
       }
-      wsrc_buf += wsrc_stride;
+      wsrc_buf += bw;
       src += x->plane[0].src.stride;
     }
 #if CONFIG_HIGHBITDEPTH
   } else {
     const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
 
-    for (row = 0; row < bh; ++row) {
-      for (col = 0; col < bw; ++col) {
+    for (int row = 0; row < bh; ++row) {
+      for (int col = 0; col < bw; ++col) {
         wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
       }
-      wsrc_buf += wsrc_stride;
+      wsrc_buf += bw;
       src += x->plane[0].src.stride;
     }
 #endif  // CONFIG_HIGHBITDEPTH