Cherry-pick CWG-F049 and DAMR LF_SUB_PU for chroma

Fix DAMR chroma subblock logic in CONFIG_LF_SUB_PU (issue #623).

Fix TIP subblock logic in 4:2:2 format (issue #423).

Performance impact is at noise level.

STATS_CHANGED
diff --git a/aom_dsp/x86/aom_convolve_copy_avx2.c b/aom_dsp/x86/aom_convolve_copy_avx2.c
index 1306163..28bae6e 100644
--- a/aom_dsp/x86/aom_convolve_copy_avx2.c
+++ b/aom_dsp/x86/aom_convolve_copy_avx2.c
@@ -250,6 +250,24 @@
       h -= 2;
     } while (h);
 #endif  // CONFIG_SUBBLK_REF_EXT
+  } else if (w == 24) {
+    do {
+      __m256i s[2];
+      __m128i s_rem[2];
+      s[0] = _mm256_loadu_si256((__m256i *)src);
+      s_rem[0] = _mm_loadu_si128((__m128i *)(src + 16));
+      src += src_stride;
+      s[1] = _mm256_loadu_si256((__m256i *)src);
+      s_rem[1] = _mm_loadu_si128((__m128i *)(src + 16));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[0]);
+      _mm_storeu_si128((__m128i *)(dst + 16), s_rem[0]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[1]);
+      _mm_storeu_si128((__m128i *)(dst + 16), s_rem[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
   } else if (w == 32) {
     do {
       __m256i s[4];
diff --git a/aom_dsp/x86/aom_convolve_copy_sse2.c b/aom_dsp/x86/aom_convolve_copy_sse2.c
index bd32344..cf6fb21 100644
--- a/aom_dsp/x86/aom_convolve_copy_sse2.c
+++ b/aom_dsp/x86/aom_convolve_copy_sse2.c
@@ -303,6 +303,27 @@
       h -= 2;
     } while (h);
 #endif  // CONFIG_SUBBLK_REF_EXT
+  } else if (w == 24) {
+    do {
+      __m128i s[6];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+      src += src_stride;
+      s[3] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[4] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      s[5] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+      src += src_stride;
+      _mm_storeu_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_storeu_si128((__m128i *)(dst + 1 * 8), s[1]);
+      _mm_storeu_si128((__m128i *)(dst + 2 * 8), s[2]);
+      dst += dst_stride;
+      _mm_storeu_si128((__m128i *)(dst + 0 * 8), s[3]);
+      _mm_storeu_si128((__m128i *)(dst + 1 * 8), s[4]);
+      _mm_storeu_si128((__m128i *)(dst + 2 * 8), s[5]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
   } else if (w == 32) {
     do {
       __m128i s[8];
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 02dabe5..5ced281 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -358,6 +358,7 @@
   AOM_AV1_COMMON_INTRIN_AVX2
   "${AOM_ROOT}/av1/common/cdef_block_avx2.c"
   "${AOM_ROOT}/av1/common/x86/affine_optflow_refine_avx2.c"
+  "${AOM_ROOT}/av1/common/x86/bawp_avx2.c"
   "${AOM_ROOT}/av1/common/x86/cfl_avx2.c"
   "${AOM_ROOT}/av1/common/x86/highbd_ccso_avx2.c"
   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c"
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index f38011e..6d05236 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -529,13 +529,12 @@
 #if CONFIG_LF_SUB_PU
 // Check whether current block is TIP mode
 static AOM_INLINE void check_tip_edge(const MB_MODE_INFO *const mbmi,
-                                      const int scale_horz,
-                                      const int scale_vert, TX_SIZE *ts,
+                                      const int scale, TX_SIZE *ts,
                                       int32_t *tip_edge) {
   const bool is_tip_mode = is_tip_ref_frame(mbmi->ref_frame[0]);
   if (is_tip_mode) {
     *tip_edge = 1;
-    const int tip_ts = (scale_horz || scale_vert) ? TX_4X4 : TX_8X8;
+    const int tip_ts = scale ? TX_4X4 : TX_8X8;
     *ts = tip_ts;
   }
 }
@@ -547,13 +546,22 @@
                                        const MACROBLOCKD *xd,
 #endif  // CONFIG_COMPOUND_4XN
                                        const MB_MODE_INFO *const mbmi,
-                                       TX_SIZE *ts, int32_t *opfl_edge) {
-  if (plane > 0) return;
+                                       const int scale, TX_SIZE *ts,
+                                       int32_t *opfl_edge) {
   const bool is_opfl_mode = opfl_allowed_for_cur_block(cm,
 #if CONFIG_COMPOUND_4XN
                                                        xd,
 #endif  // CONFIG_COMPOUND_4XN
                                                        mbmi);
+#if CONFIG_AFFINE_REFINEMENT
+  if (is_opfl_mode && plane &&
+      mbmi->comp_refine_type >= COMP_AFFINE_REFINE_START) {
+    *opfl_edge = 1;
+    *ts = scale ? TX_4X4 : TX_8X8;
+    return;
+  }
+#endif  // CONFIG_AFFINE_REFINEMENT
+  if (plane > 0) return;
   if (is_opfl_mode) {
     *opfl_edge = 1;
     const int opfl_ts = TX_8X8;
@@ -564,14 +572,13 @@
 #if CONFIG_REFINEMV
 // Check whether current block is RFMV mode
 static AOM_INLINE void check_rfmv_edge(const MB_MODE_INFO *const mbmi,
-                                       const int scale_horz,
-                                       const int scale_vert, TX_SIZE *ts,
+                                       const int scale, TX_SIZE *ts,
                                        int32_t *rfmv_edge) {
   const int is_rfmv_mode =
       mbmi->refinemv_flag && !is_tip_ref_frame(mbmi->ref_frame[0]);
   if (is_rfmv_mode) {
     *rfmv_edge = 1;
-    const int rfmv_ts = (scale_horz || scale_vert) ? TX_8X8 : TX_16X16;
+    const int rfmv_ts = scale ? TX_8X8 : TX_16X16;
     *ts = rfmv_ts;
   }
 }
@@ -592,16 +599,16 @@
   int temp_edge = 0;
   TX_SIZE temp_ts = 0;
 
-  check_tip_edge(mbmi, scale_horz, scale_vert, &temp_ts, &temp_edge);
+  int scale = edge_dir == VERT_EDGE ? scale_horz : scale_vert;
+  check_tip_edge(mbmi, scale, &temp_ts, &temp_edge);
   if (!temp_edge)
     check_opfl_edge(cm, plane,
 #if CONFIG_COMPOUND_4XN
                     xd,
 #endif  // CONFIG_COMPOUND_4XN
-                    mbmi, &temp_ts, &temp_edge);
+                    mbmi, scale, &temp_ts, &temp_edge);
 #if CONFIG_REFINEMV
-  if (!temp_edge)
-    check_rfmv_edge(mbmi, scale_horz, scale_vert, &temp_ts, &temp_edge);
+  if (!temp_edge) check_rfmv_edge(mbmi, scale, &temp_ts, &temp_edge);
 #endif  // CONFIG_REFINEMV
 
   if (temp_edge) {
@@ -1355,34 +1362,40 @@
   const uint16_t q_vert = lfi->tip_q_thr[plane][VERT_EDGE];
   const uint16_t side_vert = lfi->tip_side_thr[plane][VERT_EDGE];
   const int bit_depth = cm->seq_params.bit_depth;
-  int n = 8;
+  int sub_bw = 8;
+  int sub_bh = 8;
   if (plane > 0) {
     const int subsampling_x = cm->seq_params.subsampling_x;
     const int subsampling_y = cm->seq_params.subsampling_y;
-    if (subsampling_x || subsampling_y) n = 4;
+    sub_bw >>= subsampling_x;
+    sub_bh >>= subsampling_y;
   }
-  const int filter_length = n;
+  // select vert/horz filter lengths based on block width/height
+  int filter_length_vert = sub_bw;
+  int filter_length_horz = sub_bh;
 
   // start filtering
-  const int h = bh - n;
-  const int w = bw - n;
-  const int rw = bw - (bw % n);
-  for (int j = 0; j <= h; j += n) {
-    for (int i = 0; i <= w; i += n) {
+  const int h = bh - sub_bh;
+  const int w = bw - sub_bw;
+  const int rw = bw - (bw % sub_bw);
+  for (int j = 0; j <= h; j += sub_bh) {
+    for (int i = 0; i <= w; i += sub_bw) {
       // filter vertical boundary
       if (i > 0) {
-        aom_highbd_lpf_vertical_generic_c(dst, dst_stride, filter_length,
-                                          &q_vert, &side_vert, bit_depth, n);
+        aom_highbd_lpf_vertical_generic_c(dst, dst_stride, filter_length_vert,
+                                          &q_vert, &side_vert, bit_depth,
+                                          sub_bh);
       }
       // filter horizontal boundary
       if (j > 0) {
-        aom_highbd_lpf_horizontal_generic_c(dst, dst_stride, filter_length,
-                                            &q_horz, &side_horz, bit_depth, n);
+        aom_highbd_lpf_horizontal_generic_c(dst, dst_stride, filter_length_horz,
+                                            &q_horz, &side_horz, bit_depth,
+                                            sub_bw);
       }
-      dst += n;
+      dst += sub_bw;
     }
     dst -= rw;
-    dst += n * dst_stride;
+    dst += sub_bh * dst_stride;
   }
 }
 
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 5c133fc..e05f8b7 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -351,6 +351,14 @@
     specialize qw/av1_avg_pooling_pdiff_gradients avx2/;
 }
 
+#
+# Block Adaptive Weighted Prediction
+#
+if (aom_config("CONFIG_BAWP") eq "yes"){
+  add_proto qw/void av1_make_bawp_block/, "uint16_t *dst, int dst_stride, int16_t alpha, int32_t beta, int shift, int bw, int bh, int bd";
+  specialize qw/av1_make_bawp_block avx2/;
+}
+
 # Helper functions.
 add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
 specialize "av1_round_shift_array", qw/sse4_1 neon/;
@@ -703,7 +711,7 @@
 
 if (aom_config("CONFIG_OPFL_MEMBW_REDUCTION") eq "yes"){
   add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta, int use_damr_padding, ReferenceArea *ref_area";
-  specialize qw/av1_highbd_warp_affine sse4_1/;
+  specialize qw/av1_highbd_warp_affine sse4_1 avx2/;
 }
 else{
   add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 2f8e409..572d8d5 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -344,19 +344,21 @@
 #if CONFIG_SUBBLK_REF_EXT
 #define REF_BUFFER_WIDTH                                                   \
   (REFINEMV_SUBBLOCK_WIDTH + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND + \
-   2 * SUBBLK_REF_EXT_LINES)
+   2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES))
 #else
-#define REF_BUFFER_WIDTH \
-  (REFINEMV_SUBBLOCK_WIDTH + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND)
+#define REF_BUFFER_WIDTH                                                   \
+  (REFINEMV_SUBBLOCK_WIDTH + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND + \
+   2 * DMVR_SEARCH_EXT_LINES)
 #endif
 #endif  // CONFIG_ACROSS_SCALE_REFINEMV
 #if CONFIG_SUBBLK_REF_EXT
 #define REF_BUFFER_HEIGHT                                                   \
   (REFINEMV_SUBBLOCK_HEIGHT + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND + \
-   2 * SUBBLK_REF_EXT_LINES)
+   2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES))
 #else
-#define REF_BUFFER_HEIGHT \
-  (REFINEMV_SUBBLOCK_HEIGHT + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND)
+#define REF_BUFFER_HEIGHT                                                   \
+  (REFINEMV_SUBBLOCK_HEIGHT + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND + \
+   2 * DMVR_SEARCH_EXT_LINES)
 #endif  // CONFIG_SUBBLK_REF_EXT
 typedef struct PadBlock {
   int x0;
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 39d2c9b..32e217e 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -51,6 +51,12 @@
 #define SUBBLK_REF_EXT_LINES 2
 #endif  // CONFIG_SUBBLK_REF_EXT
 
+#if CONFIG_16_FULL_SEARCH_DMVR
+#define DMVR_SEARCH_EXT_LINES 2
+#else
+#define DMVR_SEARCH_EXT_LINES 0
+#endif  // CONFIG_16_FULL_SEARCH_DMVR
+
 #if CONFIG_WARP_PRECISION
 #define WARP_STATS_BUFFER_SIZE \
   (MAX_WARP_REF_CANDIDATES * NUM_WARP_PRECISION_MODES)
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 77aabc2..f45c51a 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -1187,20 +1187,20 @@
 
 #if CONFIG_REFINEMV
 // Compute the SAD values for refineMV modes
-int get_refinemv_sad(uint16_t *src1, uint16_t *src2, int width, int height,
-                     int bd) {
+int get_refinemv_sad(uint16_t *src1, uint16_t *src2, int stride, int width,
+                     int height, int bd) {
 #if CONFIG_SUBBLK_REF_EXT
   (void)bd;
 #if CONFIG_SUBBLK_REF_DS
-  return get_highbd_sad_ds(src1, width, src2, width, 8, width, height);
+  return get_highbd_sad_ds(src1, stride, src2, stride, 8, width, height);
 #else
-  return get_highbd_sad(src1, width, src2, width, 8, width, height);
+  return get_highbd_sad(src1, stride, src2, stride, 8, width, height);
 #endif
 #else
 #if CONFIG_SUBBLK_REF_DS
-  return get_highbd_sad_ds(src1, width, src2, width, bd, width, height);
+  return get_highbd_sad_ds(src1, stride, src2, stride, bd, width, height);
 #else
-  return get_highbd_sad(src1, width, src2, width, bd, width, height);
+  return get_highbd_sad(src1, stride, src2, stride, bd, width, height);
 #endif  // CONFIG_SUBBLK_REF_DS
 #endif  // CONFIG_SUBBLK_REF_EXT
 }
@@ -3688,6 +3688,17 @@
   }
 }
 
+// Generate weighted prediction of the block.
+void av1_make_bawp_block_c(uint16_t *dst, int dst_stride, int16_t alpha,
+                           int32_t beta, int shift, int bw, int bh, int bd) {
+  for (int j = 0; j < bh; ++j) {
+    for (int i = 0; i < bw; ++i) {
+      dst[j * dst_stride + i] = clip_pixel_highbd(
+          (dst[j * dst_stride + i] * alpha + beta) >> shift, bd);
+    }
+  }
+}
+
 // generate inter prediction of a block coded in bwap mode enabled
 void av1_build_one_bawp_inter_predictor(
     uint16_t *dst, int dst_stride, const MV *const src_mv,
@@ -3853,12 +3864,7 @@
 
   int16_t alpha = mbmi->bawp_alpha[plane][ref];
   int32_t beta = mbmi->bawp_beta[plane][ref];
-  for (int j = 0; j < bh; ++j) {
-    for (int i = 0; i < bw; ++i) {
-      dst[j * dst_stride + i] = clip_pixel_highbd(
-          (dst[j * dst_stride + i] * alpha + beta) >> shift, xd->bd);
-    }
-  }
+  av1_make_bawp_block(dst, dst_stride, alpha, beta, shift, bw, bh, xd->bd);
 }
 #endif  // CONFIG_BAWP
 
@@ -4552,6 +4558,33 @@
   }
 }
 
+#if CONFIG_16_FULL_SEARCH_DMVR
+void av1_refinemv_build_predictors(MACROBLOCKD *xd, int mi_x, int mi_y,
+                                   uint16_t **mc_buf,
+                                   CalcSubpelParamsFunc calc_subpel_params_func,
+                                   uint16_t *dst_ref0, uint16_t *dst_ref1,
+                                   int dst_stride, MV mv0, MV mv1,
+                                   InterPredParams *inter_pred_params) {
+  for (int ref = 0; ref < 2; ref++) {
+    SubpelParams subpel_params;
+    uint16_t *src;
+    int src_stride;
+
+    uint16_t *dst_ref = ref == 0 ? dst_ref0 : dst_ref1;
+    MV *src_mv = ref == 0 ? &mv0 : &mv1;
+#if CONFIG_SUBBLK_REF_EXT
+    src_mv->row -= 8 * SUBBLK_REF_EXT_LINES;
+    src_mv->col -= 8 * SUBBLK_REF_EXT_LINES;
+#endif  // CONFIG_SUBBLK_REF_EXT
+    calc_subpel_params_func(src_mv, &inter_pred_params[ref], xd, mi_x, mi_y,
+                            ref, 0, mc_buf, &src, &subpel_params, &src_stride);
+    assert(inter_pred_params[ref].comp_mode == UNIFORM_SINGLE ||
+           inter_pred_params[ref].comp_mode == UNIFORM_COMP);
+    av1_make_inter_predictor(src, src_stride, dst_ref, dst_stride,
+                             &inter_pred_params[ref], &subpel_params);
+  }
+}
+#else
 int av1_refinemv_build_predictors_and_get_sad(
     MACROBLOCKD *xd, int bw, int bh, int mi_x, int mi_y, uint16_t **mc_buf,
     CalcSubpelParamsFunc calc_subpel_params_func, uint16_t *dst_ref0,
@@ -4574,8 +4607,10 @@
                              &inter_pred_params[ref], &subpel_params);
   }
 
-  return get_refinemv_sad(dst_ref0, dst_ref1, bw, bh, xd->bd);
+  return get_refinemv_sad(dst_ref0, dst_ref1, bw, bw, bh, xd->bd);
 }
+#endif  // CONFIG_16_FULL_SEARCH_DMVR
+
 void apply_mv_refinement(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
                          MB_MODE_INFO *mi, int bw, int bh, int mi_x, int mi_y,
                          uint16_t **mc_buf, const MV mv[2],
@@ -4653,10 +4688,6 @@
     assert(inter_pred_params[ref].conv_params.is_compound == 0);
     assert(inter_pred_params[ref].conv_params.do_average == 0);
     assert(mi->interinter_comp.type == COMPOUND_AVERAGE);
-#if CONFIG_OPFL_MEMBW_REDUCTION
-    inter_pred_params[ref].use_ref_padding = 1;
-    inter_pred_params[ref].ref_area = &ref_area[ref];
-#endif  // CONFIG_OPFL_MEMBW_REDUCTION
   }
 
 #if !CONFIG_16_FULL_SEARCH_DMVR
@@ -4671,12 +4702,28 @@
 
   // If we signal the refinemv_flags we do not select sad0
   // Set sad0 a large value so that it does not be selected
+#if CONFIG_16_FULL_SEARCH_DMVR
+#if CONFIG_SUBBLK_REF_EXT
+  const int dst_stride = REFINEMV_SUBBLOCK_WIDTH +
+                         2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES);
+#else
+  const int dst_stride = REFINEMV_SUBBLOCK_WIDTH + 2 * DMVR_SEARCH_EXT_LINES;
+#endif  // CONFIG_SUBBLK_REF_EXT
+  int sad0 = INT32_MAX >> 1;
+  if (!switchable_refinemv_flags) {
+    av1_refinemv_build_predictors(
+        xd, mi_x, mi_y, mc_buf, calc_subpel_params_func, dst_ref0, dst_ref1,
+        dst_stride, center_mvs[0], center_mvs[1], inter_pred_params);
+    sad0 = get_refinemv_sad(dst_ref0, dst_ref1, dst_stride, bw, bh, xd->bd);
+  }
+#else
   int sad0 = switchable_refinemv_flags
                  ? (INT32_MAX >> 1)
                  : av1_refinemv_build_predictors_and_get_sad(
                        xd, bw, bh, mi_x, mi_y, mc_buf, calc_subpel_params_func,
                        dst_ref0, dst_ref1, center_mvs[0], center_mvs[1],
                        inter_pred_params);
+#endif  // CONFIG_16_FULL_SEARCH_DMVR
 #if !CONFIG_SUBBLK_REF_EXT
   assert(IMPLIES(mi->ref_frame[0] == TIP_FRAME, bw == 8 && bh == 8));
 #endif  // !CONFIG_SUBBLK_REF_EXT
@@ -4694,9 +4741,9 @@
   }
 
   int min_sad = sad0;
-  MV refined_mv0, refined_mv1;
-  refined_mv0 = center_mvs[0];
-  refined_mv1 = center_mvs[1];
+  MV refined_mv[2];
+  refined_mv[0] = center_mvs[0];
+  refined_mv[1] = center_mvs[1];
 
 #if CONFIG_16_FULL_SEARCH_DMVR
   static const MV neighbors[DMVR_SEARCH_NUM_NEIGHBORS] = {
@@ -4705,17 +4752,40 @@
     { -2, 0 },  { -2, -1 }, { 0, 2 }, { 0, -2 }
   };
   MV best_offset = { 0, 0 };
+  // Prediction is generated at once for (bw+4) x (bh+4) block, by extending 2
+  // samples (search range of the refinement stage) on each side. Later, the
+  // prediction buffers are appropriately offset for SAD calculation.
+  const int ext_bw = bw + 4;
+  const int ext_bh = bh + 4;
+  for (int ref = 0; ref < 2; ref++) {
+#if CONFIG_OPFL_MEMBW_REDUCTION
+    inter_pred_params[ref].use_ref_padding = 1;
+    inter_pred_params[ref].ref_area = &ref_area[ref];
+#endif  // CONFIG_OPFL_MEMBW_REDUCTION
+    inter_pred_params[ref].block_width = ext_bw;
+    inter_pred_params[ref].block_height = ext_bh;
+#if CONFIG_REFINEMV
+    inter_pred_params[ref].original_pu_width = pu_width + 4;
+    inter_pred_params[ref].original_pu_height = pu_height + 4;
+#endif  // CONFIG_REFINEMV
+    refined_mv[ref].row -= 8 * DMVR_SEARCH_EXT_LINES;
+    refined_mv[ref].col -= 8 * DMVR_SEARCH_EXT_LINES;
+  }
+
+  av1_refinemv_build_predictors(xd, mi_x, mi_y, mc_buf, calc_subpel_params_func,
+                                dst_ref0, dst_ref1, dst_stride, refined_mv[0],
+                                refined_mv[1], inter_pred_params);
+
   for (int idx = 0; idx < DMVR_SEARCH_NUM_NEIGHBORS; ++idx) {
     const MV offset = { neighbors[idx].row, neighbors[idx].col };
 
-    refined_mv0.row = center_mvs[0].row + 8 * offset.row;
-    refined_mv0.col = center_mvs[0].col + 8 * offset.col;
-    refined_mv1.row = center_mvs[1].row - 8 * offset.row;
-    refined_mv1.col = center_mvs[1].col - 8 * offset.col;
+    uint16_t *dst_ref0_offset =
+        dst_ref0 + (2 + offset.row) * dst_stride + 2 + offset.col;
+    uint16_t *dst_ref1_offset =
+        dst_ref1 + (2 - offset.row) * dst_stride + 2 - offset.col;
 
-    const int this_sad = av1_refinemv_build_predictors_and_get_sad(
-        xd, bw, bh, mi_x, mi_y, mc_buf, calc_subpel_params_func, dst_ref0,
-        dst_ref1, refined_mv0, refined_mv1, inter_pred_params);
+    const int this_sad = get_refinemv_sad(dst_ref0_offset, dst_ref1_offset,
+                                          dst_stride, bw, bh, xd->bd);
 
     if (this_sad < min_sad) {
       min_sad = this_sad;
@@ -4729,6 +4799,7 @@
   best_mv_ref[1].col = center_mvs[1].col - 8 * best_offset.col;
 
 #else
+  (void)ref_area;
   int et_sad_th = (bw * bh) << 1;
 #if !SINGLE_STEP_SEARCH
   uint8_t already_searched[5][5];
@@ -4772,14 +4843,14 @@
     if (already_searched[offset.row + search_range][offset.col + search_range])
       continue;
 #endif
-    refined_mv0.row = center_mvs[0].row + 8 * offset.row;
-    refined_mv0.col = center_mvs[0].col + 8 * offset.col;
-    refined_mv1.row = center_mvs[1].row - 8 * offset.row;
-    refined_mv1.col = center_mvs[1].col - 8 * offset.col;
+    refined_mv[0].row = center_mvs[0].row + 8 * offset.row;
+    refined_mv[0].col = center_mvs[0].col + 8 * offset.col;
+    refined_mv[1].row = center_mvs[1].row - 8 * offset.row;
+    refined_mv[1].col = center_mvs[1].col - 8 * offset.col;
 
     int this_sad = av1_refinemv_build_predictors_and_get_sad(
         xd, bw, bh, mi_x, mi_y, mc_buf, calc_subpel_params_func, dst_ref0,
-        dst_ref1, refined_mv0, refined_mv1, inter_pred_params);
+        dst_ref1, refined_mv[0], refined_mv[1], inter_pred_params);
 
 #if !SINGLE_STEP_SEARCH
     already_searched[offset.row + search_range][offset.col + search_range] = 1;
@@ -4791,8 +4862,8 @@
       // if the SAD is less than predefined threshold consider this candidate
       // as good enough to skip rest of the search.
       if (min_sad < et_sad_th) {
-        best_mv_ref[0] = refined_mv0;
-        best_mv_ref[1] = refined_mv1;
+        best_mv_ref[0] = refined_mv[0];
+        best_mv_ref[1] = refined_mv[1];
         return;
       }
     }
@@ -5374,16 +5445,24 @@
         AOMMIN(REFINEMV_SUBBLOCK_HEIGHT >> pd->subsampling_y, bh);
 #if CONFIG_SUBBLK_REF_EXT
     uint16_t
-        dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * SUBBLK_REF_EXT_LINES) *
-                         (REFINEMV_SUBBLOCK_HEIGHT + 2 * SUBBLK_REF_EXT_LINES)];
+        dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH +
+                          2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES)) *
+                         (REFINEMV_SUBBLOCK_HEIGHT +
+                          2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES))];
     uint16_t
-        dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * SUBBLK_REF_EXT_LINES) *
-                         (REFINEMV_SUBBLOCK_HEIGHT + 2 * SUBBLK_REF_EXT_LINES)];
+        dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH +
+                          2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES)) *
+                         (REFINEMV_SUBBLOCK_HEIGHT +
+                          2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES))];
 #else
     uint16_t
-        dst0_16_refinemv[REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT];
+        dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * DMVR_SEARCH_EXT_LINES) *
+                         (REFINEMV_SUBBLOCK_HEIGHT +
+                          2 * DMVR_SEARCH_EXT_LINES)];
     uint16_t
-        dst1_16_refinemv[REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT];
+        dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * DMVR_SEARCH_EXT_LINES) *
+                         (REFINEMV_SUBBLOCK_HEIGHT +
+                          2 * DMVR_SEARCH_EXT_LINES)];
 #endif  // CONFIG_SUBBLK_REF_EXT
 
     ReferenceArea ref_area[2];
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 8f93b99..a77e6ab 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -915,13 +915,23 @@
 
 #if CONFIG_REFINEMV
 // Compute the SAD between the two predictors when refinemv is ON
-int get_refinemv_sad(uint16_t *src1, uint16_t *src2, int width, int height,
-                     int bd);
-// Genrate two prediction signals and compute SAD of a given mv0 and mv1
+int get_refinemv_sad(uint16_t *src1, uint16_t *src2, int stride, int width,
+                     int height, int bd);
+#if CONFIG_16_FULL_SEARCH_DMVR
+// Generate two prediction signals of a given mv0 and mv1
+void av1_refinemv_build_predictors(MACROBLOCKD *xd, int mi_x, int mi_y,
+                                   uint16_t **mc_buf,
+                                   CalcSubpelParamsFunc calc_subpel_params_func,
+                                   uint16_t *dst_ref0, uint16_t *dst_ref1,
+                                   int dst_stride, MV mv0, MV mv1,
+                                   InterPredParams *inter_pred_params);
+#else
+// Generate two prediction signals and compute SAD of a given mv0 and mv1
 int av1_refinemv_build_predictors_and_get_sad(
     MACROBLOCKD *xd, int bw, int bh, int mi_x, int mi_y, uint16_t **mc_buf,
     CalcSubpelParamsFunc calc_subpel_params_func, uint16_t *dst_ref0,
     uint16_t *dst_ref1, MV mv0, MV mv1, InterPredParams *inter_pred_params);
+#endif  // CONFIG_16_FULL_SEARCH_DMVR
 
 // Get the context index to code refinemv flag
 int av1_get_refinemv_context(const AV1_COMMON *cm, const MACROBLOCKD *xd,
diff --git a/av1/common/tip.c b/av1/common/tip.c
index 5f18297..ddf28bf 100644
--- a/av1/common/tip.c
+++ b/av1/common/tip.c
@@ -1194,14 +1194,22 @@
 #if CONFIG_REFINEMV
 #if CONFIG_SUBBLK_REF_EXT
   uint16_t
-      dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * SUBBLK_REF_EXT_LINES) *
-                       (REFINEMV_SUBBLOCK_HEIGHT + 2 * SUBBLK_REF_EXT_LINES)];
+      dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH +
+                        2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES)) *
+                       (REFINEMV_SUBBLOCK_HEIGHT +
+                        2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES))];
   uint16_t
-      dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * SUBBLK_REF_EXT_LINES) *
-                       (REFINEMV_SUBBLOCK_HEIGHT + 2 * SUBBLK_REF_EXT_LINES)];
+      dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH +
+                        2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES)) *
+                       (REFINEMV_SUBBLOCK_HEIGHT +
+                        2 * (SUBBLK_REF_EXT_LINES + DMVR_SEARCH_EXT_LINES))];
 #else
-  uint16_t dst0_16_refinemv[REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT];
-  uint16_t dst1_16_refinemv[REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT];
+  uint16_t
+      dst0_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * DMVR_SEARCH_EXT_LINES) *
+                       (REFINEMV_SUBBLOCK_HEIGHT + 2 * DMVR_SEARCH_EXT_LINES)];
+  uint16_t
+      dst1_16_refinemv[(REFINEMV_SUBBLOCK_WIDTH + 2 * DMVR_SEARCH_EXT_LINES) *
+                       (REFINEMV_SUBBLOCK_HEIGHT + 2 * DMVR_SEARCH_EXT_LINES)];
 #endif  // CONFIG_SUBBLK_REF_EXT
 #if CONFIG_TIP_LD
   const int apply_refinemv = (plane == 0 && cm->has_both_sides_refs);
diff --git a/av1/common/x86/bawp_avx2.c b/av1/common/x86/bawp_avx2.c
new file mode 100644
index 0000000..be191bc
--- /dev/null
+++ b/av1/common/x86/bawp_avx2.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 3-Clause Clear License
+ * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
+ * License was not distributed with this source code in the LICENSE file, you
+ * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
+ * Alliance for Open Media Patent License 1.0 was not distributed with this
+ * source code in the PATENTS file, you can obtain it at
+ * aomedia.org/license/patent-license/.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#if CONFIG_BAWP
+void av1_make_bawp_block_avx2(uint16_t *dst, int dst_stride, int16_t alpha,
+                              int32_t beta, int shift, int bw, int bh, int bd) {
+  const __m256i alpha_reg = _mm256_set1_epi32((int)alpha);
+  const __m256i beta_reg = _mm256_set1_epi32(beta);
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  if (bw == 4 && ((bh & 3) == 0)) {
+    for (int j = 0; j < bh; j += 4) {
+      // d00 d01 d02 d03
+      const __m128i dst_0 = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((const __m128i *)(&dst[j * dst_stride])));
+      // d10 d11 d12 d13
+      const __m128i dst_1 = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((const __m128i *)(&dst[(j + 1) * dst_stride])));
+      // d00 d01 d02 d03 | d10 d11 d12 d13
+      const __m256i dst_01 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(dst_0), dst_1, 1);
+      // d20 d21 d22 d23
+      const __m128i dst_2 = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((const __m128i *)(&dst[(j + 2) * dst_stride])));
+      // d30 d31 d32 d33
+      const __m128i dst_3 = _mm_cvtepu16_epi32(
+          _mm_loadl_epi64((const __m128i *)(&dst[(j + 3) * dst_stride])));
+      // d20 d21 d22 d23 | d30 d31 d32 d33
+      const __m256i dst_23 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(dst_2), dst_3, 1);
+
+      const __m256i res_0 = _mm256_srai_epi32(
+          _mm256_add_epi32(beta_reg, _mm256_mullo_epi32(dst_01, alpha_reg)),
+          shift);
+      const __m256i res_1 = _mm256_srai_epi32(
+          _mm256_add_epi32(beta_reg, _mm256_mullo_epi32(dst_23, alpha_reg)),
+          shift);
+      // 00 01 02 03 | 20 21 22 23 | 10 11 12 13 | 30 31 32 33
+      const __m256i res_2 = _mm256_packus_epi32(res_0, res_1);
+      const __m256i res = _mm256_min_epu16(res_2, clip_pixel);
+      const __m128i res_lo = _mm256_castsi256_si128(res);
+      const __m128i res_hi = _mm256_extracti128_si256(res, 1);
+
+      _mm_storel_epi64((__m128i *)(&dst[j * dst_stride]), res_lo);
+      _mm_storel_epi64((__m128i *)(&dst[(j + 1) * dst_stride]), res_hi);
+      _mm_storel_epi64((__m128i *)(&dst[(j + 2) * dst_stride]),
+                       _mm_srli_si128(res_lo, 8));
+      _mm_storel_epi64((__m128i *)(&dst[(j + 3) * dst_stride]),
+                       _mm_srli_si128(res_hi, 8));
+    }
+  } else if (((bw & 7) == 0) && ((bh & 1) == 0)) {
+    for (int j = 0; j < bh; j += 2) {
+      for (int i = 0; i < bw; i += 8) {
+        // d00 d01 d02 d03 d04 d05 d06 d07
+        const __m256i dst_0 = _mm256_cvtepu16_epi32(
+            _mm_loadu_si128((const __m128i *)(&dst[j * dst_stride + i])));
+        // d10 d11 d12 d13 d14 d15 d16 d17
+        const __m256i dst_1 = _mm256_cvtepu16_epi32(
+            _mm_loadu_si128((const __m128i *)(&dst[(j + 1) * dst_stride + i])));
+
+        const __m256i res_0 = _mm256_srai_epi32(
+            _mm256_add_epi32(beta_reg, _mm256_mullo_epi32(dst_0, alpha_reg)),
+            shift);
+        const __m256i res_1 = _mm256_srai_epi32(
+            _mm256_add_epi32(beta_reg, _mm256_mullo_epi32(dst_1, alpha_reg)),
+            shift);
+        const __m256i res_2 =
+            _mm256_permute4x64_epi64(_mm256_packus_epi32(res_0, res_1), 0xD8);
+        const __m256i res = _mm256_min_epu16(res_2, clip_pixel);
+
+        _mm_storeu_si128((__m128i *)(&dst[j * dst_stride + i]),
+                         _mm256_castsi256_si128(res));
+        _mm_storeu_si128((__m128i *)(&dst[(j + 1) * dst_stride + i]),
+                         _mm256_extracti128_si256(res, 1));
+      }
+    }
+  } else {
+    av1_make_bawp_block_c(dst, dst_stride, alpha, beta, shift, bw, bh, bd);
+  }
+}
+#endif  // CONFIG_BAWP
diff --git a/av1/common/x86/highbd_warp_affine_avx2.c b/av1/common/x86/highbd_warp_affine_avx2.c
index 0a8ddc2..c2c827a 100644
--- a/av1/common/x86/highbd_warp_affine_avx2.c
+++ b/av1/common/x86/highbd_warp_affine_avx2.c
@@ -33,105 +33,185 @@
 };
 #endif  // CONFIG_EXT_WARP_FILTER
 
+DECLARE_ALIGNED(32, static const uint8_t, warp_highbd_shuffle_pattern[32]) = {
+  0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15,
+  0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, warp_highbd_arrange_bytes[32]) = {
+  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+};
+
 DECLARE_ALIGNED(32, static const uint8_t, shuffle_input_mask[32]) = {
   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
 };
 
-DECLARE_ALIGNED(32, static const uint8_t,
-                shuffle_gamma0_mask0[32]) = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
-                                              3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
-                                              2, 3, 0, 1, 2, 3, 0, 1, 2, 3 };
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_gamma0_mask0[32]) = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
 
-DECLARE_ALIGNED(32, static const uint8_t,
-                shuffle_gamma0_mask1[32]) = { 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6,
-                                              7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5,
-                                              6, 7, 4, 5, 6, 7, 4, 5, 6, 7 };
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_gamma0_mask1[32]) = {
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
 
-DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2[32]) = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_gamma0_mask2[32]) = {
   8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
   8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
 };
 
-DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3[32]) = {
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_gamma0_mask3[32]) = {
   12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
   12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
 };
 
-static INLINE void prepare_vertical_filter_coeffs_avx2(int sy, int gamma,
-                                                       __m256i *coeffs) {
-  // A7 A6 A5 A4 A3 A2 A1 A0
-  __m256i v_coeff01 = _mm256_castsi128_si256(_mm_loadu_si128(
-      (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]));
-  // B7 B6 B5 B4 B3 B2 B1 B0 | A7 A6 A5 A4 A3 A2 A1 A0
-  v_coeff01 = _mm256_inserti128_si256(
-      v_coeff01,
+static INLINE void prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(
+    int32_t s, __m256i *coeffs) {
+  // c0 c1 c2 c3 c4 c5 c6 c7 | c0 c1 c2 c3 c4 c5 c6 c7
+  const __m256i v_coeff = _mm256_broadcastsi128_si256(_mm_loadu_si128(
+      (__m128i *)av1_warped_filter[(s) >> WARPEDDIFF_PREC_BITS]));
+
+  coeffs[0] = _mm256_shuffle_epi8(
+      v_coeff, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask0));
+  coeffs[1] = _mm256_shuffle_epi8(
+      v_coeff, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask1));
+  coeffs[2] = _mm256_shuffle_epi8(
+      v_coeff, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask2));
+  coeffs[3] = _mm256_shuffle_epi8(
+      v_coeff, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask3));
+}
+
+static INLINE void prepare_8tap_filter_coeffs_avx2(int32_t s, int offset,
+                                                   __m256i *coeffs) {
+  // c00 c01 c02 c03 c04 c05 c06 c07 | x x x x x x x x
+  const __m256i v_coeff0 = _mm256_castsi128_si256(_mm_loadu_si128(
+      (__m128i *)av1_warped_filter[(s) >> WARPEDDIFF_PREC_BITS]));
+  // c00 c01 c02 c03 c04 c05 c06 c07 | c10 c11 c12 c13 c14 c15 c16 c17
+  const __m256i v_coeff01 = _mm256_inserti128_si256(
+      v_coeff0,
       _mm_loadu_si128(
-          (__m128i *)av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]),
+          (__m128i *)av1_warped_filter[(s + offset) >> WARPEDDIFF_PREC_BITS]),
       1);
-  // C7 C6 C5 C4 C3 C2 C1 C0
-  __m256i v_coeff23 = _mm256_castsi128_si256(_mm_loadu_si128(
-      (__m128i *)av1_warped_filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS]));
-  // D7 D6 D5 D4 D3 D2 D1 D0 | C7 C6 C5 C4 C3 C2 C1 C0
-  v_coeff23 = _mm256_inserti128_si256(
-      v_coeff23,
+  // c20 c21 c22 c23 c24 c25 c26 c27 | x x x x x x x x
+  const __m256i v_coeff2 = _mm256_castsi128_si256(_mm_loadu_si128(
+      (__m128i *)av1_warped_filter[(s + 2 * offset) >> WARPEDDIFF_PREC_BITS]));
+  // c20 c21 c22 c23 c24 c25 c26 c27 | c30 c31 c32 c33 c34 c35 c36 c37
+  const __m256i v_coeff23 = _mm256_inserti128_si256(
+      v_coeff2,
       _mm_loadu_si128(
           (__m128i *)
-              av1_warped_filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS]),
+              av1_warped_filter[(s + 3 * offset) >> WARPEDDIFF_PREC_BITS]),
       1);
-  // E7 E6 E5 E4 E3 E2 E1 E0
-  __m256i v_coeff45 = _mm256_castsi128_si256(_mm_loadu_si128(
-      (__m128i *)av1_warped_filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS]));
-  // F7 F6 F5 F4 F3 F2 F1 F0 | E7 E6 E5 E4 E3 E2 E1 E0
-  v_coeff45 = _mm256_inserti128_si256(
-      v_coeff45,
+  // c40 c41 c42 c43 c44 c45 c46 c47 | x x x x x x x x
+  const __m256i v_coeff4 = _mm256_castsi128_si256(_mm_loadu_si128(
+      (__m128i *)av1_warped_filter[(s + 4 * offset) >> WARPEDDIFF_PREC_BITS]));
+  // c40 c41 c42 c43 c44 c45 c46 c47 | c50 c51 c52 c53 c54 c55 c56 c57
+  const __m256i v_coeff45 = _mm256_inserti128_si256(
+      v_coeff4,
       _mm_loadu_si128(
           (__m128i *)
-              av1_warped_filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS]),
+              av1_warped_filter[(s + 5 * offset) >> WARPEDDIFF_PREC_BITS]),
       1);
-  // G7 G6 G5 G4 G3 G2 G1 G0
-  __m256i v_coeff67 = _mm256_castsi128_si256(_mm_loadu_si128(
-      (__m128i *)av1_warped_filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS]));
-  // H7 H6 H5 H4 H3 H2 H1 H0 | G7 G6 G5 G4 G3 G2 G1 G0
-  v_coeff67 = _mm256_inserti128_si256(
-      v_coeff67,
+  // c60 c61 c62 c63 c64 c65 c66 c67 | x x x x x x x x
+  const __m256i v_coeff6 = _mm256_castsi128_si256(_mm_loadu_si128(
+      (__m128i *)av1_warped_filter[(s + 6 * offset) >> WARPEDDIFF_PREC_BITS]));
+  // c60 c61 c62 c63 c64 c65 c66 c67 | c70 c71 c72 c73 c74 c75 c76 c77
+  const __m256i v_coeff67 = _mm256_inserti128_si256(
+      v_coeff6,
       _mm_loadu_si128(
           (__m128i *)
-              av1_warped_filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS]),
+              av1_warped_filter[(s + 7 * offset) >> WARPEDDIFF_PREC_BITS]),
       1);
-  // D3 D2 B3 B2 D1 D0 B1 B0 | C3 C2 A3 A2 C1 C0 A1 A0
+
   __m256i v_c0123 = _mm256_unpacklo_epi32(v_coeff01, v_coeff23);
-  // D7 D6 B7 B6 D5 D4 B5 B4 | C7 C6 A7 A6 C5 C4 A5 A4
   __m256i v_c0123u = _mm256_unpackhi_epi32(v_coeff01, v_coeff23);
-  // H3 H2 F3 F2 H1 H0 F1 F0 | G3 G2 E3 E2 G1 G0 E1 E0
   __m256i v_c4567 = _mm256_unpacklo_epi32(v_coeff45, v_coeff67);
-  // H7 H6 F7 F6 H5 H4 F5 F4 | G7 G6 E7 E6 G5 G4 E5 E4
   __m256i v_c4567u = _mm256_unpackhi_epi32(v_coeff45, v_coeff67);
 
+  // c00 c01 c20 c21 c40 c41 c60 c61 | c10 c11 c30 c31 c50 c51 c70 c71
   coeffs[0] = _mm256_unpacklo_epi64(v_c0123, v_c4567);
+  // c02 c03 c22 c23 c42 c43 c62 c63 | c12 c13 c32 c33 c52 c53 c72 c73
   coeffs[1] = _mm256_unpackhi_epi64(v_c0123, v_c4567);
+  // c04 c05 c24 c25 c44 c45 c64 c65 | c14 c15 c34 c35 c54 c55 c74 c75
   coeffs[2] = _mm256_unpacklo_epi64(v_c0123u, v_c4567u);
+  // c06 c07 c26 c27 c46 c47 c66 c67 | c16 c17 c36 c37 c56 c57 c76 c77
   coeffs[3] = _mm256_unpackhi_epi64(v_c0123u, v_c4567u);
 }
 
-static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int32_t sy,
-                                                              __m256i *coeffs) {
-  __m256i v_coeff = _mm256_castsi128_si256(_mm_loadu_si128(
-      (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]));
-  v_coeff =
-      _mm256_inserti128_si256(v_coeff, _mm256_castsi256_si128(v_coeff), 1);
-
-  coeffs[0] = _mm256_shuffle_epi8(
-      v_coeff, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0));
-  coeffs[1] = _mm256_shuffle_epi8(
-      v_coeff, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1));
-  coeffs[2] = _mm256_shuffle_epi8(
-      v_coeff, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2));
-  coeffs[3] = _mm256_shuffle_epi8(
-      v_coeff, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3));
+static INLINE void load_horiz_src_pixels_avx2(const uint16_t *ref, __m256i *r) {
+  r[0] = _mm256_loadu_si256((__m256i *)ref);
+  r[1] = _mm256_loadu_si256((__m256i *)(ref + 1));
 }
 
-static INLINE void prepare_input_data(__m256i *input, __m256i *src) {
+static INLINE void prepare_8tap_horiz_src_padded_avx2(const uint16_t *ref,
+                                                      int out_of_boundary_left,
+                                                      int out_of_boundary_right,
+                                                      __m256i *src_padded) {
+  const __m256i src_0 = _mm256_loadu_si256((__m256i *)ref);
+
+  const __m256i src_01 = _mm256_shuffle_epi8(
+      src_0, _mm256_loadu_si256((__m256i *)warp_highbd_arrange_bytes));
+  __m256i src_reg = _mm256_permute4x64_epi64(src_01, 0xD8);
+
+  if (out_of_boundary_left >= 0) {
+    const __m128i shuffle_left =
+        _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+    const __m256i shuffle_reg_left = _mm256_broadcastsi128_si256(shuffle_left);
+    src_reg = _mm256_shuffle_epi8(src_reg, shuffle_reg_left);
+  }
+
+  if (out_of_boundary_right >= 0) {
+    const __m128i shuffle_right =
+        _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
+    const __m256i shuffle_reg_right =
+        _mm256_broadcastsi128_si256(shuffle_right);
+    src_reg = _mm256_shuffle_epi8(src_reg, shuffle_reg_right);
+  }
+
+  src_padded[0] = _mm256_shuffle_epi8(
+      _mm256_permute4x64_epi64(src_reg, 0xD8),
+      _mm256_loadu_si256((__m256i *)warp_highbd_shuffle_pattern));
+  __m256i src_padded_hi = _mm256_permute4x64_epi64(src_padded[0], 0xEE);
+  src_padded[1] = _mm256_alignr_epi8(src_padded_hi, src_padded[0], 2);
+}
+
+static INLINE void prepare_8tap_horiz_src_avx2(const __m256i *input,
+                                               __m256i *src) {
+  // r0 r1 r2 r3 r4 r5 r6 r7 | r1 r2 r3 r4 r5 r6 r7 r8
+  const __m256i r_low = _mm256_permute2x128_si256(input[0], input[1], 0x20);
+  // r8 r9 r10 r11 r12 r13 r14 r15 | r9 r10 r11 r12 r13 r14 r15 r16
+  const __m256i r_high = _mm256_permute2x128_si256(input[0], input[1], 0x31);
+
+  // r0 r1 r2 r3 r4 r5 r6 r7 | r1 r2 r3 r4 r5 r6 r7 r8
+  src[0] = r_low;
+  // r2 r3 r4 r5 r6 r7 r8 r9 | r3 r4 r5 r6 r7 r8 r9 r10
+  src[1] = _mm256_alignr_epi8(r_high, r_low, 4);
+  // r4 r5 r6 r7 r8 r9 r10 r11 | r5 r6 r7 r8 r9 r10 r11 r12
+  src[2] = _mm256_alignr_epi8(r_high, r_low, 8);
+  // r6 r7 r8 r9 r10 r11 r12 r13 | r7 r8 r9 r10 r11 r12 r13 r14
+  src[3] = _mm256_alignr_epi8(r_high, r_low, 12);
+}
+
+static INLINE void filter_src_pixels_horiz_avx2(const __m256i *in,
+                                                const __m256i *coeffs,
+                                                const __m256i *offset,
+                                                int shift, __m256i *out) {
+  const __m256i res_0 = _mm256_madd_epi16(in[0], coeffs[0]);
+  const __m256i res_1 = _mm256_madd_epi16(in[1], coeffs[1]);
+  const __m256i res_2 = _mm256_madd_epi16(in[2], coeffs[2]);
+  const __m256i res_3 = _mm256_madd_epi16(in[3], coeffs[3]);
+
+  const __m256i res_4 = _mm256_add_epi32(
+      res_0, _mm256_add_epi32(_mm256_add_epi32(res_2, res_3), res_1));
+  const __m256i res = _mm256_add_epi32(res_4, *offset);
+  *out = _mm256_srai_epi32(res, shift);
+}
+
+static INLINE void prepare_8tap_vert_src_avx2(const __m256i *input,
+                                              __m256i *src) {
   __m256i input_01 = _mm256_packus_epi32(input[0], input[1]);
   __m256i input_23 = _mm256_packus_epi32(input[2], input[3]);
   __m256i input_45 = _mm256_packus_epi32(input[4], input[5]);
@@ -189,11 +269,10 @@
 static INLINE void store_vertical_filter_output_avx2(
     uint16_t *pred, int p_stride, ConvolveParams *conv_params, int bd,
     const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
-    const __m256i *reduce_bits_vert_const,
-    const __m128i *reduce_bits_vert_shift, const int use_wtd_comp_avg,
+    const __m256i *reduce_bits_vert_const, const int use_wtd_comp_avg,
     const __m256i *wt0, const __m256i *wt1, const __m256i *res_sub_const,
-    const __m256i *round_bits_const, __m128i *round_bits_shift, int i, int j,
-    int k, const int reduce_bits_vert) {
+    const __m256i *round_bits_const, int i, int j, int k,
+    const int reduce_bits_vert, const int round_bits) {
   const __m256i clip_pixel =
       _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
   __m256i v_sum = *res_lo;
@@ -207,12 +286,11 @@
         (__m128i *)&conv_params->dst[(i + k + 5) * conv_params->dst_stride + j];
 
     v_sum = _mm256_add_epi32(v_sum, *res_add_const);
-    v_sum = _mm256_sra_epi32(_mm256_add_epi32(v_sum, *reduce_bits_vert_const),
-                             *reduce_bits_vert_shift);
+    v_sum = _mm256_srai_epi32(_mm256_add_epi32(v_sum, *reduce_bits_vert_const),
+                              reduce_bits_vert);
     v_sum_r1 = _mm256_add_epi32(v_sum_r1, *res_add_const);
-    v_sum_r1 =
-        _mm256_sra_epi32(_mm256_add_epi32(v_sum_r1, *reduce_bits_vert_const),
-                         *reduce_bits_vert_shift);
+    v_sum_r1 = _mm256_srai_epi32(
+        _mm256_add_epi32(v_sum_r1, *reduce_bits_vert_const), reduce_bits_vert);
     if (conv_params->do_average) {
       __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
       __m128i *const dst16_r1 = (__m128i *)&pred[(i + k + 5) * p_stride + j];
@@ -236,11 +314,11 @@
       }
 
       __m256i v_sum1 = _mm256_add_epi32(v_sum, *res_sub_const);
-      v_sum1 = _mm256_sra_epi32(_mm256_add_epi32(v_sum1, *round_bits_const),
-                                *round_bits_shift);
+      v_sum1 = _mm256_srai_epi32(_mm256_add_epi32(v_sum1, *round_bits_const),
+                                 round_bits);
       __m256i v_sum1_r1 = _mm256_add_epi32(v_sum_r1, *res_sub_const);
-      v_sum1_r1 = _mm256_sra_epi32(
-          _mm256_add_epi32(v_sum1_r1, *round_bits_const), *round_bits_shift);
+      v_sum1_r1 = _mm256_srai_epi32(
+          _mm256_add_epi32(v_sum1_r1, *round_bits_const), round_bits);
 
       __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1_r1);
       v_sum16 =
@@ -317,14 +395,12 @@
   // into an unsigned 16-bit intermediate array.
   assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
 
-  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
   const __m256i reduce_bits_vert_const =
       _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
   const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
   const __m256i res_sub_const =
       _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                         (1 << (offset_bits - conv_params->round_1 - 1)));
-  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
   __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));
 
   const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
@@ -334,9 +410,10 @@
   const __m256i wt1 = _mm256_set1_epi32(w1);
 
   __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
-  __m256i v_zeros = _mm256_setzero_si256();
   int ohoriz = 1 << offset_bits_horiz;
   int mhoriz = 1 << max_bits_horiz;
+  const __m256i v_offset_bits_horiz = _mm256_set1_epi32(ohoriz);
+  const __m256i offset = _mm256_add_epi32(v_offset_bits_horiz, v_rbhoriz);
   (void)mhoriz;
   int sx;
 
@@ -355,9 +432,9 @@
       const int64_t x4 = dst_x >> subsampling_x;
       const int64_t y4 = dst_y >> subsampling_y;
 
-      const int16_t ix4 = (int16_t)(x4 >> WARPEDMODEL_PREC_BITS);
+      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
       int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      const int16_t iy4 = (int16_t)(y4 >> WARPEDMODEL_PREC_BITS);
+      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
       int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
 
 #if CONFIG_RELAX_AFFINE_CONSTRAINTS
@@ -424,18 +501,70 @@
 #endif  // CONFIG_OPFL_MEMBW_REDUCTION
         }
 #if CONFIG_OPFL_MEMBW_REDUCTION
-      } else if (((ix4 - 7) < left_limit) || ((ix4 + 8) > right_limit)) {
+      } else if (((ix4 - 7) < left_limit) || ((ix4 + 7) > right_limit)) {
+        const int out_of_boundary_left = left_limit - (ix4 - 6);
+        const int out_of_boundary_right = (ix4 + 7) - right_limit;
+        __m256i src_padded[2], coeffs[4], src[4];
+        if (alpha == 0 && beta == 0) {
+          prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sx4, &coeffs[0]);
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            iy = clamp(iy, top_limit, bottom_limit);
+
+            prepare_8tap_horiz_src_padded_avx2(
+                &ref[iy * stride + ix4 - 7], out_of_boundary_left,
+                out_of_boundary_right, &src_padded[0]);
+            prepare_8tap_horiz_src_avx2(&src_padded[0], &src[0]);
+            filter_src_pixels_horiz_avx2(src, coeffs, &offset,
+                                         reduce_bits_horiz, &tmp[k + 7]);
+          }
+        } else if (alpha == 0) {
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            iy = clamp(iy, top_limit, bottom_limit);
+
+            sx = sx4 + beta * (k + 4);
+            prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sx, coeffs);
+            prepare_8tap_horiz_src_padded_avx2(
+                &ref[iy * stride + ix4 - 7], out_of_boundary_left,
+                out_of_boundary_right, &src_padded[0]);
+            prepare_8tap_horiz_src_avx2(&src_padded[0], &src[0]);
+            filter_src_pixels_horiz_avx2(src, coeffs, &offset,
+                                         reduce_bits_horiz, &tmp[k + 7]);
+          }
+        } else if (beta == 0) {
+          prepare_8tap_filter_coeffs_avx2(sx4, alpha, &coeffs[0]);
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            iy = clamp(iy, top_limit, bottom_limit);
+
+            prepare_8tap_horiz_src_padded_avx2(
+                &ref[iy * stride + ix4 - 7], out_of_boundary_left,
+                out_of_boundary_right, &src_padded[0]);
+            prepare_8tap_horiz_src_avx2(&src_padded[0], &src[0]);
+            filter_src_pixels_horiz_avx2(src, coeffs, &offset,
+                                         reduce_bits_horiz, &tmp[k + 7]);
+          }
+        } else {
+          for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
+            int iy = iy4 + k;
+            iy = clamp(iy, top_limit, bottom_limit);
+
+            sx = sx4 + beta * (k + 4);
+            prepare_8tap_filter_coeffs_avx2(sx, alpha, &coeffs[0]);
+            prepare_8tap_horiz_src_padded_avx2(
+                &ref[iy * stride + ix4 - 7], out_of_boundary_left,
+                out_of_boundary_right, &src_padded[0]);
+            prepare_8tap_horiz_src_avx2(&src_padded[0], &src[0]);
+            filter_src_pixels_horiz_avx2(src, coeffs, &offset,
+                                         reduce_bits_horiz, &tmp[k + 7]);
+          }
+        }
 #else
       } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
-#endif  // CONFIG_OPFL_MEMBW_REDUCTION
         int32_t tmp1[8];
         for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
-#if CONFIG_OPFL_MEMBW_REDUCTION
-          const int iy = clamp(iy4 + k, top_limit, bottom_limit);
-#else
           const int iy = clamp(iy4 + k, 0, height - 1);
-#endif  // CONFIG_OPFL_MEMBW_REDUCTION
-
           sx = sx4 + beta * (k + 4);
           for (int l = -4; l < 4; ++l) {
             int ix = ix4 + l - 3;
@@ -444,11 +573,7 @@
 
             int32_t sum = 1 << offset_bits_horiz;
             for (int m = 0; m < 8; ++m) {
-#if CONFIG_OPFL_MEMBW_REDUCTION
-              const int sample_x = clamp(ix + m, left_limit, right_limit);
-#else
               const int sample_x = clamp(ix + m, 0, width - 1);
-#endif  // CONFIG_OPFL_MEMBW_REDUCTION
               sum += ref[iy * stride + sample_x] * coeffs[m];
             }
             sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
@@ -457,20 +582,11 @@
           }
           tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
         }
+#endif  // CONFIG_OPFL_MEMBW_REDUCTION
       } else {
-        if (beta == 0 && alpha == 0) {
-          sx = sx4;
-          __m128i v_01 = _mm_loadu_si128(
-              (__m128i *)
-                  av1_warped_filter[sx >>
-                                    WARPEDDIFF_PREC_BITS]);  // A7A6A5A4A3A2A1A0
-          __m256i v_c01 = _mm256_broadcastd_epi32(v_01);     // A1A0A1A0A1A0A1A0
-          __m256i v_c23 = _mm256_broadcastd_epi32(
-              _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
-          __m256i v_c45 = _mm256_broadcastd_epi32(
-              _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
-          __m256i v_c67 = _mm256_broadcastd_epi32(
-              _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
+        __m256i r[2], coeffs[4], src[4];
+        if (alpha == 0 && beta == 0) {
+          prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sx4, &coeffs[0]);
           for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
             int iy = iy4 + k;
 #if CONFIG_OPFL_MEMBW_REDUCTION
@@ -482,44 +598,10 @@
               iy = height - 1;
 #endif  // CONFIG_OPFL_MEMBW_REDUCTION
             iy = iy * stride;
-
-            __m256i v_refl = _mm256_inserti128_si256(
-                _mm256_set1_epi16(0),
-                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
-            v_refl = _mm256_inserti128_si256(
-                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
-                1);  // R15 .. R0
-
-            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
-
-            __m256i v_refu =
-                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
-            v_refl = _mm256_inserti128_si256(
-                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
-            v_refu = _mm256_inserti128_si256(
-                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
-
-            __m256i v_sum = _mm256_set1_epi32(ohoriz);
-            __m256i parsum = _mm256_madd_epi16(
-                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
-                                          0));  // R8R7R6..R1R7R6R5..R1R0
-            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
-
-            parsum = _mm256_madd_epi16(
-                v_c23,
-                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
-            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
-            parsum = _mm256_madd_epi16(
-                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
-                                          8));  // R12R11..R5R11R10..R5R4
-            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
-            parsum = _mm256_madd_epi16(
-                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
-                                          12));  // R14R13..R7R13R12..R7R6
-            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
-
-            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
-                                           reduce_bits_horiz);
+            load_horiz_src_pixels_avx2(&ref[iy + ix4 - 7], &r[0]);
+            prepare_8tap_horiz_src_avx2(&r[0], &src[0]);
+            filter_src_pixels_horiz_avx2(src, coeffs, &offset,
+                                         reduce_bits_horiz, &tmp[k + 7]);
           }
         } else if (alpha == 0) {
           for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -535,125 +617,15 @@
             iy = iy * stride;
 
             sx = sx4 + beta * (k + 4);
-
-            __m128i v_01 = _mm_loadu_si128(
-                (__m128i *)av1_warped_filter
-                    [sx >> WARPEDDIFF_PREC_BITS]);          // A7A6A5A4A3A2A1A0
-            __m256i v_c01 = _mm256_broadcastd_epi32(v_01);  // A1A0A1A0A1A0A1A0
-            __m256i v_c23 = _mm256_broadcastd_epi32(
-                _mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
-            __m256i v_c45 = _mm256_broadcastd_epi32(
-                _mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
-            __m256i v_c67 = _mm256_broadcastd_epi32(
-                _mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
-
-            __m256i v_refl = _mm256_inserti128_si256(
-                _mm256_set1_epi16(0),
-                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
-            v_refl = _mm256_inserti128_si256(
-                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
-                1);  // R15 .. R0
-
-            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
-
-            __m256i v_refu =
-                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
-
-            v_refl = _mm256_inserti128_si256(
-                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
-            v_refu = _mm256_inserti128_si256(
-                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
-
-            __m256i v_sum = _mm256_set1_epi32(ohoriz);
-            __m256i parsum =
-                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
-            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
-
-            parsum =
-                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
-            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
-            parsum =
-                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
-            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
-            parsum = _mm256_madd_epi16(v_c67,
-                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
-            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
-
-            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
-                                           reduce_bits_horiz);
+            prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sx, coeffs);
+            load_horiz_src_pixels_avx2(&ref[iy + ix4 - 7], &r[0]);
+            prepare_8tap_horiz_src_avx2(&r[0], &src[0]);
+            filter_src_pixels_horiz_avx2(src, coeffs, &offset,
+                                         reduce_bits_horiz, &tmp[k + 7]);
           }
+
         } else if (beta == 0) {
-          sx = sx4;
-          __m256i v_coeff01 = _mm256_inserti128_si256(
-              v_zeros,
-              _mm_loadu_si128(
-                  (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
-              0);
-          v_coeff01 = _mm256_inserti128_si256(
-              v_coeff01,
-              _mm_loadu_si128(
-                  (__m128i *)
-                      av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]),
-              1);  // B7B6..B1B0A7A6..A1A0
-          __m256i v_coeff23 = _mm256_inserti128_si256(
-              v_zeros,
-              _mm_loadu_si128(
-                  (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
-                                               WARPEDDIFF_PREC_BITS]),
-              0);
-          v_coeff23 = _mm256_inserti128_si256(
-              v_coeff23,
-              _mm_loadu_si128(
-                  (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
-                                               WARPEDDIFF_PREC_BITS]),
-              1);  // D7D6..D1D0C7C6..C1C0
-          __m256i v_coeff45 = _mm256_inserti128_si256(
-              v_zeros,
-              _mm_loadu_si128(
-                  (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
-                                               WARPEDDIFF_PREC_BITS]),
-              0);
-          v_coeff45 = _mm256_inserti128_si256(
-              v_coeff45,
-              _mm_loadu_si128(
-                  (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
-                                               WARPEDDIFF_PREC_BITS]),
-              1);  // F7F6..F1F0E7E6..E1E0
-          __m256i v_coeff67 = _mm256_inserti128_si256(
-              v_zeros,
-              _mm_loadu_si128(
-                  (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
-                                               WARPEDDIFF_PREC_BITS]),
-              0);
-          v_coeff67 = _mm256_inserti128_si256(
-              v_coeff67,
-              _mm_loadu_si128(
-                  (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
-                                               WARPEDDIFF_PREC_BITS]),
-              1);  // H7H6..H1H0G7G6..G1G0
-
-          __m256i v_c0123 = _mm256_unpacklo_epi32(
-              v_coeff01,
-              v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
-          __m256i v_c0123u = _mm256_unpackhi_epi32(
-              v_coeff01,
-              v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
-          __m256i v_c4567 = _mm256_unpacklo_epi32(
-              v_coeff45,
-              v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
-          __m256i v_c4567u = _mm256_unpackhi_epi32(
-              v_coeff45,
-              v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
-
-          __m256i v_c01 = _mm256_unpacklo_epi64(
-              v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
-          __m256i v_c23 =
-              _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
-          __m256i v_c45 =
-              _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
-          __m256i v_c67 =
-              _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
-
+          prepare_8tap_filter_coeffs_avx2(sx4, alpha, &coeffs[0]);
           for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
             int iy = iy4 + k;
 #if CONFIG_OPFL_MEMBW_REDUCTION
@@ -665,47 +637,11 @@
               iy = height - 1;
 #endif  // CONFIG_OPFL_MEMBW_REDUCTION
             iy = iy * stride;
-
-            __m256i v_refl = _mm256_inserti128_si256(
-                _mm256_set1_epi16(0),
-                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
-            v_refl = _mm256_inserti128_si256(
-                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
-                1);  // R15 .. R0
-
-            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
-
-            __m256i v_refu =
-                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
-
-            v_refl = _mm256_inserti128_si256(
-                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
-            v_refu = _mm256_inserti128_si256(
-                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
-
-            __m256i v_sum = _mm256_set1_epi32(ohoriz);
-            __m256i parsum = _mm256_madd_epi16(
-                v_c01, _mm256_alignr_epi8(v_refu, v_refl,
-                                          0));  // R8R7R6..R1R7R6R5..R1R0
-            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
-
-            parsum = _mm256_madd_epi16(
-                v_c23,
-                _mm256_alignr_epi8(v_refu, v_refl, 4));  // R10R9..R3R9R8..R3R2
-            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
-            parsum = _mm256_madd_epi16(
-                v_c45, _mm256_alignr_epi8(v_refu, v_refl,
-                                          8));  // R12R11..R5R11R10..R5R4
-            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
-            parsum = _mm256_madd_epi16(
-                v_c67, _mm256_alignr_epi8(v_refu, v_refl,
-                                          12));  // R14R13..R7R13R12..R7R6
-            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
-
-            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
-                                           reduce_bits_horiz);
+            load_horiz_src_pixels_avx2(&ref[iy + ix4 - 7], &r[0]);
+            prepare_8tap_horiz_src_avx2(&r[0], &src[0]);
+            filter_src_pixels_horiz_avx2(src, coeffs, &offset,
+                                         reduce_bits_horiz, &tmp[k + 7]);
           }
-
         } else {
           for (int k = -7; k < AOMMIN(8, p_height - i); ++k) {
             int iy = iy4 + k;
@@ -720,111 +656,11 @@
             iy = iy * stride;
 
             sx = sx4 + beta * (k + 4);
-
-            __m256i v_coeff01 = _mm256_inserti128_si256(
-                v_zeros,
-                _mm_loadu_si128(
-                    (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]),
-                0);
-            v_coeff01 = _mm256_inserti128_si256(
-                v_coeff01,
-                _mm_loadu_si128(
-                    (__m128i *)av1_warped_filter[(sx + alpha) >>
-                                                 WARPEDDIFF_PREC_BITS]),
-                1);  // B7B6..B1B0A7A6..A1A0
-            __m256i v_coeff23 = _mm256_inserti128_si256(
-                v_zeros,
-                _mm_loadu_si128(
-                    (__m128i *)av1_warped_filter[(sx + 2 * alpha) >>
-                                                 WARPEDDIFF_PREC_BITS]),
-                0);
-            v_coeff23 = _mm256_inserti128_si256(
-                v_coeff23,
-                _mm_loadu_si128(
-                    (__m128i *)av1_warped_filter[(sx + 3 * alpha) >>
-                                                 WARPEDDIFF_PREC_BITS]),
-                1);  // D7D6..D1D0C7C6..C1C0
-            __m256i v_coeff45 = _mm256_inserti128_si256(
-                v_zeros,
-                _mm_loadu_si128(
-                    (__m128i *)av1_warped_filter[(sx + 4 * alpha) >>
-                                                 WARPEDDIFF_PREC_BITS]),
-                0);
-            v_coeff45 = _mm256_inserti128_si256(
-                v_coeff45,
-                _mm_loadu_si128(
-                    (__m128i *)av1_warped_filter[(sx + 5 * alpha) >>
-                                                 WARPEDDIFF_PREC_BITS]),
-                1);  // F7F6..F1F0E7E6..E1E0
-            __m256i v_coeff67 = _mm256_inserti128_si256(
-                v_zeros,
-                _mm_loadu_si128(
-                    (__m128i *)av1_warped_filter[(sx + 6 * alpha) >>
-                                                 WARPEDDIFF_PREC_BITS]),
-                0);
-            v_coeff67 = _mm256_inserti128_si256(
-                v_coeff67,
-                _mm_loadu_si128(
-                    (__m128i *)av1_warped_filter[(sx + 7 * alpha) >>
-                                                 WARPEDDIFF_PREC_BITS]),
-                1);  // H7H6..H1H0G7G6..G1G0
-
-            __m256i v_c0123 = _mm256_unpacklo_epi32(
-                v_coeff01,
-                v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
-            __m256i v_c0123u = _mm256_unpackhi_epi32(
-                v_coeff01,
-                v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
-            __m256i v_c4567 = _mm256_unpacklo_epi32(
-                v_coeff45,
-                v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
-            __m256i v_c4567u = _mm256_unpackhi_epi32(
-                v_coeff45,
-                v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
-
-            __m256i v_c01 = _mm256_unpacklo_epi64(
-                v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
-            __m256i v_c23 =
-                _mm256_unpackhi_epi64(v_c0123, v_c4567);  // H3H2 ... A3A2
-            __m256i v_c45 =
-                _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
-            __m256i v_c67 =
-                _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
-
-            __m256i v_refl = _mm256_inserti128_si256(
-                _mm256_set1_epi16(0),
-                _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0);
-            v_refl = _mm256_inserti128_si256(
-                v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]),
-                1);  // R15 .. R0
-
-            __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
-
-            __m256i v_refu =
-                _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
-
-            v_refl = _mm256_inserti128_si256(
-                v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
-            v_refu = _mm256_inserti128_si256(
-                v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
-
-            __m256i v_sum = _mm256_set1_epi32(ohoriz);
-            __m256i parsum =
-                _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0));
-            __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum);
-
-            parsum =
-                _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4));
-            __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum);
-            parsum =
-                _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8));
-            __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum);
-            parsum = _mm256_madd_epi16(v_c67,
-                                       _mm256_alignr_epi8(v_refu, v_refl, 12));
-            __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum);
-
-            tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz),
-                                           reduce_bits_horiz);
+            prepare_8tap_filter_coeffs_avx2(sx, alpha, &coeffs[0]);
+            load_horiz_src_pixels_avx2(&ref[iy + ix4 - 7], &r[0]);
+            prepare_8tap_horiz_src_avx2(&r[0], &src[0]);
+            filter_src_pixels_horiz_avx2(src, coeffs, &offset,
+                                         reduce_bits_horiz, &tmp[k + 7]);
           }
         }
       }
@@ -833,13 +669,13 @@
       if (gamma == 0 && delta == 0) {
         __m256i coeffs[8], src[8];
 
-        prepare_vertical_filter_coeffs_gamma0_avx2(sy4, coeffs);
+        prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sy4, coeffs);
         coeffs[4] = coeffs[0];
         coeffs[5] = coeffs[1];
         coeffs[6] = coeffs[2];
         coeffs[7] = coeffs[3];
 
-        prepare_input_data(tmp, src);
+        prepare_8tap_vert_src_avx2(tmp, src);
 
         for (int k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
           __m256i v_sum_r0, v_sum_r1;
@@ -848,41 +684,41 @@
 
           store_vertical_filter_output_avx2(
               pred, p_stride, conv_params, bd, &v_sum_r0, &v_sum_r1,
-              &res_add_const, &reduce_bits_vert_const, &reduce_bits_vert_shift,
-              use_wtd_comp_avg, &wt0, &wt1, &res_sub_const, &round_bits_const,
-              &round_bits_shift, i, j, k, reduce_bits_vert);
+              &res_add_const, &reduce_bits_vert_const, use_wtd_comp_avg, &wt0,
+              &wt1, &res_sub_const, &round_bits_const, i, j, k,
+              reduce_bits_vert, round_bits);
         }
       } else if (gamma == 0) {
         __m256i coeffs[8], src[8];
-        prepare_input_data(tmp, src);
+        prepare_8tap_vert_src_avx2(tmp, src);
 
         for (int k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
           __m256i v_sum_r0, v_sum_r1;
           int sy_0 = sy4 + delta * (k + 4);
-          prepare_vertical_filter_coeffs_gamma0_avx2(sy_0, &coeffs[0]);
+          prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sy_0, &coeffs[0]);
 
           int sy_1 = sy4 + delta * (k + 5);
-          prepare_vertical_filter_coeffs_gamma0_avx2(sy_1, &coeffs[4]);
+          prepare_8tap_filter_coeffs_alpha0_gamma0_avx2(sy_1, &coeffs[4]);
 
           filter_src_pixels_vertical_avx2(tmp, src, coeffs, &v_sum_r0,
                                           &v_sum_r1, k);
 
           store_vertical_filter_output_avx2(
               pred, p_stride, conv_params, bd, &v_sum_r0, &v_sum_r1,
-              &res_add_const, &reduce_bits_vert_const, &reduce_bits_vert_shift,
-              use_wtd_comp_avg, &wt0, &wt1, &res_sub_const, &round_bits_const,
-              &round_bits_shift, i, j, k, reduce_bits_vert);
+              &res_add_const, &reduce_bits_vert_const, use_wtd_comp_avg, &wt0,
+              &wt1, &res_sub_const, &round_bits_const, i, j, k,
+              reduce_bits_vert, round_bits);
         }
       } else if (delta == 0) {
         __m256i coeffs[8], src[8];
-        prepare_vertical_filter_coeffs_avx2(sy4, gamma, &coeffs[0]);
+        prepare_8tap_filter_coeffs_avx2(sy4, gamma, &coeffs[0]);
 
         coeffs[4] = coeffs[0];
         coeffs[5] = coeffs[1];
         coeffs[6] = coeffs[2];
         coeffs[7] = coeffs[3];
 
-        prepare_input_data(tmp, src);
+        prepare_8tap_vert_src_avx2(tmp, src);
 
         for (int k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
           __m256i v_sum_r0, v_sum_r1;
@@ -891,31 +727,31 @@
 
           store_vertical_filter_output_avx2(
               pred, p_stride, conv_params, bd, &v_sum_r0, &v_sum_r1,
-              &res_add_const, &reduce_bits_vert_const, &reduce_bits_vert_shift,
-              use_wtd_comp_avg, &wt0, &wt1, &res_sub_const, &round_bits_const,
-              &round_bits_shift, i, j, k, reduce_bits_vert);
+              &res_add_const, &reduce_bits_vert_const, use_wtd_comp_avg, &wt0,
+              &wt1, &res_sub_const, &round_bits_const, i, j, k,
+              reduce_bits_vert, round_bits);
         }
       } else {
         __m256i src[8];
-        prepare_input_data(tmp, src);
+        prepare_8tap_vert_src_avx2(tmp, src);
 
         for (int k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
           __m256i coeffs[8];
           __m256i v_sum_r0, v_sum_r1;
           int sy_0 = sy4 + delta * (k + 4);
-          prepare_vertical_filter_coeffs_avx2(sy_0, gamma, &coeffs[0]);
+          prepare_8tap_filter_coeffs_avx2(sy_0, gamma, &coeffs[0]);
 
           int sy_1 = sy4 + delta * (k + 5);
-          prepare_vertical_filter_coeffs_avx2(sy_1, gamma, &coeffs[4]);
+          prepare_8tap_filter_coeffs_avx2(sy_1, gamma, &coeffs[4]);
 
           filter_src_pixels_vertical_avx2(tmp, src, coeffs, &v_sum_r0,
                                           &v_sum_r1, k);
 
           store_vertical_filter_output_avx2(
               pred, p_stride, conv_params, bd, &v_sum_r0, &v_sum_r1,
-              &res_add_const, &reduce_bits_vert_const, &reduce_bits_vert_shift,
-              use_wtd_comp_avg, &wt0, &wt1, &res_sub_const, &round_bits_const,
-              &round_bits_shift, i, j, k, reduce_bits_vert);
+              &res_add_const, &reduce_bits_vert_const, use_wtd_comp_avg, &wt0,
+              &wt1, &res_sub_const, &round_bits_const, i, j, k,
+              reduce_bits_vert, round_bits);
         }
       }
     }
@@ -934,13 +770,13 @@
 
   // c0 c1 c0 c1 c0 c1 c0 c1 | f0 f1 f0 f1 f0 f1 f0 f1
   coeff[0] = _mm256_shuffle_epi8(
-      filt, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0));
+      filt, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask0));
   // c2 c3 c2 c3 c2 c3 c2 c3 | f2 f3 f2 f3 f2 f3 f2 f3
   coeff[1] = _mm256_shuffle_epi8(
-      filt, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1));
+      filt, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask1));
   // c4 c5 c4 c5 c4 c5 c4 c5 | f4 f5 f4 f5 f4 f5 f4 f5
   coeff[2] = _mm256_shuffle_epi8(
-      filt, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2));
+      filt, _mm256_load_si256((__m256i *)shuffle_alpha0_gamma0_mask2));
 }
 
 static INLINE void ext_highbd_warp_horizontal_filter_avx2(
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index 28921b5..3a473fd 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -51,6 +51,8 @@
     if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
   }
 
+  // TODO(any): Consider tools like OPFL, DMVR in the match criteria.
+
   // Store the stats for COMPOUND_AVERAGE and COMPOUND_DISTWTD
   for (int comp_type = COMPOUND_AVERAGE; comp_type < COMPOUND_WEDGE;
        comp_type++) {
@@ -1464,9 +1466,14 @@
   int32_t comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX };
   int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX };
   int match_index = 0;
+  const int reuse_compound_type_data =
+      cpi->sf.inter_sf.reuse_compound_type_data;
   const int match_found =
-      find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate,
-                            comp_model_dist, comp_rs2, &match_index);
+      reuse_compound_type_data
+          ? find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist,
+                                  comp_model_rate, comp_model_dist, comp_rs2,
+                                  &match_index)
+          : 0;
   best_mv[0].as_int = cur_mv[0].as_int;
   best_mv[1].as_int = cur_mv[1].as_int;
   *rd = INT64_MAX;
@@ -1513,7 +1520,7 @@
 #if CONFIG_REFINEMV
       (!mbmi->refinemv_flag || !switchable_refinemv_flag(cm, mbmi)) &&
 #endif  // CONFIG_REFINEMV
-      cpi->sf.inter_sf.reuse_compound_type_decision) {
+      (reuse_compound_type_data >= 2)) {
     return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv,
                                          comp_rate, comp_dist, comp_rs2,
                                          rate_mv, rd, match_index);
@@ -1667,7 +1674,7 @@
     }
   }
   restore_dst_buf(xd, *orig_dst, 1);
-  if (!match_found)
+  if (!match_found && reuse_compound_type_data)
     save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate,
                              comp_model_dist, cur_mv, comp_rs2);
   return best_type_stats.best_compmode_interinter_cost;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index a6dc591..06e37da 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -11538,6 +11538,7 @@
   if (search_state.best_skip2 == 0) {
     const int try_intrabc = cpi->oxcf.kf_cfg.enable_intrabc &&
                             cpi->oxcf.kf_cfg.enable_intrabc_ext &&
+                            !sf->inter_sf.skip_eval_intrabc_in_inter_frame &&
                             av1_allow_intrabc(cm, xd
 #if CONFIG_ENABLE_IBC_NAT
                                               ,
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 20eeceb..35501ed 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -359,6 +359,10 @@
   sf->inter_sf.reduce_inter_modes = 1;
   sf->inter_sf.selective_ref_frame = 1;
   sf->inter_sf.skip_mode_eval_based_on_rate_cost = 1;
+  sf->inter_sf.skip_eval_intrabc_in_inter_frame =
+      cm->features.allow_screen_content_tools ? 0
+      : cm->current_frame.pyramid_level < 3   ? 0
+                                              : 1;
 
   sf->intra_sf.intra_pruning_with_hog = 1;
   sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f;
@@ -366,7 +370,8 @@
   sf->intra_sf.reuse_uv_mode_rd_info = true;
 #endif  // CONFIG_AIMC
 
-  sf->tx_sf.adaptive_txb_search_level = 1;
+  sf->tx_sf.adaptive_tx_type_search_idx = 1;
+  sf->tx_sf.adaptive_tx_partition_type_search_idx = 1;
   sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
   sf->tx_sf.model_based_prune_tx_search_level = 1;
   sf->tx_sf.prune_tx_rd_eval_sec_tx_sse = true;
@@ -390,7 +395,8 @@
   if (speed >= 1) {
     sf->inter_sf.selective_ref_frame = 2;
 
-    sf->tx_sf.adaptive_txb_search_level = 2;
+    sf->tx_sf.adaptive_tx_type_search_idx = 4;
+    sf->tx_sf.adaptive_tx_partition_type_search_idx = 4;
 
     sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1;
   }
@@ -435,7 +441,8 @@
     sf->intra_sf.skip_intra_dip_search = true;
 #endif  // CONFIG_DIP
 
-    sf->tx_sf.adaptive_txb_search_level = 2;
+    sf->tx_sf.adaptive_tx_type_search_idx = 4;
+    sf->tx_sf.adaptive_tx_partition_type_search_idx = 4;
     sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
     sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
     sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
@@ -512,7 +519,9 @@
     sf->inter_sf.selective_ref_frame = 4;
     sf->inter_sf.skip_repeated_ref_mv = 1;
     sf->inter_sf.skip_repeated_full_newmv = 1;
-    sf->inter_sf.reuse_compound_type_decision = 1;
+    // TODO(any): Set this speed feature to 2 after correcting the match
+    // criteria by considering tools like OPFL, DMVR.
+    sf->inter_sf.reuse_compound_type_data = 0;
     sf->inter_sf.txfm_rd_gate_level =
         boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2);
 
@@ -530,7 +539,8 @@
 
     sf->tx_sf.tx_type_search.skip_stx_search = 1;
     sf->tx_sf.tx_type_search.skip_cctx_search = 1;
-    sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3;
+    sf->tx_sf.adaptive_tx_type_search_idx = boosted ? 4 : 5;
+    sf->tx_sf.adaptive_tx_partition_type_search_idx = boosted ? 4 : 5;
     sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
 
     // TODO(any): Refactor the code related to following winner mode speed
@@ -814,6 +824,7 @@
   inter_sf->prune_comp_search_by_single_result = 0;
   inter_sf->skip_repeated_ref_mv = 0;
   inter_sf->skip_repeated_newmv = 0;
+  inter_sf->skip_eval_intrabc_in_inter_frame = 0;
   inter_sf->skip_repeated_full_newmv = 0;
   inter_sf->inter_mode_rd_model_estimation = 0;
   inter_sf->prune_compound_using_single_ref = 0;
@@ -835,7 +846,7 @@
   inter_sf->disable_interinter_wedge = 0;
   inter_sf->prune_ref_mv_idx_search = 0;
   inter_sf->prune_warped_prob_thresh = 0;
-  inter_sf->reuse_compound_type_decision = 0;
+  inter_sf->reuse_compound_type_data = 0;
   inter_sf->txfm_rd_gate_level = 0;
   inter_sf->prune_inter_modes_if_skippable = 0;
   inter_sf->disable_masked_comp = 0;
@@ -892,7 +903,8 @@
   tx_sf->tx_type_search.prune_tx_type_est_rd = 0;
   tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0;
   tx_sf->txb_split_cap = 1;
-  tx_sf->adaptive_txb_search_level = 0;
+  tx_sf->adaptive_tx_type_search_idx = 0;
+  tx_sf->adaptive_tx_partition_type_search_idx = 0;
   tx_sf->use_intra_txb_hash = 0;
   tx_sf->use_inter_txb_hash = 1;
   tx_sf->refine_fast_tx_search_results = 1;
@@ -1359,9 +1371,17 @@
 
   if (cpi->oxcf.mode == GOOD && speed == 0) {
     const int qindex_thresh = 124 + qindex_offset;
+    const int qindex_thresh2 = 135 + qindex_offset;
     if (cm->quant_params.base_qindex <= qindex_thresh) {
-      sf->tx_sf.adaptive_txb_search_level =
+      sf->tx_sf.adaptive_tx_type_search_idx =
           (boosted || cm->features.allow_screen_content_tools) ? 1 : 2;
+      sf->tx_sf.adaptive_tx_partition_type_search_idx =
+          (boosted || cm->features.allow_screen_content_tools) ? 1 : 2;
+    } else if (cm->quant_params.base_qindex <= qindex_thresh2) {
+      sf->tx_sf.adaptive_tx_partition_type_search_idx =
+          (boosted || cm->features.allow_screen_content_tools) ? 1 : 3;
+      sf->tx_sf.adaptive_tx_type_search_idx =
+          (boosted || cm->features.allow_screen_content_tools) ? 1 : 3;
     }
   }
 
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 491af36..ecb5882 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -662,6 +662,9 @@
   // flag to skip NEWMV mode in drl if the motion search result is the same
   int skip_repeated_newmv;
 
+  // flag to skip the evaluation of intrabc mode in inter frame
+  int skip_eval_intrabc_in_inter_frame;
+
   // flag to early terminate jmvd scaling factors
   int early_terminate_jmvd_scale_factor;
 
@@ -803,8 +806,9 @@
 
   // Reuse compound type rd decision when exact match is found
   // 0: No reuse
-  // 1: Reuse the compound type decision
-  int reuse_compound_type_decision;
+  // 1: Reuse the compound type rd data
+  // 2: Reuse the compound type decision
+  int reuse_compound_type_data;
 
   // Enable/disable masked compound.
   int disable_masked_comp;
@@ -890,11 +894,15 @@
   // is selected as all zero coefficients.
   int txb_split_cap;
 
-  // Shortcut the transform block partition and type search when the target
-  // rdcost is relatively lower.
-  // Values are 0 (not used) , or 1 - 2 with progressively increasing
-  // aggressiveness
-  int adaptive_txb_search_level;
+  // Prune transform type evaluation when target rdcost is low as
+  // compared to best rdcost and based on eob.
+  // 0: no pruning
+  // 1,4,5: pruning based on best rd
+  // 2,3: pruning based on eob and best rd
+  int adaptive_tx_type_search_idx;
+  // Prune transform partition type evaluation when target rdcost is low as
+  // compared to TX_PARTITION_NONE and based on the transform size.
+  int adaptive_tx_partition_type_search_idx;
 
   // Prune level for tx_size_type search for inter based on rd model
   // 0: no pruning
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 875461f..5a5efdb 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -127,6 +127,16 @@
 #endif  // CONFIG_EXT_RECUR_PARTITIONS
 };
 
+// look-up table of transform partition type pruning levels used to prune the
+// evaluation of transform partition types based on the TX_PARTITION_NONE rd.
+static const int tx_partition_prune_level[2][6] = { { 0, 1, 3, 3, 2, 3 },
+                                                    { 0, 1, 2, 1, 2, 3 } };
+
+// look-up table of transform type pruning level used to prune the evaluation of
+// transform type based on best rd and eob.
+static const int tx_type_prune_level[2][6] = { { 0, 1, 2, 1, 2, 3 },
+                                               { 0, 1, 3, 3, 2, 3 } };
+
 static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
                                 const uint32_t hash) {
   // Linear search through the circular buffer to find matching hash.
@@ -2867,6 +2877,7 @@
       xd, plane, blk_col, blk_row, txw, txh, cm->width, cm->height, NULL, NULL);
 #endif  // CONFIG_E191_OFS_PRED_RES_HANDLE
 
+  const int max_eob = av1_get_max_eob(tx_size);
   // Iterate through all transform type candidates.
   for (int idx = 0; idx < TX_TYPES; ++idx) {
 #if CONFIG_TX_TYPE_FLEX_IMPROVE
@@ -3257,14 +3268,17 @@
         }
 #endif  // COLLECT_TX_SIZE_DATA
 
-        // If the current best RD cost is much worse than the reference RD cost,
-        // terminate early.
-        if (cpi->sf.tx_sf.adaptive_txb_search_level) {
-          if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) >
-              ref_best_rd) {
-            skip_idx = true;
-            break;
-          }
+        assert(cpi->sf.tx_sf.adaptive_tx_type_search_idx < 6);
+        // Terminate the search early if the best rd is higher than the
+        // reference best rd and the number of coded coefficients is smaller
+        // than a threshold.
+        const int search_level =
+            tx_type_prune_level[p->eobs[block] < max_eob / 8]
+                               [cpi->sf.tx_sf.adaptive_tx_type_search_idx];
+        if (search_level &&
+            (best_rd - (best_rd >> search_level)) > ref_best_rd) {
+          skip_idx = true;
+          break;
         }
 
         // Terminate transform type search if the block has been quantized to
@@ -3820,6 +3834,7 @@
   const int txw = tx_size_wide[max_tx_size];
   const int txh = tx_size_high[max_tx_size];
   const int is_vert_rect = (txh > txw);
+  const int max_txw_txh = AOMMAX(txw, txh);
   assert(max_tx_size < TX_SIZES_ALL);
   TX_SIZE sub_txs[MAX_TX_PARTITIONS] = { 0 };
 
@@ -3982,12 +3997,13 @@
         if (p->eobs[block] == 0) break;
       }
 
-      const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
-      if (search_level) {
-        if ((tmp_rd - (tmp_rd >> search_level)) > ref_best_rd) {
-          *is_cost_valid = 0;
-          break;
-        }
+      const int search_level =
+          tx_partition_prune_level[max_txw_txh == 64]
+                                  [cpi->sf.tx_sf
+                                       .adaptive_tx_partition_type_search_idx];
+      if (search_level && (tmp_rd - (tmp_rd >> search_level)) > ref_best_rd) {
+        *is_cost_valid = 0;
+        break;
       }
     }
   }
@@ -4061,8 +4077,11 @@
                           plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
                           ftxs_mode, rd_info_node, &no_split);
 
+    assert(cpi->sf.tx_sf.adaptive_tx_partition_type_search_idx < 6);
     // Speed features for early termination.
-    const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
+    const int search_level =
+        tx_partition_prune_level[1][cpi->sf.tx_sf
+                                        .adaptive_tx_partition_type_search_idx];
     if (search_level) {
       if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) {
         *is_cost_valid = 0;
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 257c873..16c7310 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -116,6 +116,7 @@
       sizes.insert(BlockSize(w / 2, h / 2));
     }
   }
+  sizes.insert(BlockSize(24, 24));
   std::vector<TestParam<T>> result;
   for (const BlockSize &block : sizes) {
     for (int bd : bit_depths) {
@@ -142,7 +143,7 @@
 TEST_F(AV1ConvolveParametersTest, GetHighbdTestParams) {
   auto v = GetHighbdTestParams(av1_highbd_convolve_x_sr_c);
 #if CONFIG_EXT_RECUR_PARTITIONS
-  ASSERT_EQ(80U, v.size());
+  ASSERT_EQ(82U, v.size());
 #else
   ASSERT_EQ(60U, v.size());
 #endif  // CONFIG_EXT_RECUR_PARTITIONS
diff --git a/test/bawp_test.cc b/test/bawp_test.cc
new file mode 100644
index 0000000..26cd971
--- /dev/null
+++ b/test/bawp_test.cc
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 3-Clause Clear License
+ * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
+ * License was not distributed with this source code in the LICENSE file, you
+ * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
+ * Alliance for Open Media Patent License 1.0 was not distributed with this
+ * source code in the PATENTS file, you can obtain it at
+ * aomedia.org/license/patent-license/.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/util.h"
+
+#if CONFIG_BAWP
+namespace {
+typedef void (*make_bawp_func)(uint16_t *dst, int dst_stride, int16_t alpha,
+                               int32_t beta, int shift, int bw, int bh, int bd);
+#if HAVE_AVX2
+const BLOCK_SIZE kValidBlockSize[] = {
+  BLOCK_4X4,     BLOCK_4X8,     BLOCK_8X4,     BLOCK_8X8,
+  BLOCK_8X16,    BLOCK_16X8,    BLOCK_16X16,   BLOCK_16X32,
+  BLOCK_32X16,   BLOCK_32X32,   BLOCK_32X64,   BLOCK_64X32,
+  BLOCK_64X64,   BLOCK_64X128,  BLOCK_128X64,  BLOCK_128X128,
+#if CONFIG_EXT_RECUR_PARTITIONS
+  BLOCK_128X256, BLOCK_256X128, BLOCK_256X256,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+  BLOCK_4X16,    BLOCK_16X4,    BLOCK_8X32,    BLOCK_32X8,
+  BLOCK_16X64,   BLOCK_64X16,
+#if CONFIG_EXT_RECUR_PARTITIONS
+  BLOCK_4X32,    BLOCK_32X4,    BLOCK_8X64,    BLOCK_64X8,
+  BLOCK_4X64,    BLOCK_64X4,
+#endif  // CONFIG_EXT_RECUR_PARTITIONS
+};
+#endif  // HAVE_AVX2
+
+typedef std::tuple<make_bawp_func, BLOCK_SIZE> BAWPParam;
+
+class BAWPTest : public ::testing::TestWithParam<BAWPParam> {
+ public:
+  ~BAWPTest();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(make_bawp_func test_impl, BLOCK_SIZE bsize);
+  void RunSpeedTest(make_bawp_func test_impl, BLOCK_SIZE bsize);
+  bool CheckResult(int width, int height) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int idx = y * width + x;
+        if (pred1_[idx] != pred2_[idx]) {
+          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, x, y);
+          printf("%d != %d ", pred1_[idx], pred2_[idx]);
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  uint16_t *pred1_;
+  uint16_t *pred2_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BAWPTest);
+
+BAWPTest::~BAWPTest() {}
+
+void BAWPTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+
+  pred1_ = (uint16_t *)aom_memalign(
+      16, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(uint16_t));
+  ASSERT_NE(pred1_, nullptr);
+  pred2_ = (uint16_t *)aom_memalign(
+      16, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(uint16_t));
+  ASSERT_NE(pred2_, nullptr);
+  for (int i = 0; i < (MAX_SB_SIZE * MAX_SB_SIZE); ++i) {
+    pred1_[i] = rnd_.Rand16();
+    pred2_[i] = pred1_[i];
+  }
+}
+
+void BAWPTest::TearDown() {
+  aom_free(pred1_);
+  aom_free(pred2_);
+}
+
+void BAWPTest::RunCheckOutput(make_bawp_func test_impl, BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int16_t alpha = 320;
+  const int32_t beta = -42036;
+  const int shift = 8;
+  int bd[3] = { 8, 10, 12 };
+  for (int i = 0; i < 3; ++i) {
+    av1_make_bawp_block_c(pred1_, MAX_SB_SIZE, alpha, beta, shift, w, h, bd[i]);
+    test_impl(pred2_, MAX_SB_SIZE, alpha, beta, shift, w, h, bd[i]);
+
+    ASSERT_EQ(CheckResult(w, h), true);
+  }
+}
+
+void BAWPTest::RunSpeedTest(make_bawp_func test_impl, BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int num_loops = 1000000000 / (w + h);
+  const int16_t alpha = 320;
+  const int32_t beta = -42036;
+  const int shift = 8;
+  int bd = 8;
+
+  make_bawp_func functions[2] = { av1_make_bawp_block_c, test_impl };
+  double elapsed_time[2] = { 0.0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    make_bawp_func func = functions[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(pred1_, MAX_SB_SIZE, alpha, beta, shift, w, h, bd);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time;
+  }
+  printf("BAWP %3dx%-3d: c_time=%7.2fs, simd_time=%7.2fs, scaling=%3.2f\n", w,
+         h, elapsed_time[0], elapsed_time[1],
+         elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(BAWPTest, CheckOutput) { RunCheckOutput(GET_PARAM(0), GET_PARAM(1)); }
+
+TEST_P(BAWPTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0), GET_PARAM(1)); }
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, BAWPTest,
+    ::testing::Combine(::testing::Values(&av1_make_bawp_block_avx2),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif  // HAVE_AVX2
+}  // namespace
+#endif  // CONFIG_BAWP
diff --git a/test/test.cmake b/test/test.cmake
index 33fb16d..c5fe3ac 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -90,6 +90,7 @@
     APPEND
     AOM_UNIT_TEST_COMMON_SOURCES
     "${AOM_ROOT}/test/av1_common_int_test.cc"
+    "${AOM_ROOT}/test/bawp_test.cc"
     "${AOM_ROOT}/test/cdef_test.cc"
     "${AOM_ROOT}/test/cfl_test.cc"
     "${AOM_ROOT}/test/convolve_test.cc"
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc
index 3cdd921..03497e3 100644
--- a/test/warp_filter_test.cc
+++ b/test/warp_filter_test.cc
@@ -60,11 +60,11 @@
 #endif  // CONFIG_EXT_WARP_FILTER
 
 #if HAVE_AVX2
-#if !CONFIG_OPFL_MEMBW_REDUCTION
+#if CONFIG_OPFL_MEMBW_REDUCTION
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1HighbdWarpFilterTest,
     libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_avx2));
-#endif  // !CONFIG_OPFL_MEMBW_REDUCTION
+#endif  // CONFIG_OPFL_MEMBW_REDUCTION
 #endif  // HAVE_AVX2
 
 }  // namespace
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc
index eac44b6..712d9a4 100644
--- a/test/warp_filter_test_util.cc
+++ b/test/warp_filter_test_util.cc
@@ -18,8 +18,8 @@
 namespace libaom_test {
 
 int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits) {
-  // 1 in 8 chance of generating zero (arbitrarily chosen)
-  if (((rnd->Rand8()) & 7) == 0) return 0;
+  // 1 in 32 chance of generating zero (arbitrarily chosen)
+  if (((rnd->Rand8()) & 0x1f) == 0) return 0;
   // Otherwise, enerate uniform values in the range
   // [-(1 << bits), 1] U [1, 1<<bits]
   int32_t v = 1 + (rnd->Rand16() & ((1 << bits) - 1));
@@ -40,6 +40,8 @@
              (1 << WARPEDMODEL_PREC_BITS);
     mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
 
+    if (is_alpha_zero == 1) mat[2] = 1 << WARPEDMODEL_PREC_BITS;
+    if (is_beta_zero == 1) mat[3] = 0;
     if (rnd8 <= 1) {
       // AFFINE
       mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
@@ -52,14 +54,12 @@
       mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
       mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
                (1 << WARPEDMODEL_PREC_BITS);
-      if (is_alpha_zero == 1) mat[2] = 1 << WARPEDMODEL_PREC_BITS;
-      if (is_beta_zero == 1) mat[3] = 0;
-      if (is_gamma_zero == 1) mat[4] = 0;
-      if (is_delta_zero == 1)
-        mat[5] = static_cast<int32_t>(
-            ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) +
-            (1 << WARPEDMODEL_PREC_BITS));
     }
+    if (is_gamma_zero == 1) mat[4] = 0;
+    if (is_delta_zero == 1)
+      mat[5] = static_cast<int32_t>(
+          ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) +
+          (1 << WARPEDMODEL_PREC_BITS));
 
     // Calculate the derived parameters and check that they are suitable
     // for the warp filter.
@@ -98,6 +98,31 @@
     return;
   }
 }
+#if CONFIG_OPFL_MEMBW_REDUCTION
+void generate_ref_area_limits(libaom_test::ACMRandom *rnd,
+                              ReferenceArea *ref_area, int w, int h, int out_w,
+                              int out_h, int p_row, int p_col, int *mat,
+                              int use_ref_area_pad) {
+  int left_limit;
+  if (use_ref_area_pad) {
+    const int32_t src_x = (p_col + 4);
+    const int32_t src_y = (p_row + 4);
+    const int64_t dst_x =
+        (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+    const int32_t ix4 = (int32_t)(dst_x >> WARPEDMODEL_PREC_BITS);
+    left_limit = ix4 - 7 + 3;
+  } else {
+    left_limit = rnd->Rand8() % (w - 1);
+  }
+  ref_area->pad_block.x0 = left_limit;
+  ref_area->pad_block.x1 = ref_area->pad_block.x0 + out_w + 7;
+  ref_area->pad_block.y0 = rnd->Rand8() % (h - 1);
+  ref_area->pad_block.y1 = ref_area->pad_block.y0 + out_h + 7;
+
+  ref_area->pad_block.x1 = CLIP(ref_area->pad_block.x1, 1, w);
+  ref_area->pad_block.y1 = CLIP(ref_area->pad_block.y1, 1, h);
+}
+#endif
 
 namespace AV1HighbdWarpFilter {
 ::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
@@ -136,7 +161,7 @@
   const int out_w = std::get<0>(param), out_h = std::get<1>(param);
   const int bd = std::get<3>(param);
   const int mask = (1 << bd) - 1;
-  int sub_x, sub_y;
+  int sub_x, sub_y, p_row, p_col;
 
   // The warp functions always write rows with widths that are multiples of 8.
   // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
@@ -164,6 +189,8 @@
 
   sub_x = 0;
   sub_y = 0;
+  p_row = 32;
+  p_col = 32;
   int do_average = 0;
   conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
 
@@ -172,8 +199,8 @@
   aom_usec_timer_start(&timer);
 
   for (int i = 0; i < num_loops; ++i)
-    test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w,
-              sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta
+    test_impl(mat, input, w, h, stride, output, p_col, p_row, out_w, out_h,
+              out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta
 #if CONFIG_OPFL_MEMBW_REDUCTION
               ,
               0, NULL
@@ -181,9 +208,27 @@
     );
 
   aom_usec_timer_mark(&timer);
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
   printf("highbd warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
-         1000.0 * elapsed_time / num_loops);
+         1000.0 * elapsed_time1 / num_loops);
+
+#if CONFIG_OPFL_MEMBW_REDUCTION
+  ReferenceArea ref_area;
+  generate_ref_area_limits(&rnd_, &ref_area, w, h, out_w, out_h, p_row, p_col,
+                           mat, 1);
+  aom_usec_timer_start(&timer);
+
+  for (int i = 0; i < num_loops; ++i)
+    test_impl(mat, input, w, h, stride, output, p_col, p_row, out_w, out_h,
+              out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta,
+              1, &ref_area);
+
+  aom_usec_timer_mark(&timer);
+  const int elapsed_time2 = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("highbd warp using ref area padding %3dx%-3d: %7.2f ns\n", out_w,
+         out_h, 1000.0 * elapsed_time2 / num_loops);
+#endif
 
   delete[] input_;
   delete[] output;
@@ -193,6 +238,7 @@
 void AV1HighbdWarpFilterTest::RunCheckOutput(
     highbd_warp_affine_func test_impl) {
   const int w = 128, h = 128;
+  const int p_row = 32, p_col = 32;
   const int border = 16;
   const int stride = w + 2 * border;
   HighbdWarpTestParam param = GET_PARAM(0);
@@ -218,6 +264,9 @@
   ConvolveParams conv_params = get_conv_params(0, 0, bd);
   CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
   CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n];
+#if CONFIG_OPFL_MEMBW_REDUCTION
+  ReferenceArea ref_area;
+#endif
   for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand16();
 
   for (i = 0; i < num_iters; ++i) {
@@ -230,6 +279,11 @@
         input[r * stride + w + c] = input[r * stride + (w - 1)];
       }
     }
+#if CONFIG_OPFL_MEMBW_REDUCTION
+    int use_damr_padding = i % 2 == 0;
+    generate_ref_area_limits(&rnd_, &ref_area, w, h, out_w, out_h, p_row, p_col,
+                             NULL, 0);
+#endif
     const int use_no_round = rnd_.Rand8() & 1;
     for (sub_x = 0; sub_x < 2; ++sub_x)
       for (sub_y = 0; sub_y < 2; ++sub_y) {
@@ -251,12 +305,13 @@
                 conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
               }
 
-              av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32,
-                                       out_w, out_h, out_w, sub_x, sub_y, bd,
-                                       &conv_params, alpha, beta, gamma, delta
+              av1_highbd_warp_affine_c(mat, input, w, h, stride, output, p_col,
+                                       p_row, out_w, out_h, out_w, sub_x, sub_y,
+                                       bd, &conv_params, alpha, beta, gamma,
+                                       delta
 #if CONFIG_OPFL_MEMBW_REDUCTION
                                        ,
-                                       0, NULL
+                                       use_damr_padding, &ref_area
 #endif  // CONFIG_OPFL_MEMBW_REDUCTION
               );
               if (use_no_round) {
@@ -270,12 +325,12 @@
                 conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
                 conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
               }
-              test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
-                        out_w, sub_x, sub_y, bd, &conv_params, alpha, beta,
-                        gamma, delta
+              test_impl(mat, input, w, h, stride, output2, p_col, p_row, out_w,
+                        out_h, out_w, sub_x, sub_y, bd, &conv_params, alpha,
+                        beta, gamma, delta
 #if CONFIG_OPFL_MEMBW_REDUCTION
                         ,
-                        0, NULL
+                        use_damr_padding, &ref_area
 #endif  // CONFIG_OPFL_MEMBW_REDUCTION
               );
 
diff --git a/test/warp_filter_test_util.h b/test/warp_filter_test_util.h
index a9d77ed..ae1af23 100644
--- a/test/warp_filter_test_util.h
+++ b/test/warp_filter_test_util.h
@@ -34,6 +34,12 @@
                            int16_t *alpha, int16_t *beta, int16_t *gamma,
                            int16_t *delta, int is_alpha_zero, int is_beta_zero,
                            int is_gamma_zero, int is_delta_zero);
+#if CONFIG_OPFL_MEMBW_REDUCTION
+void generate_ref_area_limits(libaom_test::ACMRandom *rnd,
+                              ReferenceArea *ref_area, int w, int h, int out_w,
+                              int out_h, int p_row, int p_col, int *mat,
+                              int use_ref_area_pad);
+#endif
 
 namespace AV1WarpFilter {