CWG-E191 for alt-v1-anchor: Filling residual pixels outside the actual frame border

STATS_CHANGED
diff --git a/aom_mem/aom_mem.c b/aom_mem/aom_mem.c
index e977b01..78444c2 100644
--- a/aom_mem/aom_mem.c
+++ b/aom_mem/aom_mem.c
@@ -82,3 +82,12 @@
   for (i = 0; i < length; i++) *dest16++ = val;
   return dest;
 }
+
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+void *aom_memset_int16(void *dest, int16_t val, size_t length) {
+  // Stores val into each of the length int16_t elements starting at dest.
+  int16_t *const out = (int16_t *)dest;
+  for (size_t idx = 0; idx < length; ++idx) out[idx] = val;
+  return dest;  // the caller's pointer, unchanged
+}
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
diff --git a/aom_mem/aom_mem.h b/aom_mem/aom_mem.h
index bc5d8bc..1b6855f 100644
--- a/aom_mem/aom_mem.h
+++ b/aom_mem/aom_mem.h
@@ -37,6 +37,9 @@
 void *aom_calloc(size_t num, size_t size);
 void aom_free(void *memblk);
 void *aom_memset16(void *dest, int val, size_t length);
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+void *aom_memset_int16(void *dest, int16_t val, size_t length);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
 
 /*returns an addr aligned to the byte boundary specified by align*/
 #define aom_align_addr(addr, align) \
diff --git a/aom_ports/mem.h b/aom_ports/mem.h
index 9e3d424..7ce170e 100644
--- a/aom_ports/mem.h
+++ b/aom_ports/mem.h
@@ -62,6 +62,10 @@
   (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
 
 #define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+#define DIVIDE_AND_ROUND_SIGNED(n, d) /* n / d, ties away from zero */ \
+  ((((n) < 0) ^ ((d) < 0)) ? (((n) - (d) / 2) / (d)) : (((n) + (d) / 2) / (d)))
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
 
 #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
 #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index f33094d..ba820ea 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -475,7 +475,9 @@
                                    RD_STATS *rd_stats) {
   MACROBLOCKD *const xd = &x->e_mbd;
   if (ref_best_rd < 0) return INT64_MAX;
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
   av1_subtract_plane(x, bs, 0);
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
   x->rd_model = LOW_TXFM_RD;
   const int skip_trellis = (cpi->optimize_seg_arr[xd->mi[0]->segment_id] ==
                             NO_ESTIMATE_YRD_TRELLIS_OPT);
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 1a4bddf..28587b0 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -34,23 +34,158 @@
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+#include "av1/encoder/rdopt_utils.h"
+
+// Mean of the w x h residual block at diff, rounded to nearest. w * h is the
+// divisor and must be non-zero. diff is only read, hence the const qualifier.
+static AOM_INLINE int16_t avg_wxh_block_c(const int16_t *diff,
+                                          ptrdiff_t diff_stride, int w, int h) {
+  int32_t sum = 0;
+  for (int row = 0; row < h; ++row) {
+    const int16_t *const line = diff + row * diff_stride;
+    for (int col = 0; col < w; ++col) sum += line[col];
+  }
+  return (int16_t)(DIVIDE_AND_ROUND_SIGNED(sum, w * h));
+}
+
+// Writes the rounded mean of each of the h rows (w samples wide) of the block
+// at diff into out[0..h-1]. w is the divisor and must be non-zero.
+static AOM_INLINE void avg_wxh_block_horiz_c(const int16_t *diff,
+                                             ptrdiff_t diff_stride, int w,
+                                             int h, int16_t *out) {
+  for (int row = 0; row < h; ++row) {
+    const int16_t *const line = diff + row * diff_stride;  // read-only input
+    int32_t sum = 0;
+    for (int col = 0; col < w; ++col) sum += line[col];
+    out[row] = (int16_t)DIVIDE_AND_ROUND_SIGNED(sum, w);
+  }
+}
+
+// Writes the rounded mean of each of the w columns (h samples tall) of the
+// block at diff into out[0..w-1]. h is the divisor and must be non-zero.
+// diff is only read, hence the const qualifier.
+static AOM_INLINE void avg_wxh_block_vert_c(const int16_t *diff,
+                                            ptrdiff_t diff_stride, int w, int h,
+                                            int16_t *out) {
+  for (int col = 0; col < w; ++col) {
+    int32_t sum = 0;
+    for (int row = 0; row < h; ++row) sum += diff[row * diff_stride + col];
+    out[col] = (int16_t)DIVIDE_AND_ROUND_SIGNED(sum, h);
+  }
+}
+
+// Overwrites the residues of a tx_cols x tx_rows block that lie outside the
+// visible_tx_cols x visible_tx_rows in-frame region; fill depends on tx_type.
+static AOM_INLINE void fill_residue_outside_frame(
+    int16_t *diff, ptrdiff_t diff_stride, int tx_cols, int tx_rows,
+    int visible_tx_cols, int visible_tx_rows, TX_TYPE tx_type) {
+  const int complete_block_outside =
+      (visible_tx_cols == 0 || visible_tx_rows == 0);
+
+  if (tx_type <= IDTX) {  // mean fill (0 for IDTX / no visible area)
+    int16_t avg = 0;
+    if (tx_type != IDTX && !complete_block_outside)
+      avg =
+          avg_wxh_block_c(diff, diff_stride, visible_tx_cols, visible_tx_rows);
+
+    // Fill right of the visible columns (every row), then below visible rows.
+    const int right_pixels = tx_cols - visible_tx_cols;
+    for (int i = 0; i < tx_rows; ++i) {
+      aom_memset_int16(diff + i * diff_stride + visible_tx_cols, avg,
+                       right_pixels);
+    }
+
+    for (int i = visible_tx_rows; i < tx_rows; ++i) {
+      aom_memset_int16(diff + i * diff_stride, avg, visible_tx_cols);
+    }
+  } else if (htx_tab[tx_type] == IDTX_1D) {  // column means below, 0 right
+    if (visible_tx_rows < tx_rows) {
+      int16_t out[64] = { 0 };  // per-column fill values
+      if (!complete_block_outside)
+        avg_wxh_block_vert_c(diff, diff_stride, visible_tx_cols,
+                             visible_tx_rows, out);
+
+      for (int j = 0; j < visible_tx_cols; j++) {
+        for (int i = visible_tx_rows; i < tx_rows; ++i) {
+          *(diff + i * diff_stride + j) = out[j];
+        }
+      }
+    }
+
+    const int right_pixels = tx_cols - visible_tx_cols;
+    if (right_pixels) {
+      for (int i = 0; i < tx_rows; ++i) {
+        memset(diff + i * diff_stride + visible_tx_cols, 0,
+               right_pixels * sizeof(*diff));
+      }
+    }
+  } else {  // row means to the right, 0 below
+    assert(vtx_tab[tx_type] == IDTX_1D);
+
+    const int right_pixels = tx_cols - visible_tx_cols;
+    if (right_pixels) {
+      int16_t out[64] = { 0 };  // per-row fill values
+      if (!complete_block_outside)
+        avg_wxh_block_horiz_c(diff, diff_stride, visible_tx_cols,
+                              visible_tx_rows, out);
+
+      for (int i = 0; i < visible_tx_rows; ++i) {
+        aom_memset_int16(diff + i * diff_stride + visible_tx_cols, out[i],
+                         right_pixels);
+      }
+    }
+
+    for (int i = visible_tx_rows; i < tx_rows; ++i) {
+      memset(diff + i * diff_stride, 0, tx_cols * sizeof(*diff));
+    }
+  }
+}
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
 
 void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols,
                         int16_t *diff, ptrdiff_t diff_stride,
                         const uint8_t *src8, ptrdiff_t src_stride,
-                        const uint8_t *pred8, ptrdiff_t pred_stride) {
+                        const uint8_t *pred8, ptrdiff_t pred_stride
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                        ,
+                        int plane, int blk_col, int blk_row, int frame_width,
+                        int frame_height, TX_TYPE tx_type
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+) {
   assert(rows >= 4 && cols >= 4);
   if (is_cur_buf_hbd(xd)) {
     aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
                               pred8, pred_stride, xd->bd);
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
     return;
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
   }
-  aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
-                     pred_stride);
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  else {
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+    aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+                       pred_stride);
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  }
+
+  int visible_cols, visible_rows;  // in-frame extent of this block
+  const int is_border_block = get_visible_dimensions(
+      xd, plane, blk_col, blk_row, cols, rows, frame_width, frame_height,
+      &visible_cols, &visible_rows);
+  if (is_border_block)  // replace out-of-frame residues with derived values
+    fill_residue_outside_frame(diff, diff_stride, cols, rows, visible_cols,
+                               visible_rows, tx_type);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
 }
 
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
-                      int blk_col, int blk_row, TX_SIZE tx_size) {
+                      int blk_col, int blk_row, TX_SIZE tx_size
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                      ,
+                      int frame_width, int frame_height, TX_TYPE tx_type
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -64,10 +199,20 @@
   int16_t *src_diff =
       &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
   av1_subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src,
-                     src_stride, dst, dst_stride);
+                     src_stride, dst, dst_stride
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                     ,
+                     plane, blk_col, blk_row, frame_width, frame_height, tx_type
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  );
 }
 
-void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) {
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                        ,
+                        int frame_width, int frame_height
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
   assert(plane_bsize < BLOCK_SIZES_ALL);
@@ -76,7 +221,12 @@
   const MACROBLOCKD *xd = &x->e_mbd;
 
   av1_subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
-                     pd->dst.buf, pd->dst.stride);
+                     pd->dst.buf, pd->dst.stride
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                     ,
+                     plane, 0, 0, frame_width, frame_height, DCT_DCT
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  );
 }
 
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
@@ -392,6 +542,10 @@
       !mbmi->skip_mode) {
     tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size,
                               cm->features.reduced_tx_set_used);
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size,
+                     cm->width, cm->height, tx_type);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
     TxfmParam txfm_param;
     QUANT_PARAM quant_param;
     const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run);
@@ -620,7 +774,12 @@
 
 void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) {
   encode_block_pass1_args args = { cpi, x };
-  av1_subtract_plane(x, bsize, 0);
+  av1_subtract_plane(x, bsize, 0
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                     ,
+                     cpi->common.width, cpi->common.height  // frame size
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  );
   av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
                                          encode_block_pass1, &args);
 }
@@ -675,7 +834,9 @@
     const int step =
         tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
     av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]);
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
     av1_subtract_plane(x, plane_bsize, plane);
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
     arg.ta = ctx.ta[plane];
     arg.tl = ctx.tl[plane];
     const BLOCK_SIZE max_unit_bsize =
@@ -756,12 +917,19 @@
     }
 #endif
   } else {
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
     av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
-
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
     const ENTROPY_CONTEXT *a = &args->ta[blk_col];
     const ENTROPY_CONTEXT *l = &args->tl[blk_row];
     tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
                               cm->features.reduced_tx_set_used);
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+    TX_TYPE primary_tx_type = is_stat_generation_stage(cpi) ? DCT_DCT : tx_type;
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size,
+                     cm->width, cm->height, primary_tx_type);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+
     TxfmParam txfm_param;
     QUANT_PARAM quant_param;
     const int use_trellis =
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index fcd34a3..c8df446 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -127,12 +127,28 @@
 void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols,
                         int16_t *diff, ptrdiff_t diff_stride,
                         const uint8_t *src8, ptrdiff_t src_stride,
-                        const uint8_t *pred8, ptrdiff_t pred_stride);
+                        const uint8_t *pred8, ptrdiff_t pred_stride
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                        ,
+                        int plane, int blk_col, int blk_row, int frame_width,
+                        int frame_height, TX_TYPE tx_type
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+);
 
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
-                      int blk_col, int blk_row, TX_SIZE tx_size);
+                      int blk_col, int blk_row, TX_SIZE tx_size
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                      ,
+                      int frame_width, int frame_height, TX_TYPE tx_type
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+);
 
-void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane);
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                        ,
+                        int frame_width, int frame_height
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+);
 
 static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block,
                                        TX_SIZE tx_size, ENTROPY_CONTEXT *a,
diff --git a/av1/encoder/model_rd.h b/av1/encoder/model_rd.h
index d4d6e4d..67dd5d6 100644
--- a/av1/encoder/model_rd.h
+++ b/av1/encoder/model_rd.h
@@ -242,10 +242,22 @@
     int rate;
     int bw, bh;
     const struct macroblock_plane *const p = &x->plane[plane];
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+    const AV1_COMMON *const cm = &cpi->common;
+    const int block_width = block_size_wide[plane_bsize];
+    const int block_height = block_size_high[plane_bsize];
+    get_visible_dimensions(xd, plane, 0, 0, block_width, block_height,
+                           cm->width, cm->height, &bw, &bh);
+    sse = pixel_dist_visible_only(cpi, x, p->src.buf, p->src.stride,
+                                  pd->dst.buf, pd->dst.stride, plane_bsize,
+                                  block_height, block_width, bh, bw);
+#else
     get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
                        &bw, &bh);
 
     sse = calculate_sse(xd, p, pd, bw, bh);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+
     model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
                           &dist);
 
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 693bdba..00ec8d0 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1697,7 +1697,12 @@
     const int bw = block_size_wide[plane_bsize];
     const int bh = block_size_high[plane_bsize];
 
-    av1_subtract_plane(x, plane_bsize, plane);
+    av1_subtract_plane(x, plane_bsize, plane
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                       ,
+                       cm->width, cm->height
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+    );
     int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh) << 4;
     total_sse += sse;
   }
@@ -3533,7 +3538,12 @@
         if (mbmi->motion_mode == OBMC_CAUSAL)
           av1_build_obmc_inter_predictors_sb(cm, xd);
 
-        av1_subtract_plane(x, bsize, 0);
+        av1_subtract_plane(x, bsize, 0
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                           ,
+                           cm->width, cm->height
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+        );
         if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
             !xd->lossless[mbmi->segment_id]) {
           av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index b61f7f5..a6b3815 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -322,6 +322,74 @@
   if (width) *width = txb_width;
 }
 
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+// Computes how much of a cols x rows block lies inside the frame.
+//
+// The block origin is derived from xd->mi_row / xd->mi_col (stepped back by
+// one mi unit when plane 0 is 4 wide / 4 high and this plane is subsampled)
+// plus blk_col / blk_row scaled by MI_SIZE_LOG2. frame_width / frame_height
+// are shifted right by the plane's subsampling before the comparison.
+// NOTE(review): that shift truncates for odd frame sizes - confirm intended.
+//
+// Each of visible_cols / visible_rows may be NULL; when non-NULL it receives
+// the in-frame extent, clamped to [0, cols] / [0, rows]. Returns nonzero when
+// any part of the block falls outside the frame.
+static AOM_INLINE int get_visible_dimensions(const MACROBLOCKD *xd, int plane,
+                                             int blk_col, int blk_row, int cols,
+                                             int rows, int frame_width,
+                                             int frame_height,
+                                             int *visible_cols,
+                                             int *visible_rows) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  const int luma_bw = xd->plane[0].width;
+  const int luma_bh = xd->plane[0].height;
+
+  const int col_start = (luma_bw == 4) && ss_x ? xd->mi_col - 1 : xd->mi_col;
+  const int row_start = (luma_bh == 4) && ss_y ? xd->mi_row - 1 : xd->mi_row;
+  const int x = (col_start << MI_SIZE_LOG2) >> ss_x;
+  const int y = (row_start << MI_SIZE_LOG2) >> ss_y;
+
+  const int mi_x = x + (blk_col << MI_SIZE_LOG2);
+  const int mi_y = y + (blk_row << MI_SIZE_LOG2);
+  const int plane_frame_width = frame_width >> ss_x;
+  const int plane_frame_height = frame_height >> ss_y;
+
+  // clamp() already yields the full extent when the block fits in the frame.
+  const int valid_cols = clamp(plane_frame_width - mi_x, 0, cols);
+  const int valid_rows = clamp(plane_frame_height - mi_y, 0, rows);
+
+  if (visible_cols != NULL) *visible_cols = valid_cols;
+  if (visible_rows != NULL) *visible_rows = valid_rows;
+  return (valid_cols < cols || valid_rows < rows);
+}
+
+// SSE between src and dst restricted to the visible part of a transform
+// block; an uncropped block uses the sse output of cpi->fn_ptr[tx_bsize].vf.
+static AOM_INLINE unsigned pixel_dist_visible_only(
+    const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
+    const int src_stride, const uint8_t *dst, const int dst_stride,
+    const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
+    int visible_cols) {
+  const int fully_visible =
+      (txb_rows == visible_rows) && (txb_cols == visible_cols);
+  if (fully_visible) {
+    unsigned full_sse;
+    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &full_sse);
+    return full_sse;
+  }
+  const MACROBLOCKD *const mbd = &x->e_mbd;
+  if (!is_cur_buf_hbd(mbd))
+    return aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
+                            visible_rows);
+  // High bitdepth: reduce by 2 * (bd - 8) bits via ROUND_POWER_OF_TWO.
+  const uint64_t hbd_sse = aom_highbd_sse_odd_size(
+      src, src_stride, dst, dst_stride, visible_cols, visible_rows);
+  return (unsigned int)ROUND_POWER_OF_TWO(hbd_sse, (mbd->bd - 8) * 2);
+}
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+
 static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
   int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2);
   return num_blk;
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index fd9c242..63ba81b 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -79,6 +79,22 @@
   av1_fwd_txfm(src_diff, coeff, bw, &txfm_param);
 }
 
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+// TPL residual via the subtract kernels only; no out-of-frame residue fill.
+static AOM_INLINE void tpl_subtract_block(
+    const MACROBLOCKD *xd, int rows, int cols, int16_t *diff,
+    ptrdiff_t diff_stride, const uint8_t *src8, ptrdiff_t src_stride,
+    const uint8_t *pred8, ptrdiff_t pred_stride) {
+  assert(rows >= 4 && cols >= 4);
+  if (!is_cur_buf_hbd(xd))
+    aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+                       pred_stride);
+  else
+    aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
+                              pred8, pred_stride, xd->bd);
+}
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+
 static AOM_INLINE int64_t tpl_get_satd_cost(const MACROBLOCK *x,
                                             int16_t *src_diff, int diff_stride,
                                             const uint8_t *src, int src_stride,
@@ -88,8 +104,13 @@
   const MACROBLOCKD *xd = &x->e_mbd;
   const int pix_num = bw * bh;
 
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  tpl_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
+                     dst_stride);
+#else
   av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
                      dst_stride);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
   tpl_fwd_txfm(src_diff, bw, coeff, tx_size, xd->bd, is_cur_buf_hbd(xd));
   return aom_satd(coeff, pix_num);
 }
@@ -116,8 +137,14 @@
     int *rate_cost, int64_t *recon_error, int64_t *sse) {
   const MACROBLOCKD *xd = &x->e_mbd;
   uint16_t eob;
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  tpl_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
+                     dst_stride);
+#else
   av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
                      dst_stride);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+
   tpl_fwd_txfm(src_diff, diff_stride, coeff, tx_size, xd->bd,
                is_cur_buf_hbd(xd));
 
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 97ed87a..4de0ce6 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -376,15 +376,25 @@
 
 // Compute the pixel domain distortion from diff on all visible 4x4s in the
 // transform block.
-static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
-                                      int blk_row, int blk_col,
-                                      const BLOCK_SIZE plane_bsize,
-                                      const BLOCK_SIZE tx_bsize,
-                                      unsigned int *block_mse_q8) {
+static INLINE int64_t pixel_diff_dist(
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+    const AV1_COMMON *cm,
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+    const MACROBLOCK *x, int plane, int blk_row, int blk_col,
+    const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize,
+    unsigned int *block_mse_q8) {
   int visible_rows, visible_cols;
   const MACROBLOCKD *xd = &x->e_mbd;
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  const int txb_cols = block_size_wide[tx_bsize];
+  const int txb_rows = block_size_high[tx_bsize];
+
+  get_visible_dimensions(xd, plane, blk_col, blk_row, txb_cols, txb_rows,
+                         cm->width, cm->height, &visible_cols, &visible_rows);
+#else
   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
                      NULL, &visible_cols, &visible_rows);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
   const int diff_stride = block_size_wide[plane_bsize];
   const int16_t *diff = x->plane[plane].src_diff;
 
@@ -396,7 +406,11 @@
       *block_mse_q8 =
           (unsigned int)((256 * sse) / (visible_cols * visible_rows));
     else
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+      *block_mse_q8 = 0;
+#else
       *block_mse_q8 = UINT_MAX;
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
   }
   return sse;
 }
@@ -451,7 +465,11 @@
   const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
 #endif  // CONFIG_EXTQUANT
 
-  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
+  *dist = pixel_diff_dist(
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+      cm,
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+      x, 0, 0, 0, bsize, bsize, NULL);
 
   const int64_t mse = *dist / bw / bh;
   // Normalized quantizer takes the transform upscaling factor (8 for tx size
@@ -1181,6 +1199,11 @@
                       cpi->oxcf.q_cfg.quant_b_adapt, &quant_param_intra);
       av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type,
                         &quant_param_intra);
+
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+      av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size,
+                       cm->width, cm->height, best_tx_type);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
       av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
                       &txfm_param_intra, &quant_param_intra);
       if (quant_param_intra.use_optimize_b) {
@@ -1203,6 +1226,7 @@
   }
 }
 
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
 static unsigned pixel_dist_visible_only(
     const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
     const int src_stride, const uint8_t *dst, const int dst_stride,
@@ -1226,6 +1250,7 @@
                          visible_rows);
   return sse;
 }
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
 
 // Compute the pixel domain distortion from src and dst on all visible 4x4s in
 // the
@@ -1234,15 +1259,26 @@
                            int plane, const uint8_t *src, const int src_stride,
                            const uint8_t *dst, const int dst_stride,
                            int blk_row, int blk_col,
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
                            const BLOCK_SIZE plane_bsize,
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
                            const BLOCK_SIZE tx_bsize) {
   int txb_rows, txb_cols, visible_rows, visible_cols;
   const MACROBLOCKD *xd = &x->e_mbd;
 
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  const AV1_COMMON *const cm = &cpi->common;
+  txb_cols = block_size_wide[tx_bsize];
+  txb_rows = block_size_high[tx_bsize];
+
+  get_visible_dimensions(xd, plane, blk_col, blk_row, txb_cols, txb_rows,
+                         cm->width, cm->height, &visible_cols, &visible_rows);
+#else
   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
                      &txb_cols, &txb_rows, &visible_cols, &visible_rows);
   assert(visible_rows > 0);
   assert(visible_cols > 0);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
 
   unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst,
                                          dst_stride, tx_bsize, txb_rows,
@@ -1252,7 +1288,10 @@
 }
 
 static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
-                                           int plane, BLOCK_SIZE plane_bsize,
+                                           int plane,
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                                           BLOCK_SIZE plane_bsize,
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
                                            int block, int blk_row, int blk_col,
                                            TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1293,7 +1332,11 @@
                               cpi->common.features.reduced_tx_set_used);
 
   return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
-                         blk_row, blk_col, plane_bsize, tx_bsize);
+                         blk_row, blk_col,
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                         plane_bsize,
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                         tx_bsize);
 }
 
 static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row,
@@ -2294,6 +2337,12 @@
   const int use_intra_txb_hash =
       cpi->sf.tx_sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
       !is_inter && plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size];
+
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size, cm->width,
+                   cm->height, best_tx_type);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+
   if (use_intra_txb_hash) {
     const int mi_row = xd->mi_row;
     const int mi_col = xd->mi_col;
@@ -2353,8 +2402,12 @@
                           &per_px_mean, &dc_only_blk);
     if (best_rd_stats->skip_txfm == 1) return;
   } else {
-    block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
-                                txsize_to_bsize[tx_size], &block_mse_q8);
+    block_sse = pixel_diff_dist(
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+        cm,
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+        x, plane, blk_row, blk_col, plane_bsize, txsize_to_bsize[tx_size],
+        &block_mse_q8);
     assert(block_mse_q8 != UINT_MAX);
   }
 
@@ -2420,6 +2473,11 @@
                                : AV1_XFORM_QUANT_FP,
                   cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
 
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  const int is_border_block = get_visible_dimensions(
+      xd, plane, blk_col, blk_row, txw, txh, cm->width, cm->height, NULL, NULL);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+
   // Iterate through all transform type candidates.
   for (int idx = 0; idx < TX_TYPES; ++idx) {
     const TX_TYPE tx_type = (TX_TYPE)txk_map[idx];
@@ -2433,6 +2491,12 @@
     RD_STATS this_rd_stats;
     av1_invalid_rd_stats(&this_rd_stats);
 
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+    if (is_border_block)
+      av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size,
+                       cm->width, cm->height, tx_type);
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+
     if (!dc_only_blk)
       av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
     else
@@ -2464,8 +2528,12 @@
       this_rd_stats.dist = this_rd_stats.sse = block_sse;
     } else if (dc_only_blk) {
       this_rd_stats.sse = block_sse;
-      this_rd_stats.dist = dist_block_px_domain(
-          cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+      this_rd_stats.dist =
+          dist_block_px_domain(cpi, x, plane,
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                               plane_bsize,
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                               block, blk_row, blk_col, tx_size);
     } else if (use_transform_domain_distortion) {
       dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
                            &this_rd_stats.sse);
@@ -2491,8 +2559,12 @@
       if (tx_size != TX_64X64 || !is_high_energy ||
           (sse_diff * 2) < this_rd_stats.sse) {
         const int64_t tx_domain_dist = this_rd_stats.dist;
-        this_rd_stats.dist = dist_block_px_domain(
-            cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+        this_rd_stats.dist =
+            dist_block_px_domain(cpi, x, plane,
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                                 plane_bsize,
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                                 block, blk_row, blk_col, tx_size);
         // For high energy blocks, occasionally, the pixel domain distortion
         // can be artificially low due to clamping at reconstruction stage
         // even when inverse transform output is hugely different from the
@@ -2593,8 +2665,12 @@
   p->dqcoeff = best_dqcoeff;
 
   if (calc_pixel_domain_distortion_final && best_eob) {
-    best_rd_stats->dist = dist_block_px_domain(
-        cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+    best_rd_stats->dist =
+        dist_block_px_domain(cpi, x, plane,
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                             plane_bsize,
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                             block, blk_row, blk_col, tx_size);
     best_rd_stats->sse = block_sse;
   }
 
@@ -3041,7 +3117,9 @@
 
   if (!is_inter) {
     av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
     av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
   }
 
   TXB_CTX txb_ctx;
@@ -3629,10 +3707,12 @@
   const BLOCK_SIZE plane_bsize =
       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
 
+#if !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
   if (is_inter) {
     for (int plane = 1; plane < MAX_MB_PLANE; ++plane)
       av1_subtract_plane(x, plane_bsize, plane);
   }
+#endif  // !CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
 
   const int skip_trellis = 0;
   const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
@@ -3744,7 +3824,12 @@
   rd_stats->rate = mode_rate;
 
   // cost and distortion
-  av1_subtract_plane(x, bsize, 0);
+  av1_subtract_plane(x, bsize, 0
+#if CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+                     ,
+                     cm->width, cm->height
+#endif  // CONFIG_E191_PART2_OFS_PRED_RES_HANDLE
+  );
   if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
       !xd->lossless[mbmi->segment_id]) {
     av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 9466391..f0c6f51 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -151,6 +151,9 @@
 # This is an encode-only change.
 set_aom_config_var(CONFIG_SCC_DETERMINATION 1
                    "Enable the screen content tools determination improvement.")
+# This is an encode-only change.
+set_aom_config_var(CONFIG_E191_PART2_OFS_PRED_RES_HANDLE 1
+                   "Enable outside frame boundary block handling")
 #
 # Variables in this section control optional features of the build system.
 #