rtc-screen: Populate source_sad_nonrd using src_sad_blk_64x64

In the parent version, x->content_state_sb.source_sad_nonrd is
populated based on cpi->rc.frame_source_sad or on the residual
source variance computed at superblock level assuming zero
MV. In this CL, cpi->src_sad_blk_64x64 is used to identify
the superblocks with zero source SAD, so the residual
variance computation is avoided for these cases. If the
superblock SAD is non-zero, x->content_state_sb.source_sad_nonrd
is populated in the same way as in the parent version.

This CL gives a speed-up for screen content, though it is
applicable to non-screen content as well. The CL is bit-exact,
since zero source SAD implies zero residual source variance
and zero residual source SSE.

For rtc-screen,
     Instruction Count        BD-Rate Loss(%)
cpu     Reduction(%)    avg.psnr   ovr.psnr    ssim
9         0.303         0.0000     0.0000    0.0000
10        0.325         0.0000     0.0000    0.0000

Change-Id: I5740d245f94ae8f1adbead43d91124357602f2f3
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 548709a..a3862b4 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -753,6 +753,44 @@
   }
 }
 
+/*!\brief Calculate source SAD at superblock level using 64x64 block source SAD
+ * \return Superblock source SAD, or UINT64_MAX if it cannot be derived.
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+static AOM_INLINE uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row,
+                                             int mi_col) {
+  if (cpi->src_sad_blk_64x64 == NULL) return UINT64_MAX;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int blk_64x64_in_mis = (cm->seq_params->sb_size == BLOCK_128X128)
+                                   ? (cm->seq_params->mib_size >> 1)
+                                   : cm->seq_params->mib_size;
+  const int num_blk_64x64_cols =
+      (cm->mi_params.mi_cols + blk_64x64_in_mis - 1) / blk_64x64_in_mis;
+  const int num_blk_64x64_rows =
+      (cm->mi_params.mi_rows + blk_64x64_in_mis - 1) / blk_64x64_in_mis;
+  const int blk_64x64_col_index = mi_col / blk_64x64_in_mis;
+  const int blk_64x64_row_index = mi_row / blk_64x64_in_mis;
+  uint64_t curr_sb_sad = UINT64_MAX;  // Sentinel: SAD not available.
+  const uint64_t *const src_sad_blk_64x64_data =
+      &cpi->src_sad_blk_64x64[blk_64x64_col_index +
+                              blk_64x64_row_index * num_blk_64x64_cols];
+  if (cm->seq_params->sb_size == BLOCK_128X128 &&
+      blk_64x64_col_index + 1 < num_blk_64x64_cols &&
+      blk_64x64_row_index + 1 < num_blk_64x64_rows) {
+    // All four 64x64 blocks of the 128x128 SB are within the 64x64 block grid:
+    // sum the 2x2 group of SADs (the next row is num_blk_64x64_cols ahead).
+    curr_sb_sad = src_sad_blk_64x64_data[0] + src_sad_blk_64x64_data[1] +
+                  src_sad_blk_64x64_data[num_blk_64x64_cols] +
+                  src_sad_blk_64x64_data[num_blk_64x64_cols + 1];
+  } else if (cm->seq_params->sb_size == BLOCK_64X64) {
+    curr_sb_sad = src_sad_blk_64x64_data[0];  // One 64x64 block per SB.
+  }
+  return curr_sb_sad;
+}
+
 /*!\brief Determine whether grading content can be skipped based on sad stat
  *
  * \ingroup partition_search
@@ -762,29 +800,30 @@
 static AOM_INLINE bool is_calc_src_content_needed(AV1_COMP *cpi,
                                                   MACROBLOCK *const x,
                                                   int mi_row, int mi_col) {
+  const uint64_t curr_sb_sad = get_sb_source_sad(cpi, mi_row, mi_col);
+  if (curr_sb_sad == UINT64_MAX) return true;
+  if (curr_sb_sad == 0) {
+    x->content_state_sb.source_sad_nonrd = kZeroSad;
+    return false;
+  }
   AV1_COMMON *const cm = &cpi->common;
   bool do_calc_src_content = true;
 
   if (cpi->oxcf.speed < 9) return do_calc_src_content;
 
-  // TODO(yunqing): Need to consider 4 64x64 results if later this is used for
-  // 128x128 sb size.
-  if (cpi->src_sad_blk_64x64 != NULL && AOMMIN(cm->width, cm->height) < 360) {
-    const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
-                                  ? (cm->seq_params->mib_size >> 1)
-                                  : cm->seq_params->mib_size;
-    const int sb_cols =
-        (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
-    const int sbi_col = mi_col / sb_size_by_mb;
-    const int sbi_row = mi_row / sb_size_by_mb;
+  // TODO(yunqing): Tune/validate the thresholds for 128x128 SB size.
+  if (AOMMIN(cm->width, cm->height) < 360) {
+    // Derive Average 64x64 block source SAD from SB source SAD
+    const uint64_t avg_64x64_blk_sad =
+        (cm->seq_params->sb_size == BLOCK_128X128) ? ((curr_sb_sad + 2) >> 2)
+                                                   : curr_sb_sad;
+
     // The threshold is determined based on kLowSad and kHighSad threshold and
     // test results.
     const uint64_t thresh_low = 15000;
     const uint64_t thresh_high = 40000;
-    const uint64_t blk_sad =
-        cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
 
-    if (blk_sad > thresh_low && blk_sad < thresh_high) {
+    if (avg_64x64_blk_sad > thresh_low && avg_64x64_blk_sad < thresh_high) {
       do_calc_src_content = false;
       // Note: set x->content_state_sb.source_sad_rd as well if this is extended
       // to RTC rd path.