rtc: Allow for split to 8x8 blocks for speed >=9 low-resoln

For sf.prefer_large_partition_blocks >= 3 (speed >= 9) and
low-resolutions: split to 8x8 is disallowed. This can
cause visual quality artifacts near moving boundary, as
seen in, for example, desktopqvga_320_240.y4m.

This patch allows for split to 8x8 for some superblocks,
depending on sb source_sad content, and if so, 4x4 avg
is used (instead of the default 8x8 avg used for delta frames)
to get the 8x8 variances. The decision to split 16x16 block
is then based on the min/max over the 4 8x8 subblock variances.
This is done to get better signal on whether moving boundary
exists within 16x16 block.

This significantly reduces artifact in desktopqvga,
overall small bdrate change, and small speed loss.

Stats changed for speed 9/10 rtc_derf.
avg_psnr/ovr_psnr/ssim,  IC speedup
speed 9:  -0.40/-0.41/-0.39,  -1.0
speed 10: -0.25/-0.37/-0.38,  -1.1

Change-Id: Ib132aab11ab8083c2ddd666bcd433509fee6c950
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 0092294..53d3d2a 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -449,7 +449,8 @@
 static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
                                           int q, int content_lowsumdiff,
                                           int source_sad_nonrd,
-                                          int source_sad_rd, int segment_id) {
+                                          int source_sad_rd, int segment_id,
+                                          uint64_t blk_sad) {
   AV1_COMMON *const cm = &cpi->common;
   const int is_key_frame = frame_is_intra_only(cm);
   const int threshold_multiplier = is_key_frame ? 120 : 1;
@@ -585,7 +586,7 @@
       }
     }
     if (cm->width * cm->height <= 352 * 288) {
-      thresholds[3] = INT32_MAX;
+      thresholds[3] = INT64_MAX;
       if (segment_id == 0) {
         thresholds[1] <<= 2;
         thresholds[2] <<= (source_sad_nonrd <= kLowSad) ? 5 : 4;
@@ -593,6 +594,16 @@
         thresholds[1] <<= 1;
         thresholds[2] <<= 3;
       }
+      // Allow for split to 8x8 for superblocks where part of it has
+      // moving boudary. So allow for sb with source_sad above threshold,
+      // and avoid very large source_sad or high source content, to avoid
+      // too many 8x8 within superblock.
+      if (cpi->rc.avg_source_sad < 25000 && blk_sad > 20000 &&
+          blk_sad < 60000) {
+        thresholds[2] = thresholds[2] >> 1;
+        thresholds[3] = thresholds[2] << 3;
+      }
+
       // Condition the increase of partition thresholds on the segment
       // and the content. Avoid the increase for superblocks which have
       // high source sad, unless the whole frame has very high motion
@@ -916,7 +927,7 @@
     return;
   } else {
     set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_lowsumdiff, 0,
-                       0, 0);
+                       0, 0, 0);
     // The threshold below is not changed locally.
     cpi->vbp_info.threshold_minmax = 15 + (q >> 3);
   }
@@ -1000,13 +1011,13 @@
     AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, VP16x16 *vt2,
     PART_EVAL_STATUS *force_split, int avg_16x16[][4], int maxvar_16x16[][4],
     int minvar_16x16[][4], int *variance4x4downsample, int64_t *thresholds,
-    uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride) {
+    uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride,
+    int use_4x4avg) {
   AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   const int is_key_frame = frame_is_intra_only(cm);
   const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
   const int num_64x64_blocks = is_small_sb ? 1 : 4;
-  // TODO(kyslov) Bring back compute_minmax_variance with content type detection
   const int compute_minmax_variance = 0;
   const int segment_id = xd->mi[0]->segment_id;
   int pixels_wide = 128, pixels_high = 128;
@@ -1056,7 +1067,8 @@
             maxvar_16x16[m][i] =
                 vt->split[m].split[i].split[j].part_variances.none.variance;
           if (vt->split[m].split[i].split[j].part_variances.none.variance >
-              thresholds[3]) {
+                  thresholds[3] &&
+              !use_4x4avg) {
             // 16X16 variance is above threshold for split, so force split to
             // 8x8 for this 16x16 block (this also forces splits for upper
             // levels).
@@ -1088,7 +1100,10 @@
             }
           }
         }
-        if (is_key_frame) {
+        if (is_key_frame ||
+            (use_4x4avg &&
+             vt->split[m].split[i].split[j].part_variances.none.variance >
+                 (thresholds[2] << 1))) {
           force_split[split_index] = PART_EVAL_ALL;
           // Go down to 4x4 down-sampling for variance.
           variance4x4downsample[i2 + j] = 1;
@@ -1188,8 +1203,9 @@
 // Decides whether to split or merge a 16x16 partition block in variance based
 // partitioning based on the 8x8 sub-block variances.
 static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var(
-    VP16x16 *var_16x16_info, int64_t threshold16) {
+    VP16x16 *var_16x16_info, int64_t threshold16, int is_key_frame) {
   int max_8x8_var = 0, min_8x8_var = INT_MAX;
+  int fac_shift = 2;
   for (int k = 0; k < 4; k++) {
     get_variance(&var_16x16_info->split[k].part_variances.none);
     int this_8x8_var = var_16x16_info->split[k].part_variances.none.variance;
@@ -1199,8 +1215,11 @@
   // If the difference between maximum and minimum sub-block variances is high,
   // then only evaluate PARTITION_SPLIT for the 16x16 block. Otherwise, evaluate
   // only PARTITION_NONE. The shift factor for threshold16 has been derived
-  // empirically.
-  return ((max_8x8_var - min_8x8_var) > (threshold16 << 2))
+  // empirically. For delta frames: add condition that min_8x8_var is small,
+  // this is to target moving boundary withing 16x16 block, so some area should
+  // be small variance/stationary.
+  return ((max_8x8_var - min_8x8_var) > (threshold16 << fac_shift) &&
+          (!is_key_frame && min_8x8_var < AOMMAX(400, (threshold16 >> 3))))
              ? PART_EVAL_ONLY_SPLIT
              : PART_EVAL_ONLY_NONE;
 }
@@ -1241,7 +1260,7 @@
   int avg_16x16[4][4];
   int maxvar_16x16[4][4];
   int minvar_16x16[4][4];
-  int64_t threshold_4x4avg;
+  int use_4x4avg;
   uint8_t *s;
   const uint8_t *d;
   int sp;
@@ -1276,27 +1295,36 @@
                             vbp_thresholds[2], vbp_thresholds[3],
                             vbp_thresholds[4] };
 
-  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  const int low_res = cm->width * cm->height <= 352 * 288;
   int variance4x4downsample[64];
   const int segment_id = xd->mi[0]->segment_id;
 
+  uint64_t blk_sad = 0;
+  if (cpi->src_sad_blk_64x64 != NULL) {
+    const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+                                  ? (cm->seq_params->mib_size >> 1)
+                                  : cm->seq_params->mib_size;
+    const int sb_cols =
+        (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+    const int sbi_col = mi_col / sb_size_by_mb;
+    const int sbi_row = mi_row / sb_size_by_mb;
+    blk_sad = cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+  }
+
   if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
       cyclic_refresh_segment_id_boosted(segment_id)) {
     const int q =
         av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
     set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff,
                        x->content_state_sb.source_sad_nonrd,
-                       x->content_state_sb.source_sad_rd, 1);
+                       x->content_state_sb.source_sad_rd, 1, blk_sad);
   } else {
     set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex,
                        x->content_state_sb.low_sumdiff,
                        x->content_state_sb.source_sad_nonrd,
-                       x->content_state_sb.source_sad_rd, 0);
+                       x->content_state_sb.source_sad_rd, 0, blk_sad);
   }
 
-  // For non keyframes, disable 4x4 average for low resolution when speed = 8
-  threshold_4x4avg = INT64_MAX;
-
   s = x->plane[0].src.buf;
   sp = x->plane[0].src.stride;
 
@@ -1381,13 +1409,19 @@
   if (cpi->noise_estimate.enabled)
     noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
 
-  if (low_res && threshold_4x4avg < INT64_MAX)
-    CHECK_MEM_ERROR(cm, vt2, aom_malloc(sizeof(*vt2)));
+  if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 3 && low_res &&
+      thresholds[3] < INT64_MAX) {
+    use_4x4avg = 1;
+    CHECK_MEM_ERROR(cm, vt2, aom_calloc(16, sizeof(*vt2)));
+  } else {
+    use_4x4avg = 0;
+  }
+
   // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
   // for splits.
   fill_variance_tree_leaves(cpi, x, vt, vt2, force_split, avg_16x16,
                             maxvar_16x16, minvar_16x16, variance4x4downsample,
-                            thresholds, s, sp, d, dp);
+                            thresholds, s, sp, d, dp, use_4x4avg);
 
   avg_64x64 = 0;
   for (m = 0; m < num_64x64_blocks; ++m) {
@@ -1409,8 +1443,10 @@
           get_variance(&vtemp->part_variances.none);
           if (vtemp->part_variances.none.variance > thresholds[3]) {
             force_split[split_index] =
-                cpi->sf.rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var
-                    ? get_part_eval_based_on_sub_blk_var(vtemp, thresholds[3])
+                cpi->sf.rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var ||
+                        (!is_key_frame && use_4x4avg)
+                    ? get_part_eval_based_on_sub_blk_var(vtemp, thresholds[3],
+                                                         is_key_frame)
                     : PART_EVAL_ONLY_SPLIT;
             force_split[5 + m2 + i] = PART_EVAL_ONLY_SPLIT;
             force_split[m + 1] = PART_EVAL_ONLY_SPLIT;