Allintra: Prune SPLIT in var based partitioning

The speed feature (sf) vbp_prune_16x16_split_using_min_max_sub_blk_var
is introduced in variance based partitioning (VBP) to choose between
PARTITION_SPLIT and PARTITION_NONE based on the minimum and maximum
sub-block variances. This sf is currently enabled only for bsize 16X16.

For AVIF still-image encode (a negative BD-rate loss is a coding gain),

             Encode Time      BD-Rate Loss(%)
cpu-used     Reduction(%)     psnr       ssim
    9           8.436        -0.7805    -0.4677

STATS_CHANGED

Change-Id: I8dc8726d25a15fd4579273851499ffaa4edf2d59
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index de276b1..a671ea7 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -536,6 +536,7 @@
     sf->rt_sf.nonrd_check_partition_merge_mode = 0;
     sf->rt_sf.hybrid_intra_pickmode = 0;
     sf->rt_sf.var_part_split_threshold_shift = 9;
+    sf->rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var = true;
   }
 }
 
@@ -1973,6 +1974,7 @@
   rt_sf->sad_based_comp_prune = 0;
   rt_sf->tx_size_level_based_on_qstep = 0;
   rt_sf->reduce_zeromv_mvres = false;
+  rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false;
 }
 
 void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index ba84f30..97a5cf6 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1508,6 +1508,16 @@
 
   // Reduce the mv resolution for zero mv if the variance is low.
   bool reduce_zeromv_mvres;
+
+  // Avoid the partitioning of a 16x16 block in variance based partitioning
+  // (VBP) by making use of minimum and maximum sub-block variances.
+  // For allintra encode, this speed feature reduces instruction count by 5.39%
+  // for speed 9 on a typical video dataset with coding performance gain
+  // of 1.44%.
+  // For AVIF image encode, this speed feature reduces encode time
+  // by 8.44% for speed 9 on a typical image dataset with coding performance
+  // gain of 0.78%.
+  bool vbp_prune_16x16_split_using_min_max_sub_blk_var;
 } REAL_TIME_SPEED_FEATURES;
 
 /*!\endcond */
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index c0eabad..b63ee03 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -39,6 +39,8 @@
   PART_EVAL_ALL = 0,
   // Force PARTITION_SPLIT
   PART_EVAL_ONLY_SPLIT = 1,
+  // Force PARTITION_NONE
+  PART_EVAL_ONLY_NONE = 2
 } UENUM1BYTE(PART_EVAL_STATUS);
 
 typedef struct {
@@ -174,6 +176,12 @@
   assert(block_height == block_width);
   tree_to_node(data, bsize, &vt);
 
+  if (mi_col + bs_width_check <= tile->mi_col_end &&
+      mi_row + bs_height_check <= tile->mi_row_end &&
+      force_split == PART_EVAL_ONLY_NONE) {
+    set_block_size(cpi, mi_row, mi_col, bsize);
+    return 1;
+  }
   if (force_split == PART_EVAL_ONLY_SPLIT) return 0;
 
   // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
@@ -1100,6 +1108,26 @@
   }
 }
 
+// Returns PART_EVAL_ONLY_SPLIT or PART_EVAL_ONLY_NONE for a 16x16 block in
+// variance based partitioning, from the spread of its 8x8 sub-block variances.
+static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var(
+    VP16x16 *var_16x16_info, int64_t threshold16) {
+  int max_8x8_var = 0, min_8x8_var = INT_MAX;
+  for (int k = 0; k < 4; k++) {
+    get_variance(&var_16x16_info->split[k].part_variances.none);
+    int this_8x8_var = var_16x16_info->split[k].part_variances.none.variance;
+    max_8x8_var = AOMMAX(this_8x8_var, max_8x8_var);
+    min_8x8_var = AOMMIN(this_8x8_var, min_8x8_var);
+  }
+  // A spread (max - min) of the four 8x8 variances greater than
+  // threshold16 << 2 selects PART_EVAL_ONLY_SPLIT for the 16x16 block;
+  // otherwise PART_EVAL_ONLY_NONE is selected. The shift factor of 2 for
+  // threshold16 has been derived empirically.
+  return ((max_8x8_var - min_8x8_var) > (threshold16 << 2))
+             ? PART_EVAL_ONLY_SPLIT
+             : PART_EVAL_ONLY_NONE;
+}
+
 int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
                                       ThreadData *td, MACROBLOCK *x, int mi_row,
                                       int mi_col) {
@@ -1278,7 +1306,10 @@
           // to split. This also forces a split on the upper levels.
           get_variance(&vtemp->part_variances.none);
           if (vtemp->part_variances.none.variance > thresholds[3]) {
-            force_split[split_index] = PART_EVAL_ONLY_SPLIT;
+            force_split[split_index] =
+                cpi->sf.rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var
+                    ? get_part_eval_based_on_sub_blk_var(vtemp, thresholds[3])
+                    : PART_EVAL_ONLY_SPLIT;
             force_split[5 + m2 + i] = PART_EVAL_ONLY_SPLIT;
             force_split[m + 1] = PART_EVAL_ONLY_SPLIT;
             force_split[0] = PART_EVAL_ONLY_SPLIT;