Use better simple_motion_search_split on speed 3+

The current model can cause a large PSNR drop on low-bitrate,
high-resolution videos. Using a higher-quality model solves this issue
and also yields some speed gain on lowres and midres.

Performance on speed 3:
 TESTSET | AVG_PSNR | OVR_PSNR |    SSIM |    VMAF | AVG_SPD | OVR_SPD
  LOWRES |  +0.014% |  +0.016% | +0.115% | +0.044% | +5.547% | +7.604%
  MIDRES |  -0.213% |  -0.217% | -0.218% | -0.382% | +3.049% | +4.310%
   HDRES |  -0.685% |  -0.669% | -0.543% | -1.331% | -0.368% | +0.041%

BUG=aomedia:2365

STATS_CHANGED

Change-Id: I798abc39ae35483fb0a15ffdabec1cf55058179f
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 8de19cc..926e30a 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -2701,7 +2701,7 @@
   // Use simple_motion_search to prune partitions. This must be done prior to
   // PARTITION_SPLIT to propagate the initial mvs to a smaller blocksize.
   const int try_split_only =
-      cpi->sf.simple_motion_search_split_only && do_square_split &&
+      cpi->sf.simple_motion_search_split && do_square_split &&
       bsize >= BLOCK_8X8 && mi_row + mi_size_high[bsize] <= cm->mi_rows &&
       mi_col + mi_size_wide[bsize] <= cm->mi_cols && !frame_is_intra_only(cm) &&
       !av1_superres_scaled(cm);
@@ -4244,7 +4244,7 @@
     PC_TREE *const pc_root = td->pc_root[mib_size_log2 - MIN_MIB_SIZE_LOG2];
     pc_root->index = 0;
 
-    if ((sf->simple_motion_search_split_only ||
+    if ((sf->simple_motion_search_split ||
          sf->simple_motion_search_prune_rect ||
          sf->simple_motion_search_early_term_none ||
          sf->firstpass_simple_motion_search_early_term) &&
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index f21f0d7..3d2a947 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -25,116 +25,7 @@
     AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
     int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get);
 
-// Performs a simple_motion_search with a single reference frame and extract
-// the variance of residues. Here features is assumed to be a length 6 array.
-// After this function is called, we will store the following in to features:
-// features[0] = log(1 + dc_q**2/256)
-// features[1] = log(1 + variance_of_residue)
-// for i in [2, 3, 4, 5]:
-//  features[i] = log(1 + variance_of_residue_in_block[i]/variance_of_residue)
-static void get_res_var_features(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
-                                 int mi_col, BLOCK_SIZE bsize,
-                                 float *features) {
-  // TODO(chiyotsai@google.com): The data this model trained on did not also use
-  // SIMPLE_TRANSLATION to build the inter_predictor. Retraining and tuning the
-  // model with the correct data should give better performance.
-  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  // Perform a single motion search in Y_PLANE to make a prediction
-  const int use_subpixel = 0;
-
-  // Start getting the features
-  int f_idx = 0;
-
-  // Q_INDEX
-  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
-  aom_clear_system_state();
-  features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
-
-  // VARIANCE
-  unsigned int sse = 0;
-  unsigned int var = 0;
-  const MV ref_mv_full = { .row = 0, .col = 0 };
-  av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full,
-                            use_subpixel, &sse, &var);
-  aom_clear_system_state();
-  features[f_idx++] = logf(1.0f + (float)var);
-
-  // Regional
-  const uint8_t *src = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *dst = xd->plane[0].dst.buf;
-  const int dst_stride = xd->plane[0].dst.stride;
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
-  int r_idx = 0;
-  for (r_idx = 0; r_idx < 4; r_idx++) {
-    const int x_idx = (r_idx & 1) * bw / 2;
-    const int y_idx = (r_idx >> 1) * bh / 2;
-    const int src_offset = y_idx * src_stride + x_idx;
-    const int dst_offset = y_idx * dst_stride + x_idx;
-    const unsigned int sub_var = cpi->fn_ptr[subsize].vf(
-        src + src_offset, src_stride, dst + dst_offset, dst_stride, &sse);
-    aom_clear_system_state();
-    const float var_ratio = (1.0f + (float)sub_var) / (4.0f + (float)var);
-    features[f_idx++] = var_ratio;
-  }
-}
-
-static void simple_motion_search_based_split_fast(
-    AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col,
-    BLOCK_SIZE bsize, int *partition_none_allowed, int *partition_horz_allowed,
-    int *partition_vert_allowed, int *do_rectangular_split,
-    int *do_square_split) {
-  aom_clear_system_state();
-  const NN_CONFIG *nn_config = NULL;
-  float split_only_thresh = 1.0f;
-  if (bsize == BLOCK_128X128) {
-    nn_config = &av1_simple_motion_search_based_split_nn_config_128;
-    split_only_thresh = av1_simple_motion_search_based_split_thresh_128;
-  } else if (bsize == BLOCK_64X64) {
-    nn_config = &av1_simple_motion_search_based_split_nn_config_64;
-    split_only_thresh = av1_simple_motion_search_based_split_thresh_64;
-  } else if (bsize == BLOCK_32X32) {
-    nn_config = &av1_simple_motion_search_based_split_nn_config_32;
-    split_only_thresh = av1_simple_motion_search_based_split_thresh_32;
-  } else if (bsize == BLOCK_16X16) {
-    nn_config = &av1_simple_motion_search_based_split_nn_config_16;
-    split_only_thresh = av1_simple_motion_search_based_split_thresh_16;
-  } else if (bsize == BLOCK_8X8) {
-    return;
-  } else {
-    assert(0 && "Unexpected block size in simple_motion_based_split");
-    return;
-  }
-
-  float features[FEATURE_SIZE_SMS_SPLIT_FAST] = { 0.0f };
-  float score = 0.0f;
-  get_res_var_features(cpi, x, mi_row, mi_col, bsize, features);
-  av1_nn_predict(features, nn_config, &score);
-  aom_clear_system_state();
-
-  if (score > split_only_thresh) {
-    *partition_none_allowed = 0;
-    *partition_horz_allowed = 0;
-    *partition_vert_allowed = 0;
-    *do_rectangular_split = 0;
-  }
-  if (cpi->sf.simple_motion_search_split_only >= 2) {
-    if (score < -split_only_thresh) *do_square_split = 0;
-    // For larger scores (>split_only_thresh), none and rectangular partitions
-    // are skipped. As score reduces, possibility of split decreases. Hence
-    // for near larger scores (.875 * split_only_thresh to split_only_thresh)
-    // none partition is disabled, but rectangular partitions are evaluated
-    // additionally.
-    if (score > (split_only_thresh * 0.875)) *partition_none_allowed = 0;
-  }
-}
-
-static int convert_bsize_to_idx(BLOCK_SIZE bsize) {
+static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) {
   switch (bsize) {
     case BLOCK_128X128: return 0;
     case BLOCK_64X64: return 1;
@@ -150,15 +41,6 @@
     int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
     int *partition_horz_allowed, int *partition_vert_allowed,
     int *do_rectangular_split, int *do_square_split) {
-  if (cpi->sf.simple_motion_search_split_speed >= 2) {
-    simple_motion_search_based_split_fast(
-        cpi, x, mi_row, mi_col, bsize, partition_none_allowed,
-        partition_horz_allowed, partition_vert_allowed, do_rectangular_split,
-        do_square_split);
-
-    return;
-  }
-
   aom_clear_system_state();
 
   const AV1_COMMON *const cm = &cpi->common;
@@ -204,7 +86,7 @@
     *do_rectangular_split = 0;
   }
 
-  if (cpi->sf.simple_motion_search_split_only >= 2 && score < no_split_thresh) {
+  if (cpi->sf.simple_motion_search_split >= 2 && score < no_split_thresh) {
     *do_square_split = 0;
   }
 }
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 5794983..ef312f6 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -117,6 +117,8 @@
     sf->ml_early_term_after_part_split_level = 1;
   }
 
+  // TODO(chiyotsai@google.com): Try to replace two pass partition search with
+  // other speed features.
   if (is_720p_or_larger && speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL_START &&
       speed < CONFIG_2PASS_PARTITION_SEARCH_LVL_END) {
     sf->two_pass_partition_search = 1;
@@ -128,7 +130,7 @@
     } else if (is_480p_or_larger) {
       sf->use_square_partition_only_threshold = BLOCK_64X64;
 
-      sf->simple_motion_search_split_only = 2;
+      sf->simple_motion_search_split = 2;
     } else {
       sf->use_square_partition_only_threshold = BLOCK_32X32;
     }
@@ -141,9 +143,6 @@
       sf->ml_partition_search_breakout_thresh[4] = -1;   // BLOCK_128X128
 
       sf->firstpass_simple_motion_search_early_term = 1;
-      // TODO(chiyotsai@google.com): Try to disable two pass partition search
-      // and turn on hdres
-      sf->simple_motion_search_split_speed = 1;
       sf->ml_early_term_after_part_split_level = 2;
     }
   }
@@ -172,7 +171,6 @@
   }
 
   if (speed >= 3) {
-    sf->simple_motion_search_split_speed = 2;
     sf->ml_early_term_after_part_split_level = 0;
     if (is_720p_or_larger) {
       sf->partition_search_breakout_dist_thr = (1 << 25);
@@ -186,11 +184,12 @@
         sf->two_pass_partition_search;
 
     // TODO(Venkat): Clean-up frame type dependency for
-    // simple_motion_search_split_only in partition search function and set the
+    // simple_motion_search_split in partition search function and set the
     // speed feature accordingly
-    // TODO(Venkat): Evaluate this speed feature for speed 1 & 2
-    sf->simple_motion_search_split_only =
-        cm->allow_screen_content_tools ? 1 : 2;
+    // TODO(any): The models and thresholds used by simple_motion_search_split
+    // are trained and tuned on speed 1 and 2. We might get better performance
+    // if we readjust them for speed 3 and 4.
+    sf->simple_motion_search_split = cm->allow_screen_content_tools ? 1 : 2;
   }
 
   if (speed >= 4) {
@@ -269,7 +268,7 @@
     // speed.
     sf->prune_single_motion_modes_by_simple_trans = 1;
 
-    sf->simple_motion_search_split_only = 1;
+    sf->simple_motion_search_split = 1;
     sf->simple_motion_search_early_term_none = 1;
 
     sf->disable_wedge_search_var_thresh = 0;
@@ -742,7 +741,6 @@
   sf->skip_obmc_in_uniform_mv_field = 0;
   sf->skip_wm_in_uniform_mv_field = 0;
   sf->adaptive_interp_filter_search = 0;
-  sf->simple_motion_search_split_speed = 2;
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
@@ -768,7 +766,7 @@
   for (i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
     sf->ml_partition_search_breakout_thresh[i] = -1;  // -1 means not enabled.
   }
-  sf->simple_motion_search_split_only = 0;
+  sf->simple_motion_search_split = 0;
   sf->simple_motion_search_prune_rect = 0;
   sf->simple_motion_search_early_term_none = 0;
 
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 358f4c9..ef86923 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -648,14 +648,11 @@
   int simple_motion_search_prune_rect;
 
   // Perform simple motion search before none_partition to decide if we
-  // want to split directly without trying other partition types.
-  int simple_motion_search_split_only;
-
-  // Determines the type of model used by simple_motion_search_split_only. Only
-  // valids when simple_motion_search_split_only is >= 1. Set to 1 for the
-  // slower model that uses 5 subpixel searches, and 2 for the faster model that
-  // uses 1 fullpixel search.
-  int simple_motion_search_split_speed;
+  // want to remove all partitions other than PARTITION_SPLIT. If set to 0, this
+  // model is disabled. If set to 1, the model attempts to perform
+  // PARTITION_SPLIT only. If set to 2, the model also attempts to prune
+  // PARTITION_SPLIT.
+  int simple_motion_search_split;
 
   // Use features from simple_motion_search to terminate prediction block
   // partition after PARTITION_NONE