Speed up GOP length decision for speed=4

For speed=4, GOP length is decided based on tpl
stats of ARFs from base layer, base+1 layer and
base+2 layer.

cpu-used  Instruction Count     BD-Rate Loss(%)
           Reduction(%)     avg.psnr  ovr.psnr  ssim
   4         1.898          -0.0482   -0.0503  -0.0876

STATS_CHANGED

Change-Id: Ie2e11ef553af42c803b7d7202ac1002a6364890f
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index e9f4531..e425a73 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -1005,6 +1005,62 @@
   return 0;
 }
 
+static int is_shorter_gf_interval_better(AV1_COMP *cpi,
+                                         EncodeFrameParams *frame_params,
+                                         const EncodeFrameInput *frame_input) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method;
+  int shorten_gf_interval;  // Return value; assigned on every path below.

+  if (gop_length_decision_method == 2) {
+    // Shorten only when the GF boost is below the threshold and tpl stats of
+    // ARFs from base layer, (base+1) layer (gop_eval = 3) evaluate to 0.
+    shorten_gf_interval =
+        (rc->gfu_boost <
+         rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) &&
+        !av1_tpl_setup_stats(cpi, 3, frame_params, frame_input);
+  } else {
+    int do_complete_tpl = 1;
+    GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+    int is_temporal_filter_enabled =
+        (rc->frames_since_key > 0 && gf_group->arf_index > -1);

+    if (is_temporal_filter_enabled) {  // Filter the ARF before any tpl pass.
+      int arf_src_index = gf_group->arf_src_offset[gf_group->arf_index];
+      FRAME_UPDATE_TYPE arf_update_type =
+          gf_group->update_type[gf_group->arf_index];
+      int is_forward_keyframe = 0;
+      av1_temporal_filter(cpi, arf_src_index, arf_update_type,
+                          is_forward_keyframe, NULL);
+      aom_extend_frame_borders(&cpi->alt_ref_buffer,
+                               av1_num_planes(&cpi->common));
+    }

+    if (gop_length_decision_method == 1) {
+      // Check if tpl stats of ARFs from base layer, (base+1) layer,
+      // (base+2) layer can decide the GF group length (2 means undecided).
+      int gop_length_eval =
+          av1_tpl_setup_stats(cpi, 2, frame_params, frame_input);

+      if (gop_length_eval != 2) {
+        do_complete_tpl = 0;
+        shorten_gf_interval = !gop_length_eval;  // 0 means shorten.
+      }
+    }

+    if (do_complete_tpl) {
+      // Decide GF group length based on complete tpl stats.
+      shorten_gf_interval =
+          !av1_tpl_setup_stats(cpi, 1, frame_params, frame_input);
+      // Tpl stats are reused when the ARF is temporally filtered and GF
+      // interval is not shortened.
+      if (is_temporal_filter_enabled && !shorten_gf_interval)
+        cpi->tpl_data.skip_tpl_setup_stats = 1;
+    }
+  }
+  return shorten_gf_interval;
+}
+
 #define MIN_FWD_KF_INTERVAL 8
 #define MIN_SHRINK_LEN 6  // the minimum length of gf if we are shrinking
 #define SMOOTH_FILT_LEN 7
@@ -3694,7 +3750,7 @@
     }
 
     if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model &&
-        !(cpi->sf.tpl_sf.gop_length_decision_method == 2)) {
+        cpi->sf.tpl_sf.gop_length_decision_method != 3) {
       int this_idx = rc->frames_since_key + rc->gf_intervals[rc->cur_gf_index] -
                      rc->regions_offset - 1;
       int this_region =
@@ -3713,35 +3769,7 @@
         define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0);
         this_frame = this_frame_copy;
 
-        int is_temporal_filter_enabled = 0;
-        int shorten_gf_interval = 0;
-        if (!cpi->sf.tpl_sf.gop_length_decision_method) {
-          is_temporal_filter_enabled =
-              (rc->frames_since_key > 0 && gf_group->arf_index > -1);
-          if (is_temporal_filter_enabled) {
-            int arf_src_index = gf_group->arf_src_offset[gf_group->arf_index];
-            FRAME_UPDATE_TYPE arf_update_type =
-                gf_group->update_type[gf_group->arf_index];
-            int is_forward_keyframe = 0;
-            av1_temporal_filter(cpi, arf_src_index, arf_update_type,
-                                is_forward_keyframe, NULL);
-            aom_extend_frame_borders(&cpi->alt_ref_buffer,
-                                     av1_num_planes(&cpi->common));
-          }
-          shorten_gf_interval =
-              !av1_tpl_setup_stats(cpi, 1, frame_params, frame_input);
-          // Tpl stats is reused when the ARF is temporally filtered and gf
-          // interval is not shortened.
-          if (is_temporal_filter_enabled && !shorten_gf_interval)
-            cpi->tpl_data.skip_tpl_setup_stats = 1;
-        } else {
-          // GOP length is decided based on GF boost and approximate tpl model
-          shorten_gf_interval =
-              (rc->gfu_boost <
-               rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) &&
-              !av1_tpl_setup_stats(cpi, 2, frame_params, frame_input);
-        }
-        if (shorten_gf_interval) {
+        if (is_shorter_gf_interval_better(cpi, frame_params, frame_input)) {
           // A shorter gf interval is better.
           // TODO(jingning): Remove redundant computations here.
           max_gop_length = 16;
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 8720f45..bd7ca67 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1007,6 +1007,7 @@
     sf->tpl_sf.prune_starting_mv = 2;
     sf->tpl_sf.subpel_force_stop = HALF_PEL;
     sf->tpl_sf.search_method = FAST_BIGDIA;
+    sf->tpl_sf.gop_length_decision_method = 1;
 
     sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1;
     sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
@@ -1058,7 +1059,7 @@
     sf->tpl_sf.prune_starting_mv = 3;
     sf->tpl_sf.use_y_only_rate_distortion = 1;
     sf->tpl_sf.subpel_force_stop = FULL_PEL;
-    sf->tpl_sf.gop_length_decision_method = 1;
+    sf->tpl_sf.gop_length_decision_method = 2;
 
     sf->winner_mode_sf.dc_blk_pred_level = 1;
   }
@@ -1089,7 +1090,7 @@
     sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL;
     sf->mv_sf.use_bsize_dependent_search_method = 1;
 
-    sf->tpl_sf.gop_length_decision_method = 2;
+    sf->tpl_sf.gop_length_decision_method = 3;
     sf->tpl_sf.disable_filtered_key_tpl = 1;
 
     sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index b85bcb2..750c6c6 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -377,9 +377,11 @@
 typedef struct TPL_SPEED_FEATURES {
   // GOP length adaptive decision.
   // If set to 0, tpl model decides whether a shorter gf interval is better.
-  // If set to 1, approximate tpl model and GF boost decide whether a
-  // shorter gf interval is better. If set to 2, gop length adaptive decision is
-  // disabled.
+  // If set to 1, tpl stats of ARFs from base layer, (base+1) layer and
+  // (base+2) layer decide whether a shorter gf interval is better.
+  // If set to 2, tpl stats of ARFs from base layer, (base+1) layer and GF boost
+  // decide whether a shorter gf interval is better.
+  // If set to 3, gop length adaptive decision is disabled.
   int gop_length_decision_method;
   // Prune the intra modes search by tpl.
   // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index dd53ac2..9d4f1d7 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -1434,6 +1434,26 @@
   }
 }
 
+static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) {
+  switch (gop_eval) {  // Returns 1: don't shorten, 0: shorten, 2: undecided.
+    case 1:
+      // Allow larger GOP size (return 1) only if the base layer ARF's
+      // dependency factor is at least 0.7 above the intermediate ARF's and
+      // is itself above 8.0; otherwise return 0.
+      return (beta[0] >= beta[1] + 0.7) && beta[0] > 8.0;
+    case 2:
+      if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6)
+        return 1;  // Don't shorten the gf interval
+      else if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4)
+        return 0;  // Shorten the gf interval
+      else
+        return 2;  // Cannot decide the gf interval, so the caller redoes
+                   // the tpl stats calculation.
+    case 3: return beta[0] > 1.1;  // Only beta[0] is compared here.
+    default: return 2;  // Unrecognized gop_eval: report undecided.
+  }
+}
+
 int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
                         const EncodeFrameParams *const frame_params,
                         const EncodeFrameInput *const frame_input) {
@@ -1447,7 +1467,14 @@
   int bottom_index, top_index;
   EncodeFrameParams this_frame_params = *frame_params;
   TplParams *const tpl_data = &cpi->tpl_data;
-  int approx_gop_eval = (gop_eval == 2);
+  int approx_gop_eval = (gop_eval > 1);
+  int num_arf_layers = MAX_ARF_LAYERS;
+
+  // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base
+  // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3,
+  // tpl stats calculation is limited to ARFs from base layer and (base+1)
+  // layer.
+  if (approx_gop_eval) num_arf_layers = (gop_eval == 2) ? 3 : 2;
 
   if (cpi->superres_mode != AOM_SUPERRES_NONE) {
     assert(cpi->superres_mode != AOM_SUPERRES_AUTO);
@@ -1495,18 +1522,20 @@
   av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
                     cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs);
 
-  // When approx_gop_eval = 1 tpl stats calculation is done for base layer
-  // and the next layer ARF.
-  int frame_idx_end =
-      approx_gop_eval ? AOMMIN(tpl_gf_group_frames - 1, gf_group->arf_index + 1)
-                      : tpl_gf_group_frames - 1;
+  const int gop_length = get_gop_length(gf_group);
   // Backward propagation from tpl_group_frames to 1.
-  for (int frame_idx = cpi->gf_frame_index; frame_idx <= frame_idx_end;
+  for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames;
        ++frame_idx) {
     if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
         gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
       continue;
 
+    // When approx_gop_eval = 1, skip tpl stats calculation for higher layer
+    // frames and for frames beyond gop length.
+    if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
+                            frame_idx >= gop_length))
+      continue;
+
     init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
     if (mt_info->num_workers > 1) {
       tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read;
@@ -1521,12 +1550,16 @@
                              av1_num_planes(cm));
   }
 
-  for (int frame_idx = frame_idx_end; frame_idx >= cpi->gf_frame_index;
-       --frame_idx) {
+  for (int frame_idx = tpl_gf_group_frames - 1;
+       frame_idx >= cpi->gf_frame_index; --frame_idx) {
     if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
         gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
       continue;
 
+    if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
+                            frame_idx >= gop_length))
+      continue;
+
     mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows,
                         cm->mi_params.mi_cols);
   }
@@ -1589,12 +1622,7 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, av1_tpl_setup_stats_time);
 #endif
-  if (approx_gop_eval) return beta[0] > 1.1;
-
-  // Allow larger GOP size if the base layer ARF has higher dependency factor
-  // than the intermediate ARF and both ARFs have reasonably high dependency
-  // factors.
-  return (beta[0] >= beta[1] + 0.7) && beta[0] > 8.0;
+  return eval_gop_length(beta, gop_eval);
 }
 
 void av1_tpl_rdmult_setup(AV1_COMP *cpi) {