Skip wavelet energy calculation in first pass

The wavelet energy stats calculated in the first pass are
required only in perceptual deltaq mode and when an ML model
is used to predict a flat GOP structure.
Thus, wavelet energy calculation is enabled only
for those cases and disabled for all others.

          Instruction Count
cpu-used      Reduction(%)
   3              0.40
   4              0.59
   5              1.14
   6              1.50

Change-Id: Iac98f580026a155e9442fb7147cf6d80ddf68517
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index c6e3852..01f2959 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -1501,9 +1501,7 @@
 
   cpi->skip_tpl_setup_stats = 0;
 #if !CONFIG_REALTIME_ONLY
-  const int use_one_pass_rt_params = has_no_stats_stage(cpi) &&
-                                     oxcf->mode == REALTIME &&
-                                     gf_cfg->lag_in_frames == 0;
+  const int use_one_pass_rt_params = is_one_pass_rt_params(cpi);
   if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, av1_get_second_pass_params_time);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 0d2e0c3..afeec65 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -51,6 +51,7 @@
 #include "av1/encoder/aq_variance.h"
 #include "av1/encoder/bitstream.h"
 #include "av1/encoder/context_tree.h"
+#include "av1/encoder/dwt.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encode_strategy.h"
@@ -3086,6 +3087,44 @@
     av1_set_screen_content_options(cpi, features);
   }
 
+#if !CONFIG_REALTIME_ONLY
+  if (is_one_pass_rt_params(cpi) == 0) {
+    TWO_PASS *const twopass = &cpi->ppi->twopass;
+    const FIRSTPASS_STATS *const total_stats =
+        twopass->stats_buf_ctx->total_stats;
+    if ((oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) &&
+        is_fp_wavelet_energy_invalid(total_stats)) {
+      const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+                              ? cpi->initial_mbs
+                              : cm->mi_params.MBs;
+      const YV12_BUFFER_CONFIG *const unfiltered_source =
+          cpi->unfiltered_source;
+      const uint8_t *const src = unfiltered_source->y_buffer;
+      const int hbd = unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH;
+      const int stride = unfiltered_source->y_stride;
+      const BLOCK_SIZE fp_block_size =
+          get_fp_block_size(cpi->is_screen_content_type);
+      const int fp_block_size_width = block_size_wide[fp_block_size];
+      const int fp_block_size_height = block_size_high[fp_block_size];
+      const int num_unit_cols =
+          get_num_blocks(unfiltered_source->y_crop_width, fp_block_size_width);
+      const int num_unit_rows = get_num_blocks(unfiltered_source->y_crop_height,
+                                               fp_block_size_height);
+      const int num_8x8_cols = num_unit_cols * (fp_block_size_width / 8);
+      const int num_8x8_rows = num_unit_rows * (fp_block_size_height / 8);
+      int64_t frame_avg_wavelet_energy = 0;
+      for (int r8 = 0; r8 < num_8x8_rows; ++r8) {
+        for (int c8 = 0; c8 < num_8x8_cols; ++c8) {
+          frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
+              src + c8 * 8 + r8 * 8 * stride, stride, hbd);
+        }
+      }
+      twopass->frame_avg_haar_energy =
+          log(((double)frame_avg_wavelet_energy / num_mbs) + 1.0);
+    }
+  }
+#endif
+
   // frame type has been decided outside of this function call
   cm->cur_frame->frame_type = current_frame->frame_type;
 
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 89c1ac1..fe6e76f 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -3325,6 +3325,11 @@
   return (rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 200);
 }
 
+// Returns the number of mb_length-sized blocks needed to cover frame_length.
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+  return (frame_length + mb_length - 1) / mb_length;
+}
+
 // Check if statistics generation stage
 static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) {
   assert(IMPLIES(cpi->compressor_stage == LAP_STAGE,
@@ -3357,8 +3362,14 @@
       IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
   return (cpi->oxcf.pass == 0 && !cpi->ppi->lap_enabled);
 }
+
 /*!\cond */
 
+static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) {
+  return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+         cpi->oxcf.gf_cfg.lag_in_frames == 0;
+}
+
 // Function return size of frame stats buffer
 static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
   /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 52dc514..900cfa0 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -365,6 +365,12 @@
   return use_ml_model_to_decide_flat_gop(rc_cfg) && can_disable_altref(gf_cfg);
 }
 
+static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) {
+  return (use_ml_model_to_decide_flat_gop(&oxcf->rc_cfg) &&
+          can_disable_altref(&oxcf->gf_cfg)) ||
+         (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL);
+}
+
 #define UL_INTRA_THRESH 50
 #define INVALID_ROW -1
 // Computes and returns the intra pred error of a block.
@@ -489,16 +495,26 @@
   // Accumulate the intra error.
   stats->intra_error += (int64_t)this_intra_error;
 
-  const int hbd = is_cur_buf_hbd(xd);
-  const int stride = x->plane[0].src.stride;
-  const int num_8x8_rows = block_size_high[fp_block_size] / 8;
-  const int num_8x8_cols = block_size_wide[fp_block_size] / 8;
-  const uint8_t *buf = x->plane[0].src.buf;
-  for (int r8 = 0; r8 < num_8x8_rows; ++r8) {
-    for (int c8 = 0; c8 < num_8x8_cols; ++c8) {
-      stats->frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
-          buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
+  // Stats based on wavelet energy are used in the following cases:
+  // 1. ML model which predicts if a flat structure (golden-frame only structure
+  // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in
+  // constant quality mode under certain conditions.
+  // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL.
+  // Thus, wavelet energy calculation is enabled only for the above cases.
+  if (calc_wavelet_energy(&cpi->oxcf)) {
+    const int hbd = is_cur_buf_hbd(xd);
+    const int stride = x->plane[0].src.stride;
+    const int num_8x8_rows = block_size_high[fp_block_size] / 8;
+    const int num_8x8_cols = block_size_wide[fp_block_size] / 8;
+    const uint8_t *buf = x->plane[0].src.buf;
+    for (int r8 = 0; r8 < num_8x8_rows; ++r8) {
+      for (int c8 = 0; c8 < num_8x8_cols; ++c8) {
+        stats->frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
+            buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
+      }
     }
+  } else {
+    stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP;
   }
 
   return this_intra_error;
@@ -1197,7 +1213,8 @@
 
   // Unit size for the first pass encoding.
   const BLOCK_SIZE fp_block_size =
-      cpi->is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16;
+      get_fp_block_size(cpi->is_screen_content_type);
+
   // Number of rows in the unit size.
   // Note mi_params->mb_rows and mi_params->mb_cols are in the unit of 16x16.
   const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows);
diff --git a/av1/encoder/firstpass.h b/av1/encoder/firstpass.h
index 2e26d13..122912f 100644
--- a/av1/encoder/firstpass.h
+++ b/av1/encoder/firstpass.h
@@ -352,6 +352,15 @@
 struct AV1EncoderConfig;
 struct TileDataEnc;
 
+static INLINE int is_fp_wavelet_energy_invalid(
+    const FIRSTPASS_STATS *fp_stats) {
+  return (fp_stats->frame_avg_wavelet_energy < 0);
+}
+
+static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) {
+  return (is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16);
+}
+
 int av1_get_unit_rows_in_tile(TileInfo tile, const BLOCK_SIZE fp_block_size);
 int av1_get_unit_cols_in_tile(TileInfo tile, const BLOCK_SIZE fp_block_size);
 
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index d9ba423..e3639f7 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -44,8 +44,11 @@
 #define DEFAULT_GF_BOOST 2000
 #define GROUP_ADAPTIVE_MAXQ 1
 
-#define IS_FP_STATS_TO_PREDICT_FLAT_GOP_INVALID(fp_stats) \
-  (((fp_stats)->tr_coded_error < 0) || ((fp_stats)->pcnt_third_ref < 0))
+static INLINE int is_fp_stats_to_predict_flat_gop_invalid(
+    const FIRSTPASS_STATS *fp_stats) {
+  return ((fp_stats->tr_coded_error < 0) || (fp_stats->pcnt_third_ref < 0) ||
+          (fp_stats->frame_avg_wavelet_energy < 0));
+}
 
 static void init_gf_stats(GF_GROUP_STATS *gf_stats);
 
@@ -2432,7 +2435,7 @@
     FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
     // TODO(urvang): Improve and use model for VBR, CQ etc as well.
     if (use_alt_ref && use_ml_model_to_decide_flat_gop(rc_cfg) &&
-        !IS_FP_STATS_TO_PREDICT_FLAT_GOP_INVALID(total_stats)) {
+        !is_fp_stats_to_predict_flat_gop_invalid(total_stats)) {
       aom_clear_system_state();
       float features[21];
       get_features_from_gf_stats(
@@ -3395,8 +3398,13 @@
   // The multiplication by 256 reverses a scaling factor of (>> 8)
   // applied when combining MB error values for the frame.
   twopass->mb_av_energy = log((this_frame_ptr->intra_error / num_mbs) + 1.0);
-  twopass->frame_avg_haar_energy =
-      log((this_frame_ptr->frame_avg_wavelet_energy / num_mbs) + 1.0);
+
+  const FIRSTPASS_STATS *const total_stats =
+      twopass->stats_buf_ctx->total_stats;
+  if (is_fp_wavelet_energy_invalid(total_stats) == 0) {
+    twopass->frame_avg_haar_energy =
+        log((this_frame_ptr->frame_avg_wavelet_energy / num_mbs) + 1.0);
+  }
 
   // Set the frame content type flag.
   if (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH)
@@ -3417,7 +3425,7 @@
     const GFConfig *const gf_cfg = &cpi->oxcf.gf_cfg;
     const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
     if (use_ml_model_to_decide_flat_gop(rc_cfg) && can_disable_altref(gf_cfg) &&
-        IS_FP_STATS_TO_PREDICT_FLAT_GOP_INVALID(total_stats)) {
+        is_fp_stats_to_predict_flat_gop_invalid(total_stats)) {
       // warn(
       //     "First pass stats required in the ML model to predict a flat GOP "
       //     "structure is invalid. Continuing encoding by disabling the ML "
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index d2f728d..3b95637 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -284,11 +284,6 @@
   aom_free(tf_data->pred);
 }
 
-// Helper function to compute number of blocks on either side of the frame.
-static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
-  return (frame_length + mb_length - 1) / mb_length;
-}
-
 // Saves the state prior to temporal filter process.
 // Inputs:
 //   mbd: Pointer to the block for filtering.