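Prune uv intra mode with hog

Add the chroma_intra_pruning_with_hog speed feature. When enabled, the
histogram-of-gradients (HOG) pruning already used for luma directional
intra modes is also run on the U plane, and the resulting skip mask is
used to skip directional modes in the chroma (uv) intra mode search.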

Performance on hdres over 70 frames, vbr mode
 SPD_SET | AVG_PSNR | OVR_PSNR |   SSIM  |  SPD
    6    |  +0.187% |  +0.185% | +0.229% | +3.0%
    5    |  +0.123% |  +0.125% | +0.195% | +2.0%
    3    |  +0.088% |  +0.094% | +0.088% | +2.2%

STATS_CHANGED

Change-Id: I4fd8a3b643e67dbf826d7b248af5f6a409fd2081
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index 0f9ecfe..b9f311e 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -9,11 +9,13 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/reconintra.h"
 
 #include "av1/encoder/intra_mode_search.h"
 #include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/palette.h"
+#include "av1/encoder/speed_features.h"
 #include "av1/encoder/tx_search.h"
 
 /*!\cond */
@@ -472,6 +474,8 @@
                                  cpi->optimize_seg_arr[mbmi->segment_id]);
     xd->cfl.store_y = 0;
   }
+  IntraModeSearchState intra_search_state;
+  init_intra_mode_search_state(&intra_search_state);
 
   // Search through all non-palette modes.
   for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
@@ -503,6 +507,26 @@
 
     if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) &&
         intra_mode_cfg->enable_angle_delta) {
+      const SPEED_FEATURES *sf = &cpi->sf;
+      if (sf->intra_sf.chroma_intra_pruning_with_hog &&
+          !intra_search_state.dir_mode_skip_mask_ready) {
+        static const float thresh[2][4] = {
+          { -1.2f, 0.0f, 0.0f, 1.2f },    // Interframe
+          { -1.2f, -1.2f, -0.6f, 0.4f },  // Intraframe
+        };
+        const int is_chroma = 1;
+        const int is_intra_frame = frame_is_intra_only(cm);
+        prune_intra_mode_with_hog(
+            x, bsize,
+            thresh[is_intra_frame]
+                  [sf->intra_sf.chroma_intra_pruning_with_hog - 1],
+            intra_search_state.directional_mode_skip_mask, is_chroma);
+        intra_search_state.dir_mode_skip_mask_ready = 1;
+      }
+      if (intra_search_state.directional_mode_skip_mask[mode]) {
+        continue;
+      }
+
       // Search through angle delta
       const int rate_overhead =
           mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
@@ -884,9 +908,10 @@
     if (sf->intra_sf.intra_pruning_with_hog &&
         !intra_search_state->dir_mode_skip_mask_ready) {
       const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f };
-      prune_intra_mode_with_hog(x, bsize,
-                                thresh[sf->intra_sf.intra_pruning_with_hog - 1],
-                                intra_search_state->directional_mode_skip_mask);
+      const int is_chroma = 0;
+      prune_intra_mode_with_hog(
+          x, bsize, thresh[sf->intra_sf.intra_pruning_with_hog - 1],
+          intra_search_state->directional_mode_skip_mask, is_chroma);
       intra_search_state->dir_mode_skip_mask_ready = 1;
     }
     if (intra_search_state->directional_mode_skip_mask[mode]) return 0;
@@ -1030,9 +1055,10 @@
     // Less aggressive thresholds are used here than those used in inter frame
     // encoding.
     const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f };
+    const int is_chroma = 0;
     prune_intra_mode_with_hog(
         x, bsize, thresh[cpi->sf.intra_sf.intra_pruning_with_hog - 1],
-        directional_mode_skip_mask);
+        directional_mode_skip_mask, is_chroma);
   }
   mbmi->filter_intra_mode_info.use_filter_intra = 0;
   pmi->palette_size[0] = 0;
diff --git a/av1/encoder/intra_mode_search.h b/av1/encoder/intra_mode_search.h
index 86a546e..cc2a87b0 100644
--- a/av1/encoder/intra_mode_search.h
+++ b/av1/encoder/intra_mode_search.h
@@ -254,6 +254,14 @@
                              int *val_count_8bit, int *num_color_bins,
                              int *num_colors);
 
+/*! \brief Initializes the \ref IntraModeSearchState struct: zeroes every
+ * field, then sets rate_uv_intra to INT_MAX. */
+static AOM_INLINE void init_intra_mode_search_state(
+    IntraModeSearchState *intra_search_state) {
+  memset(intra_search_state, 0, sizeof(*intra_search_state));
+  intra_search_state->rate_uv_intra = INT_MAX;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h
index f901130..8ec7cb3 100644
--- a/av1/encoder/intra_mode_search_utils.h
+++ b/av1/encoder/intra_mode_search_utils.h
@@ -191,18 +191,24 @@
 
 static AOM_INLINE void prune_intra_mode_with_hog(
     const MACROBLOCK *x, BLOCK_SIZE bsize, float th,
-    uint8_t *directional_mode_skip_mask) {
+    uint8_t *directional_mode_skip_mask, int is_chroma) {
   aom_clear_system_state();
 
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
   const int bh = block_size_high[bsize];
   const int bw = block_size_wide[bsize];
-  const MACROBLOCKD *xd = &x->e_mbd;
   const int rows =
-      (xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh;
+      ((xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh) >>
+      ss_y;
   const int cols =
-      (xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw;
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *src = x->plane[0].src.buf;
+      ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >>
+      ss_x;
+  const int src_stride = x->plane[plane].src.stride;
+  const uint8_t *src = x->plane[plane].src.buf;
   float hist[BINS] = { 0.0f };
   if (is_cur_buf_hbd(xd)) {
     generate_hog_hbd(src, src_stride, rows, cols, hist);
@@ -210,6 +216,10 @@
     generate_hog(src, src_stride, rows, cols, hist);
   }
 
+  for (int b = 0; b < BINS; ++b) {
+    hist[b] *= (1 + ss_x) * (1 + ss_y);
+  }
+
   for (int i = 0; i < DIRECTIONAL_MODES; ++i) {
     float this_score = intra_hog_model_bias[i];
     const float *weights = &intra_hog_model_weights[i * BINS];
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index d3c0a77..c6b2d16 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -3876,16 +3876,6 @@
   x->comp_rd_stats_idx = 0;
 }
 
-static AOM_INLINE void init_intra_mode_search_state(
-    IntraModeSearchState *intra_search_state) {
-  intra_search_state->skip_intra_modes = 0;
-  intra_search_state->best_intra_mode = DC_PRED;
-  intra_search_state->dir_mode_skip_mask_ready = 0;
-  av1_zero(intra_search_state->directional_mode_skip_mask);
-  intra_search_state->rate_uv_intra = INT_MAX;
-  av1_zero(intra_search_state->pmi_uv);
-}
-
 static AOM_INLINE void init_inter_mode_search_state(
     InterModeSearchState *search_state, const AV1_COMP *cpi,
     const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) {
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index fb27aca..ee1f890 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -460,6 +460,7 @@
     sf->inter_sf.skip_repeated_newmv = 1;
 
     sf->interp_sf.use_interp_filter = 1;
+
     sf->intra_sf.prune_palette_search_level = 1;
 
     sf->tx_sf.adaptive_txb_search_level = 2;
@@ -557,6 +558,10 @@
     sf->inter_sf.txfm_rd_gate_level =
         boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2);
 
+    // TODO(chiyotsai@google.com): the thresholds used for chroma hog pruning
+    // are inherited directly from luma hog with some minor tweaking. Eventually
+    // we should run this with a Bayesian optimizer to find the Pareto frontier.
+    sf->intra_sf.chroma_intra_pruning_with_hog = 2;
     sf->intra_sf.intra_pruning_with_hog = 3;
     sf->intra_sf.prune_palette_search_level = 2;
 
@@ -666,6 +671,8 @@
     sf->inter_sf.prune_inter_modes_if_skippable = 1;
     sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 5;
 
+    sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
     // TODO(any): Extend multi-winner mode processing support for inter frames
     sf->winner_mode_sf.multi_winner_mode_type =
         frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_FAST
@@ -690,6 +697,7 @@
     sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
     sf->inter_sf.prune_nearmv_using_neighbors = 1;
 
+    sf->intra_sf.chroma_intra_pruning_with_hog = 4;
     sf->intra_sf.intra_pruning_with_hog = 4;
 
     sf->part_sf.prune_rectangular_split_based_on_qidx =
@@ -1182,6 +1190,7 @@
 }
 
 static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) {
+  intra_sf->chroma_intra_pruning_with_hog = 0;
   intra_sf->skip_intra_in_interframe = 1;
   intra_sf->intra_pruning_with_hog = 0;
   intra_sf->src_var_thresh_intra_skip = 1;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 7aae39e..9f2a99b 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -806,8 +806,13 @@
   unsigned int src_var_thresh_intra_skip;
 
   // Prune intra mode candidates based on source block histogram of gradient.
+  // Applies to luma plane only.
   int intra_pruning_with_hog;
 
+  // Prune intra mode candidates based on source block histogram of gradient.
+  // Applies to chroma plane only.
+  int chroma_intra_pruning_with_hog;
+
   // Enable/disable smooth intra modes.
   int disable_smooth_intra;