Prune Wiener search

Skip the Wiener filter search for a restoration unit when its source
variance is below a quantizer-dependent threshold, or when its
reconstruction error is already zero. Enabled for speed presets 3 and 4.

STATS_CHANGED

Change-Id: Ie668cf58cfe298c6d3fc052a07052e3abfc258c4
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 1680d5f..6006367 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -613,6 +613,10 @@
 
     add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
     specialize qw/aom_sum_squares_i16 sse2/;
+
+    # Window variance kernels; C only, no SIMD specializations are registered.
+    add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
+    add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
   }
 
   #
diff --git a/aom_dsp/psnr.c b/aom_dsp/psnr.c
index 35f8fde..370bd75 100644
--- a/aom_dsp/psnr.c
+++ b/aom_dsp/psnr.c
@@ -176,6 +176,27 @@
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
+uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height) {
+  // Per-sample variance of the luma window starting at (hstart, vstart).
+  uint8_t *const win = a->y_buffer + vstart * a->y_stride + hstart;
+  return aom_var_2d_u8_c(win, a->y_stride, width, height) / (width * height);
+}
+
+uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height) {
+  // Per-sample variance of the U window starting at (hstart, vstart).
+  uint8_t *const win = a->u_buffer + vstart * a->uv_stride + hstart;
+  return aom_var_2d_u8_c(win, a->uv_stride, width, height) / (width * height);
+}
+
+uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height) {
+  // Per-sample variance of the V window starting at (hstart, vstart).
+  uint8_t *const win = a->v_buffer + vstart * a->uv_stride + hstart;
+  return aom_var_2d_u8_c(win, a->uv_stride, width, height) / (width * height);
+}
+
 int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                            const YV12_BUFFER_CONFIG *b, int hstart, int width,
                            int vstart, int height) {
@@ -228,6 +249,27 @@
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
+uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height) {
+  // Same as the 8-bit luma getter, but routed through the 16-bit kernel.
+  uint8_t *const win = a->y_buffer + vstart * a->y_stride + hstart;
+  return aom_var_2d_u16_c(win, a->y_stride, width, height) / (width * height);
+}
+
+uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height) {
+  // Same as the 8-bit U getter, but routed through the 16-bit kernel.
+  uint8_t *const win = a->u_buffer + vstart * a->uv_stride + hstart;
+  return aom_var_2d_u16_c(win, a->uv_stride, width, height) / (width * height);
+}
+
+uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height) {
+  // Same as the 8-bit V getter, but routed through the 16-bit kernel.
+  uint8_t *const win = a->v_buffer + vstart * a->uv_stride + hstart;
+  return aom_var_2d_u16_c(win, a->uv_stride, width, height) / (width * height);
+}
+
 int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                                   const YV12_BUFFER_CONFIG *b, int hstart,
                                   int width, int vstart, int height) {
diff --git a/aom_dsp/psnr.h b/aom_dsp/psnr.h
index 99aa54c..7f40b8b 100644
--- a/aom_dsp/psnr.h
+++ b/aom_dsp/psnr.h
@@ -35,6 +35,12 @@
  * \param[in]    sse           Sum of squared errors
  */
 double aom_sse_to_psnr(double samples, double peak, double sse);
+uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height);  // variance of a Y window
+uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height);  // variance of a U window
+uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height);  // variance of a V window
 int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                            const YV12_BUFFER_CONFIG *b, int hstart, int width,
                            int vstart, int height);
@@ -50,6 +56,12 @@
 int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, int plane, int highbd);
 #if CONFIG_AV1_HIGHBITDEPTH
+uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height);  // Y, 16-bit
+uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height);  // U, 16-bit
+uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height);  // V, 16-bit
 int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                                   const YV12_BUFFER_CONFIG *b, int hstart,
                                   int width, int vstart, int height);
diff --git a/aom_dsp/sum_squares.c b/aom_dsp/sum_squares.c
index 44ec41f..d739a60 100644
--- a/aom_dsp/sum_squares.c
+++ b/aom_dsp/sum_squares.c
@@ -38,3 +38,36 @@
 
   return ss;
 }
+
+uint64_t aom_var_2d_u8_c(uint8_t *src, int src_stride, int width, int height) {
+  // Gather the sum and the sum of squares of every sample in the window.
+  uint64_t sum = 0, sum_sq = 0;
+  const uint8_t *row = src;
+  int x, y;
+  for (y = 0; y < height; ++y, row += src_stride) {
+    for (x = 0; x < width; ++x) {
+      const int px = row[x];
+      sum += px;
+      sum_sq += px * px;
+    }
+  }
+  // sum_sq - sum^2 / N, i.e. N times the variance in truncating integer math.
+  return sum_sq - sum * sum / (width * height);
+}
+
+// Returns sum(v^2) - sum(v)^2 / (width * height) for the 16-bit samples behind
+// CONVERT_TO_SHORTPTR(src); src_stride is counted in samples.
+uint64_t aom_var_2d_u16_c(uint8_t *src, int src_stride, int width, int height) {
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  int r, c;
+  uint64_t ss = 0, s = 0;
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const uint16_t v = srcp[c];
+      ss += (uint64_t)v * v;  // widen: int v * v may overflow
+      s += v;
+    }
+    srcp += src_stride;
+  }
+  return (ss - s * s / (width * height));
+}
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 7b9d874..46b697c 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -63,6 +63,9 @@
                                            const YV12_BUFFER_CONFIG *b,
                                            int hstart, int width, int vstart,
                                            int height);
+typedef uint64_t (*var_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+                                            int hstart, int width, int vstart,
+                                            int height);  // e.g. aom_get_y_var
 
 #if CONFIG_AV1_HIGHBITDEPTH
 #define NUM_EXTRACTORS (3 * (1 + 1))
@@ -71,11 +74,18 @@
   aom_get_v_sse_part,        aom_highbd_get_y_sse_part,
   aom_highbd_get_u_sse_part, aom_highbd_get_v_sse_part,
 };
+static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = {
+  aom_get_y_var,        aom_get_u_var,        aom_get_v_var,
+  aom_highbd_get_y_var, aom_highbd_get_u_var, aom_highbd_get_v_var,
+};  // looked up as 3 * highbd + plane
 #else
 #define NUM_EXTRACTORS 3
 static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = {
   aom_get_y_sse_part, aom_get_u_sse_part, aom_get_v_sse_part
 };
+static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = {
+  aom_get_y_var, aom_get_u_var, aom_get_v_var
+};  // looked up as 3 * highbd + plane; only highbd == 0 is in range here
 #endif
 
 static int64_t sse_restoration_unit(const RestorationTileLimits *limits,
@@ -87,6 +97,14 @@
       limits->v_start, limits->v_end - limits->v_start);
 }
 
+static uint64_t var_restoration_unit(const RestorationTileLimits *limits,
+                                     const YV12_BUFFER_CONFIG *src, int plane,
+                                     int highbd) {
+  const var_part_extractor_type fn = var_part_extractors[3 * highbd + plane];
+  const int unit_w = limits->h_end - limits->h_start;  // horizontal extent
+  const int unit_h = limits->v_end - limits->v_start;  // vertical extent
+  return fn(src, limits->h_start, unit_w, limits->v_start, unit_h);
+}
+
 typedef struct {
   // The best coefficients for Wiener or Sgrproj restoration
   WienerInfo wiener;
@@ -1440,6 +1458,36 @@
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
+  const MACROBLOCK *const x = rsc->x;
+  const int64_t bits_none = x->wiener_restore_cost[0];
+
+  // Skip Wiener search for low variance contents
+  if (rsc->sf->lpf_sf.prune_wiener_based_on_src_var) {
+    const int scale[3] = { 0, 1, 2 };
+    // Obtain the normalized Qscale
+    const int qs = av1_dc_quant_QTX(rsc->cm->base_qindex, 0,
+                                    rsc->cm->seq_params.bit_depth) >>
+                   3;
+    // Derive threshold as sqr(normalized Qscale) * scale / 16.
+    const uint64_t thresh =
+        (qs * qs * scale[rsc->sf->lpf_sf.prune_wiener_based_on_src_var]) >> 4;
+    const int highbd = rsc->cm->seq_params.use_highbitdepth;
+    const uint64_t src_var =
+        var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
+    // Do not perform Wiener search if source variance is lower than threshold
+    // or if the reconstruction error is zero
+    int prune_wiener = (src_var < thresh) || (rusi->sse[RESTORE_NONE] == 0);
+    if (prune_wiener) {
+      rsc->bits += bits_none;
+      rsc->sse += rusi->sse[RESTORE_NONE];
+      rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+      rusi->sse[RESTORE_WIENER] = INT64_MAX;
+      if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2)
+        rusi->skip_sgr_eval = 1;
+      return;
+    }
+  }
+
   const int wiener_win =
       (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
 
@@ -1470,8 +1518,6 @@
                     limits->h_start, limits->h_end, limits->v_start,
                     limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
 #endif
-  const MACROBLOCK *const x = rsc->x;
-  const int64_t bits_none = x->wiener_restore_cost[0];
 
   if (!wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter)) {
     rsc->bits += bits_none;
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 52a6a8a..6424f30 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -456,6 +456,7 @@
             : gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE ? 1
                                                                          : 2;
 
+    sf->lpf_sf.prune_wiener_based_on_src_var = 1;
     sf->lpf_sf.prune_sgr_based_on_wiener =
         cm->allow_screen_content_tools ? 0 : 2;
     sf->lpf_sf.reduce_wiener_window_size = is_boosted_arf2_bwd_type ? 0 : 1;
@@ -507,6 +508,7 @@
     sf->lpf_sf.disable_loop_restoration_chroma =
         (boosted || cm->allow_screen_content_tools) ? 0 : 1;
     sf->lpf_sf.reduce_wiener_window_size = !boosted;
+    sf->lpf_sf.prune_wiener_based_on_src_var = 2;
 
     // TODO(any): The following features have no impact on quality and speed,
     // and are disabled.
@@ -1034,6 +1036,7 @@
 
 static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
   lpf_sf->disable_loop_restoration_chroma = 0;
+  lpf_sf->prune_wiener_based_on_src_var = 0;
   lpf_sf->prune_sgr_based_on_wiener = 0;
   lpf_sf->enable_sgr_ep_pruning = 0;
   lpf_sf->reduce_wiener_window_size = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index f5e7677..2e896a0 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -825,6 +825,12 @@
   // Disable loop restoration for Chroma plane
   int disable_loop_restoration_chroma;
 
+  // Prune RESTORE_WIENER evaluation based on source variance
+  // 0 : no pruning
+  // 1 : conservative pruning
+  // 2 : aggressive pruning
+  int prune_wiener_based_on_src_var;
+
   // Prune self-guided loop restoration based on wiener search results
   // 0 : no pruning
   // 1 : pruning based on rdcost ratio of RESTORE_WIENER and RESTORE_NONE