Enable Loop restoration with Wiener filter for speed 5 and 6

In the parent version, Loop restoration (i.e., both the Wiener and
Self-guided filters) is disabled for speed 5 and 6. This CL splits
the sf 'disable_lr_filter' into 'disable_wiener_filter' and
'disable_sgr_filter' so that the Wiener Loop restoration filter
alone can be enabled for speed 5 and 6. It also introduces the sf
'disable_wiener_coeff_refine_search' to skip the refinement search
around the Wiener filter coefficients, as the refinement does not
have much impact on quality relative to its cost in speed.

For 'good' encoding mode,

          Instruction Count       BD-Rate Loss(%)
cpu-used    Reduction(%)     avg.psnr  ovr.psnr   ssim
   5          -0.668        -0.5327   -0.5322    -0.5161
   6          -0.926        -0.5467   -0.5441    -0.5223

STATS_CHANGED

Change-Id: I82bc416c3532f263bf837259c4b72d7fe622f176
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index 8117caf..6e95f70 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -289,12 +289,12 @@
 }
 
 // Assumes cm->rst_info[p].restoration_unit_size is already initialized
-void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
+void av1_alloc_restoration_buffers(AV1_COMMON *cm, bool is_sgr_enabled) {
   const int num_planes = av1_num_planes(cm);
   for (int p = 0; p < num_planes; ++p)
     av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
 
-  if (cm->rst_tmpbuf == NULL) {
+  if (cm->rst_tmpbuf == NULL && is_sgr_enabled) {
     CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
                     (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
   }
diff --git a/av1/common/alloccommon.h b/av1/common/alloccommon.h
index fc4a8ba..d31b4c5 100644
--- a/av1/common/alloccommon.h
+++ b/av1/common/alloccommon.h
@@ -14,6 +14,8 @@
 
 #define INVALID_IDX -1  // Invalid buffer index.
 
+#include <stdbool.h>
+
 #include "config/aom_config.h"
 
 #include "av1/common/enums.h"
@@ -48,7 +50,7 @@
 void av1_free_cdef_buffers(struct AV1Common *const cm,
                            struct AV1CdefWorker **cdef_worker,
                            struct AV1CdefSyncData *cdef_sync);
-void av1_alloc_restoration_buffers(struct AV1Common *cm);
+void av1_alloc_restoration_buffers(struct AV1Common *cm, bool is_sgr_enabled);
 void av1_free_restoration_buffers(struct AV1Common *cm);
 
 int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index d273c79..5b76de8 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -10,6 +10,7 @@
  */
 
 #include <assert.h>
+#include <stdbool.h>
 #include <stddef.h>
 
 #include "config/aom_config.h"
@@ -5217,7 +5218,7 @@
   if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
       cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
       cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
-    av1_alloc_restoration_buffers(cm);
+    av1_alloc_restoration_buffers(cm, /*is_sgr_enabled =*/true);
   }
 
   const int use_highbd = cm->seq_params->use_highbitdepth;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index f183e15..3493de8 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -2253,7 +2253,8 @@
     for (int i = 0; i < num_planes; ++i)
       cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
 
-    av1_alloc_restoration_buffers(cm);
+    const bool is_sgr_enabled = !cpi->sf.lpf_sf.disable_sgr_filter;
+    av1_alloc_restoration_buffers(cm, is_sgr_enabled);
     // Store the allocated restoration buffers in MT object.
     if (cpi->ppi->p_mt_info.num_workers > 1) {
       av1_init_lr_mt_buffers(cpi);
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index c558ee6..f2fc6ab 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -32,10 +32,6 @@
 #include "av1/encoder/picklpf.h"
 #include "av1/encoder/pickrst.h"
 
-// When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed.
-// When set to RESTORE_TYPES we allow switchable.
-static const RestorationType force_restore_type = RESTORE_TYPES;
-
 // Number of Wiener iterations
 #define NUM_WIENER_ITERS 5
 
@@ -1459,7 +1455,6 @@
   return bits;
 }
 
-#define USE_WIENER_REFINEMENT_SEARCH 1
 static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
                                         const RestorationTileLimits *limits,
                                         const PixelRect *tile,
@@ -1467,7 +1462,10 @@
                                         int wiener_win) {
   const int plane_off = (WIENER_WIN - wiener_win) >> 1;
   int64_t err = try_restoration_unit(rsc, limits, tile, rui);
-#if USE_WIENER_REFINEMENT_SEARCH
+
+  if (rsc->lpf_sf->disable_wiener_coeff_refine_search) return err;
+
+  // Refinement search around the wiener filter coefficients.
   int64_t err2;
   int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV,
                     WIENER_FILT_TAP2_MINV };
@@ -1563,7 +1561,6 @@
     }
   }
   // printf("err post = %"PRId64"\n", err);
-#endif  // USE_WIENER_REFINEMENT_SEARCH
   return err;
 }
 
@@ -1818,6 +1815,24 @@
   return rsi->units_per_tile;
 }
 
+static INLINE void av1_derive_flags_for_lr_processing(
+    const LOOP_FILTER_SPEED_FEATURES *lpf_sf, bool *disable_lr_filter) {
+  const bool is_wiener_disabled = lpf_sf->disable_wiener_filter;
+  const bool is_sgr_disabled = lpf_sf->disable_sgr_filter;
+
+  // Enable the None Loop restoration search if either the Wiener or the
+  // Self-guided filter is enabled.
+  disable_lr_filter[RESTORE_NONE] = (is_wiener_disabled && is_sgr_disabled);
+
+  disable_lr_filter[RESTORE_WIENER] = is_wiener_disabled;
+  disable_lr_filter[RESTORE_SGRPROJ] = is_sgr_disabled;
+
+  // Enable Switchable Loop restoration filter if both of the Wiener and
+  // Self-guided are enabled.
+  disable_lr_filter[RESTORE_SWITCHABLE] =
+      (is_wiener_disabled || is_sgr_disabled);
+}
+
 void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
@@ -1858,9 +1873,6 @@
 
   RestSearchCtxt rsc;
 
-  // TODO(Diksha): The buffers allocated below are used during Wiener filter
-  // processing. Hence, allocate the same when Wiener filter is enabled.
-  //
   // The buffers 'src_avg' and 'dgd_avg' are used to compute H and M buffers.
   // These buffers are required for AVX2 SIMD purpose only. Hence, allocated the
   // same if AVX2 variant of SIMD for av1_compute_stats() is enabled. The buffer
@@ -1871,27 +1883,39 @@
   rsc.dgd_avg = NULL;
   rsc.src_avg = NULL;
 #if HAVE_AVX2
-  int16_t *buf;
-  const int buf_size =
-      sizeof(*buf) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
-  CHECK_MEM_ERROR(cm, buf, (int16_t *)aom_memalign(32, buf_size));
+  // The buffers allocated below are used during Wiener filter processing of low
+  // bitdepth path. Hence, allocate the same when Wiener filter is enabled in
+  // low bitdepth path.
+  if (!cpi->sf.lpf_sf.disable_wiener_filter &&
+      !cm->seq_params->use_highbitdepth) {
+    const int buf_size = sizeof(*rsc.dgd_avg) * 6 * RESTORATION_UNITSIZE_MAX *
+                         RESTORATION_UNITSIZE_MAX;
+    CHECK_MEM_ERROR(cm, rsc.dgd_avg, (int16_t *)aom_memalign(32, buf_size));
 
-  // When LRU width isn't multiple of 16, the 256 bits load instruction used in
-  // AVX2 intrinsic can read data beyond valid LRU. Hence, in order to silence
-  // Valgrind warning this buffer is initialized with zero. Overhead due to this
-  // initialization is negligible since it is done at frame level.
-  memset(buf, 0, buf_size);
-  rsc.dgd_avg = buf;
-  rsc.src_avg = buf + 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
-  // Asserts the starting address of src_avg is always 32-bytes aligned.
-  assert(!((intptr_t)rsc.src_avg % 32));
+    // When LRU width isn't multiple of 16, the 256 bits load instruction used
+    // in AVX2 intrinsic can read data beyond valid LRU. Hence, in order to
+    // silence Valgrind warning this buffer is initialized with zero. Overhead
+    // due to this initialization is negligible since it is done at frame level.
+    memset(rsc.dgd_avg, 0, buf_size);
+    rsc.src_avg =
+        rsc.dgd_avg + 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+    // Asserts the starting address of src_avg is always 32-bytes aligned.
+    assert(!((intptr_t)rsc.src_avg % 32));
+  }
 #endif
 
   const int plane_start = AOM_PLANE_Y;
   const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y;
+
+  // Derive the flags to enable/disable Loop restoration filters based on the
+  // speed features 'disable_wiener_filter' and 'disable_sgr_filter'.
+  bool disable_lr_filter[RESTORE_TYPES] = { false };
+  const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf;
+  av1_derive_flags_for_lr_processing(lpf_sf, disable_lr_filter);
+
   for (int plane = plane_start; plane <= plane_end; ++plane) {
-    init_rsc(src, &cpi->common, x, &cpi->sf.lpf_sf, plane, rusi,
-             &cpi->trial_frame_rst, &rsc);
+    init_rsc(src, &cpi->common, x, lpf_sf, plane, rusi, &cpi->trial_frame_rst,
+             &rsc);
 
     const int plane_ntiles = ntiles[plane > 0];
     const RestorationType num_rtypes =
@@ -1901,16 +1925,16 @@
     RestorationType best_rtype = RESTORE_NONE;
 
     const int highbd = rsc.cm->seq_params->use_highbitdepth;
-    if ((plane && !cpi->sf.lpf_sf.disable_loop_restoration_chroma) ||
-        (!plane && !cpi->sf.lpf_sf.disable_loop_restoration_luma)) {
+    if ((plane && !lpf_sf->disable_loop_restoration_chroma) ||
+        (!plane && !lpf_sf->disable_loop_restoration_luma)) {
       av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
                        rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
                        highbd);
 
       for (RestorationType r = 0; r < num_rtypes; ++r) {
-        if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) &&
-            (r != force_restore_type))
-          continue;
+        // Disable Loop restoration filter based on the flags set using speed
+        // feature 'disable_wiener_filter' and 'disable_sgr_filter'.
+        if (disable_lr_filter[r]) continue;
 
         double cost = search_rest_type(&rsc, r);
 
@@ -1922,9 +1946,6 @@
     }
 
     cm->rst_info[plane].frame_restoration_type = best_rtype;
-    if (force_restore_type != RESTORE_TYPES)
-      assert(best_rtype == force_restore_type || best_rtype == RESTORE_NONE);
-
     if (best_rtype != RESTORE_NONE) {
       for (int u = 0; u < plane_ntiles; ++u) {
         copy_unit_info(best_rtype, &rusi[u], &cm->rst_info[plane].unit_info[u]);
@@ -1932,7 +1953,10 @@
     }
   }
 #if HAVE_AVX2
-  aom_free(buf);
+  if (!cpi->sf.lpf_sf.disable_wiener_filter &&
+      !cm->seq_params->use_highbitdepth) {
+    aom_free(rsc.dgd_avg);
+  }
 #endif
   aom_free(rusi);
 }
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index e05452a..1447f24 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -484,7 +484,9 @@
     sf->intra_sf.chroma_intra_pruning_with_hog = 3;
 
     sf->lpf_sf.use_coarse_filter_level_search = 0;
-    sf->lpf_sf.disable_lr_filter = 1;
+    // Disable Wiener and Self-guided Loop restoration filters.
+    sf->lpf_sf.disable_wiener_filter = true;
+    sf->lpf_sf.disable_sgr_filter = true;
 
     sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2;
 
@@ -1211,7 +1213,9 @@
         frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_FAST
                                           : MULTI_WINNER_MODE_OFF;
 
-    sf->lpf_sf.disable_lr_filter = 1;
+    // Disable Self-guided Loop restoration filter.
+    sf->lpf_sf.disable_sgr_filter = true;
+    sf->lpf_sf.disable_wiener_coeff_refine_search = true;
 
     sf->tpl_sf.prune_starting_mv = 3;
     sf->tpl_sf.use_y_only_rate_distortion = 1;
@@ -1580,7 +1584,9 @@
   sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
   sf->tx_sf.model_based_prune_tx_search_level = 0;
   sf->lpf_sf.dual_sgr_penalty_level = 1;
-  sf->lpf_sf.disable_lr_filter = 1;
+  // Disable Wiener and Self-guided Loop restoration filters.
+  sf->lpf_sf.disable_wiener_filter = true;
+  sf->lpf_sf.disable_sgr_filter = true;
   sf->rt_sf.skip_interp_filter_search = 1;
   sf->intra_sf.prune_palette_search_level = 2;
   sf->intra_sf.prune_luma_palette_size_search_level = 2;
@@ -2110,7 +2116,10 @@
   lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH;
   // Set decoder side speed feature to use less dual sgr modes
   lpf_sf->dual_sgr_penalty_level = 0;
-  lpf_sf->disable_lr_filter = 0;
+  // Enable Wiener and Self-guided Loop restoration filters by default.
+  lpf_sf->disable_wiener_filter = false;
+  lpf_sf->disable_sgr_filter = false;
+  lpf_sf->disable_wiener_coeff_refine_search = false;
   lpf_sf->use_downsampled_wiener_stats = 0;
 }
 
@@ -2291,7 +2300,10 @@
         (sf->inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
     cpi->common.seq_params->enable_dual_filter &=
         !sf->interp_sf.disable_dual_filter;
-    cpi->common.seq_params->enable_restoration &= !sf->lpf_sf.disable_lr_filter;
+    // Set the flag 'enable_restoration', if one of the Loop restoration
+    // filters (i.e., Wiener or Self-guided) is enabled.
+    cpi->common.seq_params->enable_restoration &=
+        (!sf->lpf_sf.disable_wiener_filter || !sf->lpf_sf.disable_sgr_filter);
 
     cpi->common.seq_params->enable_interintra_compound &=
         (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index e4c7c70..13b8903 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1432,8 +1432,14 @@
   // Reduce the wiener filter win size for luma
   int reduce_wiener_window_size;
 
-  // Disable loop restoration filter
-  int disable_lr_filter;
+  // Flag to disable Wiener Loop restoration filter.
+  bool disable_wiener_filter;
+
+  // Flag to disable Self-guided Loop restoration filter.
+  bool disable_sgr_filter;
+
+  // Disable the refinement search around the wiener filter coefficients.
+  bool disable_wiener_coeff_refine_search;
 
   // Whether to downsample the rows in computation of wiener stats.
   int use_downsampled_wiener_stats;