Search for deblocking filter level for superblock

Search for and pick a filter level for each superblock after the entire
frame has been encoded, then apply loop filtering using the selected
filter levels.

For now this is not compatible with LOOPFILTER_LEVEL; the Y, U and V
planes share the same filter level.

The filter level of each superblock is signaled in the bitstream.

Future work: select the filter level while each superblock is being
encoded rather than waiting until the whole frame has been encoded.

Change-Id: I46cb994ec4866abb0456146f7c3cae8813c1d52e
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index a8ec50c..5b49574 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -598,6 +598,10 @@
                                 const int dir_idx,
 #endif
                                 const MB_MODE_INFO *mbmi) {
+#if CONFIG_LPF_SB
+  return mbmi->filt_lvl;
+#endif
+
 #if CONFIG_SUPERTX
   const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
   assert(
@@ -643,6 +647,10 @@
 #else
 static uint8_t get_filter_level(const loop_filter_info_n *lfi_n,
                                 const MB_MODE_INFO *mbmi) {
+#if CONFIG_LPF_SB
+  return mbmi->filt_lvl;
+#endif
+
 #if CONFIG_SUPERTX
   const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
   assert(
@@ -3314,7 +3322,11 @@
 
 void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
                           struct macroblockd_plane planes[MAX_MB_PLANE],
-                          int start, int stop, int y_only) {
+                          int start, int stop,
+#if CONFIG_LPF_SB
+                          int col_start, int col_end,
+#endif
+                          int y_only) {
 #if CONFIG_LOOPFILTER_LEVEL
   // y_only no longer has its original meaning.
   // Here it means which plane to filter
@@ -3327,6 +3339,10 @@
   const int plane_start = 0;
   const int plane_end = num_planes;
 #endif  // CONFIG_LOOPFILTER_LEVEL
+#if !CONFIG_LPF_SB
+  const int col_start = 0;
+  const int col_end = cm->mi_cols;
+#endif  // CONFIG_LPF_SB
   int mi_row, mi_col;
   int plane;
 
@@ -3360,7 +3376,7 @@
 
   // filter all vertical edges in every 64x64 super block
   for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+    for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
       av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
       for (plane = plane_start; plane < plane_end; ++plane) {
         av1_filter_block_plane_vert(cm, plane, &planes[plane], mi_row, mi_col);
@@ -3370,7 +3386,7 @@
 
   // filter all horizontal edges in every 64x64 super block
   for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+    for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
       av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
       for (plane = plane_start; plane < plane_end; ++plane) {
         av1_filter_block_plane_horz(cm, plane, &planes[plane], mi_row, mi_col);
@@ -3453,7 +3469,12 @@
 #if CONFIG_LOOPFILTER_LEVEL
                            int frame_filter_level_r,
 #endif
-                           int y_only, int partial_frame) {
+                           int y_only, int partial_frame
+#if CONFIG_LPF_SB
+                           ,
+                           int mi_row, int mi_col
+#endif
+                           ) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
 #if CONFIG_EXT_DELTA_Q
 #if CONFIG_LOOPFILTER_LEVEL
@@ -3463,11 +3484,46 @@
 #endif
 #endif
 
+#if CONFIG_LPF_SB
+  if (partial_frame && !frame_filter_level) return;
+#else
 #if CONFIG_LOOPFILTER_LEVEL
   if (!frame_filter_level && !frame_filter_level_r) return;
 #else
   if (!frame_filter_level) return;
 #endif
+#endif  // CONFIG_LPF_SB
+#if CONFIG_LPF_SB
+  int start_mi_col;
+  int end_mi_col;
+
+  // Per-superblock deblocking experiment (CONFIG_LPF_SB):
+  // When partial_frame is 1, we are searching for the best filter level for
+  // the superblock at (mi_row, mi_col); frame_filter_level is reused as that
+  // superblock's candidate level rather than a level for the whole frame.
+  // When partial_frame is 0, this is the actual filtering pass over the frame.
+  if (partial_frame) {
+    start_mi_row = mi_row;
+    end_mi_row = mi_row + cm->mib_size;
+    start_mi_col = mi_col;
+    end_mi_col = mi_col + cm->mib_size;
+    int row, col;
+    for (row = mi_row; row < mi_row + MAX_MIB_SIZE && row < cm->mi_rows;
+         ++row) {
+      for (col = mi_col; col < mi_col + MAX_MIB_SIZE && col < cm->mi_cols;
+           ++col) {
+        cm->mi_grid_visible[row * cm->mi_stride + col]->mbmi.filt_lvl =
+            frame_filter_level;
+      }
+    }
+  } else {
+    start_mi_row = 0;
+    mi_rows_to_filter = cm->mi_rows;
+    end_mi_row = start_mi_row + mi_rows_to_filter;
+    start_mi_col = 0;
+    end_mi_col = cm->mi_cols;
+  }
+#else
   start_mi_row = 0;
   mi_rows_to_filter = cm->mi_rows;
   if (partial_frame && cm->mi_rows > 8) {
@@ -3481,6 +3537,7 @@
 #else
   av1_loop_filter_frame_init(cm, frame_filter_level, frame_filter_level);
 #endif
+#endif  // CONFIG_LPF_SB
 
 #if CONFIG_EXT_DELTA_Q
 #if CONFIG_LOOPFILTER_LEVEL
@@ -3490,7 +3547,14 @@
   cm->lf.filter_level = frame_filter_level;
 #endif
 #endif
+
+#if CONFIG_LPF_SB
+  av1_loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row,
+                       start_mi_col, end_mi_col, y_only);
+#else
   av1_loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only);
+#endif  // CONFIG_LPF_SB
+
 #if CONFIG_EXT_DELTA_Q
 #if CONFIG_LOOPFILTER_LEVEL
   cm->lf.filter_level[0] = orig_filter_level[0];
@@ -3514,7 +3578,11 @@
 
 int av1_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
   (void)unused;
+#if !CONFIG_LPF_SB
   av1_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                        lf_data->start, lf_data->stop, lf_data->y_only);
+#else
+  (void)lf_data;
+#endif  // CONFIG_LPF_SB
   return 1;
 }
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index 6701e97..10498c0 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -140,6 +140,19 @@
 void av1_loop_filter_frame_init(struct AV1Common *cm, int default_filt_lvl,
                                 int default_filt_lvl_r);
 
+#if CONFIG_LPF_SB
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+                           struct macroblockd *mbd, int filter_level,
+                           int y_only, int partial_frame, int mi_row,
+                           int mi_col);
+
+// Filter mi rows [start, stop) x mi cols [col_start, col_end) in frame_buffer.
+void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
+                          struct AV1Common *cm,
+                          struct macroblockd_plane planes[MAX_MB_PLANE],
+                          int start, int stop, int col_start, int col_end,
+                          int y_only);
+#else
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
                            struct macroblockd *mbd, int filter_level,
 #if CONFIG_LOOPFILTER_LEVEL
@@ -152,6 +165,7 @@
                           struct AV1Common *cm,
                           struct macroblockd_plane planes[MAX_MB_PLANE],
                           int start, int stop, int y_only);
+#endif  // CONFIG_LPF_SB
 
 typedef struct LoopFilterWorkerData {
   YV12_BUFFER_CONFIG *frame_buffer;
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 5262d24..c381e3a 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -494,6 +494,9 @@
 #endif
 
   BOUNDARY_TYPE boundary_info;
+#if CONFIG_LPF_SB
+  uint8_t filt_lvl;
+#endif
 } MB_MODE_INFO;
 
 typedef struct MODE_INFO {
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index e538502..9ba4527 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -520,6 +520,9 @@
 #if CONFIG_LV_MAP
   LV_MAP_CTX_TABLE coeff_ctx_table;
 #endif
+#if CONFIG_LPF_SB
+  int final_lpf_encode;
+#endif
 } AV1_COMMON;
 
 #if CONFIG_REFERENCE_BUFFER
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 0447d52..00501eb 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -2530,6 +2530,22 @@
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 #endif  // CONFIG_EXT_PARTITION_TYPES
 
+#if CONFIG_LPF_SB
+  if (bsize == cm->sb_size) {
+    int filt_lvl = aom_read_literal(r, 6, ACCT_STR);
+    int row, col;
+    // set filter level for each mbmi
+    for (row = mi_row; row < mi_row + MAX_MIB_SIZE && row < cm->mi_rows;
+         ++row) {
+      for (col = mi_col; col < mi_col + MAX_MIB_SIZE && col < cm->mi_cols;
+           ++col) {
+        cm->mi_grid_visible[row * cm->mi_stride + col]->mbmi.filt_lvl =
+            filt_lvl;
+      }
+    }
+  }
+#endif
+
 #if CONFIG_CDEF
   if (bsize == cm->sb_size) {
     int width_step = mi_size_wide[BLOCK_64X64];
@@ -2806,6 +2822,7 @@
 
 static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
   struct loopfilter *lf = &cm->lf;
+#if !CONFIG_LPF_SB
 #if CONFIG_LOOPFILTER_LEVEL
   lf->filter_level[0] = aom_rb_read_literal(rb, 6);
   lf->filter_level[1] = aom_rb_read_literal(rb, 6);
@@ -2816,6 +2833,7 @@
 #else
   lf->filter_level = aom_rb_read_literal(rb, 6);
 #endif
+#endif  // CONFIG_LPF_SB
   lf->sharpness_level = aom_rb_read_literal(rb, 3);
 
   // Read in loop filter deltas applied at the MB level based on mode or ref
@@ -3899,6 +3917,10 @@
 
 #if CONFIG_VAR_TX || CONFIG_CB4X4
 // Loopfilter the whole frame.
+#if CONFIG_LPF_SB
+  av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
+                        cm->lf.filter_level, 0, 0, 0, 0);
+#else
 #if CONFIG_LOOPFILTER_LEVEL
   if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
     av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
@@ -3915,6 +3937,7 @@
     av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
                           cm->lf.filter_level, 0, 0);
 #endif  // CONFIG_LOOPFILTER_LEVEL
+#endif  // CONFIG_LPF_SB
 #else
 #if CONFIG_PARALLEL_DEBLOCKING
   // Loopfilter all rows in the frame in the frame.
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 6df3f2d..88a31bb 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -3147,6 +3147,15 @@
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 #endif  // CONFIG_EXT_PARTITION_TYPES
 
+#if CONFIG_LPF_SB
+  // send filter level for each superblock (64x64)
+  if (bsize == cm->sb_size) {
+    aom_write_literal(
+        w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.filt_lvl,
+        6);
+  }
+#endif
+
 #if CONFIG_CDEF
   if (bsize == cm->sb_size && cm->cdef_bits != 0 && !cm->all_lossless) {
     int width_step = mi_size_wide[BLOCK_64X64];
@@ -3411,6 +3420,7 @@
   struct loopfilter *lf = &cm->lf;
 
 // Encode the loop filter level and type
+#if !CONFIG_LPF_SB
 #if CONFIG_LOOPFILTER_LEVEL
   aom_wb_write_literal(wb, lf->filter_level[0], 6);
   aom_wb_write_literal(wb, lf->filter_level[1], 6);
@@ -3420,7 +3430,8 @@
   }
 #else
   aom_wb_write_literal(wb, lf->filter_level, 6);
-#endif
+#endif  // CONFIG_LOOPFILTER_LEVEL
+#endif  // CONFIG_LPF_SB
   aom_wb_write_literal(wb, lf->sharpness_level, 3);
 
   // Write out loop filter deltas applied at the MB level based on mode or
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 0fcc8e6..a8d8989 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4425,13 +4425,19 @@
     cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer);
   }
 
+#if !CONFIG_LPF_SB
 #if CONFIG_LOOPFILTER_LEVEL
   if (lf->filter_level[0] || lf->filter_level[1])
 #else
   if (lf->filter_level > 0)
 #endif
+#endif  // CONFIG_LPF_SB
   {
 #if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
+#if CONFIG_LPF_SB
+    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0, 0,
+                          0);
+#else
 #if CONFIG_LOOPFILTER_LEVEL
     av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level[0],
                           lf->filter_level[1], 0, 0);
@@ -4443,6 +4449,7 @@
 #else
     av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
 #endif  // CONFIG_LOOPFILTER_LEVEL
+#endif  // CONFIG_LPF_SB
 #else
     if (cpi->num_workers > 1)
       av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index a76f3a8..820633c 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -14,8 +14,8 @@
 
 #include "./aom_scale_rtcd.h"
 
-#include "aom_dsp/psnr.h"
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/psnr.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 
@@ -27,6 +27,53 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/picklpf.h"
 
+#if CONFIG_LPF_SB
+#if CONFIG_HIGHBITDEPTH
+static int64_t compute_sb_y_sse_highbd(const YV12_BUFFER_CONFIG *src,
+                                       const YV12_BUFFER_CONFIG *frame,
+                                       int mi_row, int mi_col) {
+  int64_t sse = 0;
+  const int row = mi_row * MI_SIZE;
+  const int col = mi_col * MI_SIZE;
+  const uint16_t *src_y =
+      CONVERT_TO_SHORTPTR(src->y_buffer) + row * src->y_stride + col;
+  const uint16_t *frame_y =
+      CONVERT_TO_SHORTPTR(frame->y_buffer) + row * frame->y_stride + col;
+  int x, y;
+  for (y = 0; y < MAX_MIB_SIZE * MI_SIZE; ++y) {
+    for (x = 0; x < MAX_MIB_SIZE * MI_SIZE; ++x) {
+      const int diff = src_y[x] - frame_y[x];
+      sse += diff * diff;
+    }
+    src_y += src->y_stride;
+    frame_y += frame->y_stride;
+  }
+  return sse;
+}
+#endif
+
+static int64_t compute_sb_y_sse(const YV12_BUFFER_CONFIG *src,
+                                const YV12_BUFFER_CONFIG *frame, int mi_row,
+                                int mi_col) {
+  int64_t sse = 0;
+  const int row = mi_row * MI_SIZE;
+  const int col = mi_col * MI_SIZE;
+  const uint8_t *src_y = src->y_buffer + row * src->y_stride + col;
+  const uint8_t *frame_y = frame->y_buffer + row * frame->y_stride + col;
+  int x, y;
+  for (y = 0; y < MAX_MIB_SIZE * MI_SIZE; ++y) {
+    for (x = 0; x < MAX_MIB_SIZE * MI_SIZE; ++x) {
+      const int diff = src_y[x] - frame_y[x];
+      sse += diff * diff;
+    }
+    src_y += src->y_stride;
+    frame_y += frame->y_stride;
+  }
+  return sse;
+}
+#endif  // CONFIG_LPF_SB
+
+#if !CONFIG_LPF_SB
 static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
                             YV12_BUFFER_CONFIG *dst_bc, int plane) {
   switch (plane) {
@@ -36,6 +83,7 @@
     default: assert(plane >= 0 && plane <= 2); break;
   }
 }
+#endif  // CONFIG_LPF_SB
 
 int av1_get_max_filter_level(const AV1_COMP *cpi) {
   if (cpi->oxcf.pass == 2) {
@@ -46,6 +94,137 @@
   }
 }
 
+#if CONFIG_LPF_SB
+// TODO(chengchen): reduce memory usage by copy superblock instead of frame
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+                                AV1_COMP *const cpi, int filt_level,
+                                int partial_frame, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  int64_t filt_err;
+
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
+  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
+                        partial_frame, mi_row, mi_col);
+#else
+  if (cpi->num_workers > 1)
+    av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+                             filt_level, 1, partial_frame, cpi->workers,
+                             cpi->num_workers, &cpi->lf_row_sync);
+  else
+    av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+                          1, partial_frame);
+#endif
+
+#if CONFIG_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    filt_err = compute_sb_y_sse_highbd(sd, cm->frame_to_show, mi_row, mi_col);
+  } else {
+    filt_err = compute_sb_y_sse(sd, cm->frame_to_show, mi_row, mi_col);
+  }
+#else
+  filt_err = compute_sb_y_sse(sd, cm->frame_to_show, mi_row, mi_col);
+#endif  // CONFIG_HIGHBITDEPTH
+
+  // Re-instate the unfiltered frame
+  aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+
+  return filt_err;
+}
+
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+                               int partial_frame, double *best_cost_ret,
+                               int mi_row, int mi_col, int last_lvl) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const struct loopfilter *const lf = &cm->lf;
+  const int min_filter_level = 0;
+  const int max_filter_level = av1_get_max_filter_level(cpi);
+  int filt_direction = 0;
+  int64_t best_err;
+  int filt_best;
+  MACROBLOCK *x = &cpi->td.mb;
+
+  // Start the search at the level picked for the previous superblock
+  // (last_lvl), clamped to the valid range.
+  int filt_mid = clamp(last_lvl, min_filter_level, max_filter_level);
+  (void)lf;
+  int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+  // Sum squared error at each filter level
+  int64_t ss_err[MAX_LOOP_FILTER + 1];
+
+  // Set each entry to -1
+  memset(ss_err, 0xFF, sizeof(ss_err));
+
+  //  Make a copy of the unfiltered / processed recon buffer
+  aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+
+  best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, mi_row, mi_col);
+  filt_best = filt_mid;
+  ss_err[filt_mid] = best_err;
+
+  while (filter_step > 0) {
+    const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
+    const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
+
+    // Bias against raising loop filter in favor of lowering it.
+    int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+
+    if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
+      bias = (bias * cpi->twopass.section_intra_rating) / 20;
+
+    // yx, bias less for large block size
+    if (cm->tx_mode != ONLY_4X4) bias >>= 1;
+    // NOTE(review): the next line discards the bias computed above.
+    bias = 0;
+
+    if (filt_direction <= 0 && filt_low != filt_mid) {
+      // Get Low filter error score
+      if (ss_err[filt_low] < 0) {
+        ss_err[filt_low] =
+            try_filter_frame(sd, cpi, filt_low, partial_frame, mi_row, mi_col);
+      }
+      // If value is close to the best so far then bias towards a lower loop
+      // filter value.
+      if (ss_err[filt_low] < (best_err + bias)) {
+        // Was it actually better than the previous best?
+        if (ss_err[filt_low] < best_err) {
+          best_err = ss_err[filt_low];
+        }
+        filt_best = filt_low;
+      }
+    }
+
+    // Now look at filt_high
+    if (filt_direction >= 0 && filt_high != filt_mid) {
+      if (ss_err[filt_high] < 0) {
+        ss_err[filt_high] =
+            try_filter_frame(sd, cpi, filt_high, partial_frame, mi_row, mi_col);
+      }
+      // If value is significantly better than previous best, bias added against
+      // raising filter value
+      if (ss_err[filt_high] < (best_err - bias)) {
+        best_err = ss_err[filt_high];
+        filt_best = filt_high;
+      }
+    }
+
+    // Halve the step size if the best filter value was the same as last time
+    if (filt_best == filt_mid) {
+      filter_step /= 2;
+      filt_direction = 0;
+    } else {
+      filt_direction = (filt_best < filt_mid) ? -1 : 1;
+      filt_mid = filt_best;
+    }
+  }
+
+  // Update best error
+  best_err = ss_err[filt_best];
+
+  if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err);
+  return filt_best;
+}
+
+#else  // CONFIG_LPF_SB
 static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
                                 AV1_COMP *const cpi, int filt_level,
                                 int partial_frame
@@ -220,6 +399,7 @@
   if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err);
   return filt_best;
 }
+#endif  // CONFIG_LPF_SB
 
 void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                            LPF_PICK_METHOD method) {
@@ -270,6 +450,25 @@
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
 #endif
   } else {
+#if CONFIG_LPF_SB
+    int mi_row, mi_col;
+    int last_lvl = 0;
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MAX_MIB_SIZE) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+        int lvl =
+            search_filter_level(sd, cpi, 1, NULL, mi_row, mi_col, last_lvl);
+        int row, col;
+        for (row = mi_row; row < mi_row + MAX_MIB_SIZE && row < cm->mi_rows;
+             ++row) {
+          for (col = mi_col; col < mi_col + MAX_MIB_SIZE && col < cm->mi_cols;
+               ++col) {
+            cm->mi_grid_visible[row * cm->mi_stride + col]->mbmi.filt_lvl = lvl;
+          }
+        }
+        last_lvl = lvl;
+      }
+    }
+#else
 #if CONFIG_LOOPFILTER_LEVEL
     lf->filter_level[0] = lf->filter_level[1] = search_filter_level(
         sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 2);
@@ -286,5 +485,6 @@
     lf->filter_level =
         search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL);
 #endif  // CONFIG_LOOPFILTER_LEVEL
+#endif  // CONFIG_LPF_SB
   }
 }