Implement new non-local means algorithm.
Apply the new non-local means filtering for 480p and above.
An experiment flag is set for easy maintainance and modification.
The most recent performance tests on 60 frames show:
Vbr midres: avg_psnr ovr_psnr ssim vmaf
Spd0 -0.133 -0.280 -0.144 -0.218
Spd1 -0.236 -0.377 -0.245 -0.440
Spd2 -0.273 -0.415 -0.299 -0.436
Vbr hdres: avg_psnr ovr_psnr ssim vmaf
Spd0 -0.235 -0.445 -0.302 -0.768
Spd1 -0.347 -0.565 -0.437 -0.870
Spd2 -0.416 -0.630 -0.488 -0.827
q midres: avg_psnr ovr_psnr ssim vmaf
Spd0 -0.077 -0.197 -0.118 -0.253
Spd1 -0.084 -0.196 -0.141 -0.262
Spd2 -0.107 -0.224 -0.107 -0.188
q hdres: avg_psnr ovr_psnr ssim vmaf
Spd0 0.107 -0.092 -0.123 -0.878
Spd1 0.003 -0.198 -0.219 -1.003
Spd2 -0.015 -0.216 -0.187 -0.901
Test head is 23bc1ee51f81334fef4189d1c6e3156a36407b5c.
STATS_CHANGED
Change-Id: Ia8cd2e31d75a3cfcef607f454ee873d812bde323
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 54127bc..edca3e2 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -34,6 +34,11 @@
#include "aom_ports/aom_timer.h"
#include "aom_scale/aom_scale.h"
+#define EXPERIMENT_TEMPORAL_FILTER 1
+#define WINDOW_LENGTH 2
+#define WINDOW_SIZE 25
+#define SCALE 1000
+
#define EDGE_THRESHOLD 50
#define SQRT_PI_BY_2 1.25331413732
@@ -144,7 +149,12 @@
unsigned int block_height,
int filter_weight, uint32_t *accumulator,
uint16_t *count) {
+#if EXPERIMENT_TEMPORAL_FILTER
+ (void)filter_weight;
+ const int modifier = SCALE;
+#else
const int modifier = filter_weight * 16;
+#endif // EXPERIMENT_TEMPORAL_FILTER
unsigned int i, j, k = 0;
assert(filter_weight == 2);
@@ -162,7 +172,12 @@
const uint8_t *pred8, int buf_stride, unsigned int block_width,
unsigned int block_height, int filter_weight, uint32_t *accumulator,
uint16_t *count) {
+#if EXPERIMENT_TEMPORAL_FILTER
+ (void)filter_weight;
+ const int modifier = SCALE;
+#else
const int modifier = filter_weight * 16;
+#endif // EXPERIMENT_TEMPORAL_FILTER
const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
unsigned int i, j, k = 0;
assert(filter_weight == 2);
@@ -655,6 +670,204 @@
}
}
+#if EXPERIMENT_TEMPORAL_FILTER
+void av1_temporal_filter_plane_c(uint8_t *frame1, unsigned int stride,
+ uint8_t *frame2, unsigned int stride2,
+ int block_width, int block_height,
+ int strength, double sigma, int decay_control,
+ const int *blk_fw, int use_32x32,
+ unsigned int *accumulator, uint16_t *count) {
+ (void)strength;
+ (void)blk_fw;
+ (void)use_32x32;
+ const double decay = decay_control * exp(1 - sigma);
+ const double h = decay * sigma;
+ const double beta = 1.0;
+ for (int i = 0, k = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+
+ int diff_sse = 0;
+ for (int idy = -WINDOW_LENGTH; idy <= WINDOW_LENGTH; ++idy) {
+ for (int idx = -WINDOW_LENGTH; idx <= WINDOW_LENGTH; ++idx) {
+ int row = i + idy;
+ int col = j + idx;
+ if (row < 0) row = 0;
+ if (row >= block_height) row = block_height - 1;
+ if (col < 0) col = 0;
+ if (col >= block_width) col = block_width - 1;
+
+ int diff = frame1[row * (int)stride + col] -
+ frame2[row * (int)stride2 + col];
+ diff_sse += diff * diff;
+ }
+ }
+ diff_sse /= WINDOW_SIZE;
+
+ double w = exp(-diff_sse / (2 * beta * h * h));
+ const int weight = (int)(w * SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+}
+
+void av1_highbd_temporal_filter_plane_c(
+ uint8_t *frame1_8bit, unsigned int stride, uint8_t *frame2_8bit,
+ unsigned int stride2, int block_width, int block_height, int strength,
+ double sigma, int decay_control, const int *blk_fw, int use_32x32,
+ unsigned int *accumulator, uint16_t *count) {
+ (void)strength;
+ (void)blk_fw;
+ (void)use_32x32;
+ uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8bit);
+ uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8bit);
+ const double decay = decay_control * exp(1 - sigma);
+ const double h = decay * sigma;
+ const double beta = 1.0;
+ for (int i = 0, k = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+
+ int diff_sse = 0;
+ for (int idy = -WINDOW_LENGTH; idy <= WINDOW_LENGTH; ++idy) {
+ for (int idx = -WINDOW_LENGTH; idx <= WINDOW_LENGTH; ++idx) {
+ int row = i + idy;
+ int col = j + idx;
+ if (row < 0) row = 0;
+ if (row >= block_height) row = block_height - 1;
+ if (col < 0) col = 0;
+ if (col >= block_width) col = block_width - 1;
+
+ int diff = frame1[row * (int)stride + col] -
+ frame2[row * (int)stride2 + col];
+ diff_sse += diff * diff;
+ }
+ }
+ diff_sse /= WINDOW_SIZE;
+
+ double w = exp(-diff_sse / (2 * beta * h * h));
+ const int weight = (int)(w * SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+}
+
+void apply_temporal_filter_block(YV12_BUFFER_CONFIG *frame, MACROBLOCKD *mbd,
+ int mb_y_src_offset, int mb_uv_src_offset,
+ int mb_uv_width, int mb_uv_height,
+ int num_planes, uint8_t *predictor,
+ int frame_height, int strength, double sigma,
+ int *blk_fw, int use_32x32,
+ unsigned int *accumulator, uint16_t *count) {
+ const int is_hbd = is_cur_buf_hbd(mbd);
+ // High bitdepth
+ if (is_hbd) {
+ if (frame_height >= 480) {
+ // Apply frame size dependent non-local means filtering.
+ int decay_control;
+ // The decay is obtained empirically, subject to better tuning.
+ if (frame_height >= 720) {
+ decay_control = 7;
+ } else if (frame_height >= 480) {
+ decay_control = 5;
+ } else {
+ decay_control = 3;
+ }
+ av1_highbd_temporal_filter_plane_c(frame->y_buffer + mb_y_src_offset,
+ frame->y_stride, predictor, BW, BW, BH,
+ strength, sigma, decay_control, blk_fw,
+ use_32x32, accumulator, count);
+ if (num_planes > 1) {
+ av1_highbd_temporal_filter_plane_c(
+ frame->u_buffer + mb_uv_src_offset, frame->uv_stride,
+ predictor + BLK_PELS, mb_uv_width, mb_uv_width, mb_uv_height,
+ strength, sigma, decay_control, blk_fw, use_32x32,
+ accumulator + BLK_PELS, count + BLK_PELS);
+ av1_highbd_temporal_filter_plane_c(
+ frame->v_buffer + mb_uv_src_offset, frame->uv_stride,
+ predictor + (BLK_PELS << 1), mb_uv_width, mb_uv_width, mb_uv_height,
+ strength, sigma, decay_control, blk_fw, use_32x32,
+ accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+ }
+ } else {
+ // Apply original non-local means filtering for small resolution
+ const int adj_strength = strength + 2 * (mbd->bd - 8);
+ if (num_planes <= 1) {
+ // Single plane case
+ av1_highbd_temporal_filter_apply_c(
+ frame->y_buffer + mb_y_src_offset, frame->y_stride, predictor, BW,
+ BH, adj_strength, blk_fw, use_32x32, accumulator, count);
+ } else {
+ // Process 3 planes together.
+ av1_highbd_apply_temporal_filter(
+ frame->y_buffer + mb_y_src_offset, frame->y_stride, predictor, BW,
+ frame->u_buffer + mb_uv_src_offset,
+ frame->v_buffer + mb_uv_src_offset, frame->uv_stride,
+ predictor + BLK_PELS, predictor + (BLK_PELS << 1), mb_uv_width, BW,
+ BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y,
+ adj_strength, blk_fw, use_32x32, accumulator, count,
+ accumulator + BLK_PELS, count + BLK_PELS,
+ accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+ }
+ }
+ return;
+ }
+
+ // Low bitdepth
+ if (frame_height >= 480) {
+ // Apply frame size dependent non-local means filtering.
+ int decay_control;
+ // The decay is obtained empirically, subject to better tuning.
+ if (frame_height >= 720) {
+ decay_control = 7;
+ } else if (frame_height >= 480) {
+ decay_control = 5;
+ } else {
+ decay_control = 3;
+ }
+ av1_temporal_filter_plane_c(frame->y_buffer + mb_y_src_offset,
+ frame->y_stride, predictor, BW, BW, BH,
+ strength, sigma, decay_control, blk_fw,
+ use_32x32, accumulator, count);
+ if (num_planes > 1) {
+ av1_temporal_filter_plane_c(
+ frame->u_buffer + mb_uv_src_offset, frame->uv_stride,
+ predictor + BLK_PELS, mb_uv_width, mb_uv_width, mb_uv_height,
+ strength, sigma, decay_control, blk_fw, use_32x32,
+ accumulator + BLK_PELS, count + BLK_PELS);
+ av1_temporal_filter_plane_c(
+ frame->v_buffer + mb_uv_src_offset, frame->uv_stride,
+ predictor + (BLK_PELS << 1), mb_uv_width, mb_uv_width, mb_uv_height,
+ strength, sigma, decay_control, blk_fw, use_32x32,
+ accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+ }
+ } else {
+ // Apply original non-local means filtering for small resolution
+ if (num_planes <= 1) {
+ // Single plane case
+ av1_temporal_filter_apply_c(frame->y_buffer + mb_y_src_offset,
+ frame->y_stride, predictor, BW, BH, strength,
+ blk_fw, use_32x32, accumulator, count);
+ } else {
+ // Process 3 planes together.
+ av1_apply_temporal_filter(
+ frame->y_buffer + mb_y_src_offset, frame->y_stride, predictor, BW,
+ frame->u_buffer + mb_uv_src_offset,
+ frame->v_buffer + mb_uv_src_offset, frame->uv_stride,
+ predictor + BLK_PELS, predictor + (BLK_PELS << 1), mb_uv_width, BW,
+ BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y,
+ strength, blk_fw, use_32x32, accumulator, count,
+ accumulator + BLK_PELS, count + BLK_PELS,
+ accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+ }
+ }
+}
+#endif // EXPERIMENT_TEMPORAL_FILTER
+
static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
uint8_t *arf_frame_buf,
uint8_t *frame_ptr_buf,
@@ -779,7 +992,7 @@
static void temporal_filter_iterate_c(AV1_COMP *cpi,
YV12_BUFFER_CONFIG **frames,
int frame_count, int alt_ref_index,
- int strength,
+ int strength, double sigma,
struct scale_factors *ref_scale_factors) {
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
@@ -944,8 +1157,13 @@
}
} else {
if (is_hbd) {
+#if EXPERIMENT_TEMPORAL_FILTER
+ apply_temporal_filter_block(
+ f, mbd, mb_y_src_offset, mb_uv_src_offset, mb_uv_width,
+ mb_uv_height, num_planes, predictor, cm->height, strength,
+ sigma, blk_fw, use_32x32, accumulator, count);
+#else
const int adj_strength = strength + 2 * (mbd->bd - 8);
-
if (num_planes <= 1) {
// Single plane case
av1_highbd_temporal_filter_apply_c(
@@ -964,7 +1182,14 @@
count + BLK_PELS, accumulator + (BLK_PELS << 1),
count + (BLK_PELS << 1));
}
+#endif // EXPERIMENT_TEMPORAL_FILTER
} else {
+#if EXPERIMENT_TEMPORAL_FILTER
+ apply_temporal_filter_block(
+ f, mbd, mb_y_src_offset, mb_uv_src_offset, mb_uv_width,
+ mb_uv_height, num_planes, predictor, cm->height, strength,
+ sigma, blk_fw, use_32x32, accumulator, count);
+#else
if (num_planes <= 1) {
// Single plane case
av1_temporal_filter_apply_c(
@@ -983,6 +1208,7 @@
count + BLK_PELS, accumulator + (BLK_PELS << 1),
count + (BLK_PELS << 1));
}
+#endif // EXPERIMENT_TEMPORAL_FILTER
}
}
}
@@ -1159,7 +1385,8 @@
// Apply buffer limits and context specific adjustments to arnr filter.
static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
- int *arnr_frames, int *arnr_strength) {
+ int *arnr_frames, int *arnr_strength,
+ double *sigma) {
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
const int frames_after_arf =
av1_lookahead_depth(cpi->lookahead) - distance - 1;
@@ -1194,10 +1421,12 @@
noiselevel = highbd_estimate_noise(
buf->img.y_buffer, buf->img.y_crop_width, buf->img.y_crop_height,
buf->img.y_stride, mbd->bd, EDGE_THRESHOLD);
+ *sigma = noiselevel;
} else {
noiselevel = estimate_noise(buf->img.y_buffer, buf->img.y_crop_width,
buf->img.y_crop_height, buf->img.y_stride,
EDGE_THRESHOLD);
+ *sigma = noiselevel;
}
int adj_strength = oxcf->arnr_strength;
if (noiselevel > 0) {
@@ -1249,6 +1478,7 @@
YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
const GF_GROUP *const gf_group = &cpi->gf_group;
int rdmult = 0;
+ double sigma = 0;
// Apply context specific adjustments to the arnr filter parameters.
if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
@@ -1258,8 +1488,8 @@
strength = 0;
frames_to_blur = 1;
} else {
- adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur,
- &strength);
+ adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength,
+ &sigma);
}
int which_arf = gf_group->arf_update_idx[gf_group->index];
@@ -1299,5 +1529,5 @@
av1_initialize_cost_tables(&cpi->common, &cpi->td.mb);
temporal_filter_iterate_c(cpi, frames, frames_to_blur,
- frames_to_blur_backward, strength, &sf);
+ frames_to_blur_backward, strength, sigma, &sf);
}