Consider motion search error in plane-wise filter.
In the plane-wise temporal filtering strategy, the filter weight is
assigned based on the motion search result. Currently, only a small
neighborhood (i.e., a 5x5 window) is considered for each individual
pixel. However, the motion search result of the entire block also
reflects the search accuracy.
This CL improves the plane-wise strategy by considering both local (5x5
window) and global (the entire filtering block, or the enclosing
sub-block when sub-blocks are used) information when assigning the
filter weight. The window-wise error plays a more important role than
the block-wise error (the block-wise error is scaled by 1/10 before the
two are summed), but both are involved in the filtering process.
NOTE: This CL only affects the performance on midres and hdres datasets.
Experimental results:
Under Speed-4 (two-pass mode):
          avg PSNR   ovr PSNR   SSIM
midres    -0.056     -0.015     -0.026
midres2   -0.029      0.000     -0.000
hdres     -0.085     -0.076     -0.049
hdres2    -0.050     -0.028     -0.017
Under Speed-1 (two-pass mode):
          avg PSNR   ovr PSNR   SSIM
midres    -0.129     -0.033     -0.036
midres2   -0.074     -0.044     -0.026
hdres     -0.106     -0.091     -0.098
hdres2    -0.070     -0.055     -0.057
STATS_CHANGED
Change-Id: Ie883a0105b1bdf4cd41b40b2e5be7f2f7178dd50
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 3dfc6b5..f78bde4 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -289,7 +289,7 @@
}
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
- add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+ add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const uint8_t *pred, uint32_t *accum, uint16_t *count";
specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/;
}
add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index f827356..4592813 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -642,6 +642,9 @@
// num_planes: Number of planes in the frame.
// noise_levels: Pointer to the noise levels of the to-filter frame, estimated
// with each plane (in Y, U, V order).
+// use_subblock: Whether to use 4 sub-blocks to replace the original block.
+// block_mse: Motion search error (MSE) for the entire block.
+// subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
// pred: Pointer to the well-built predictors.
// accum: Pointer to the pixel-wise accumulator for filtering.
// count: Pointer to the pixel-wise counter fot filtering.
@@ -651,7 +654,8 @@
void av1_apply_temporal_filter_planewise_c(
const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
- const int num_planes, const double *noise_levels, const uint8_t *pred,
+ const int num_planes, const double *noise_levels, const int use_subblock,
+ const int block_mse, const int *subblock_mses, const uint8_t *pred,
uint32_t *accum, uint16_t *count) {
assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
@@ -733,18 +737,25 @@
}
}
+ // Scale down the difference for high bit depth input.
+ if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
+ const double window_error = (double)(sum_square_diff) / num_ref_pixels;
+ const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+ const double block_error =
+ (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
// Control factor for non-local mean approach.
const double r =
(double)decay_control * (0.7 + log(noise_levels[plane] + 1.0));
- const int idx = plane_offset + pred_idx; // Index with plane shift.
- const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
- // Scale down the difference for high bit depth input.
- if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
- const double scaled_diff = AOMMAX(
- -(double)(sum_square_diff / num_ref_pixels) / (2 * r * r), -15.0);
+ // Compute filter weight.
+ const double scaled_diff =
+ AOMMAX(-(window_error + block_error / 10) / (2 * r * r), -15.0);
const int adjusted_weight =
(int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+
+ const int idx = plane_offset + pred_idx; // Index with plane shift.
+ const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
accum[idx] += adjusted_weight * pred_value;
count[idx] += adjusted_weight;
@@ -779,6 +790,8 @@
// noise_levels: Pointer to the noise levels of the to-filter frame, estimated
// with each plane (in Y, U, V order). (Used in plane-wise
// strategy)
+// block_mse: Motion search error (MSE) for the entire block.
+// subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
// pred: Pointer to the well-built predictors.
// accum: Pointer to the pixel-wise accumulator for filtering.
// count: Pointer to the pixel-wise counter fot filtering.
@@ -790,20 +803,22 @@
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
const int num_planes, const int use_planewise_strategy, const int strength,
const int use_subblock, const int *subblock_filter_weights,
- const double *noise_levels, const uint8_t *pred, uint32_t *accum,
- uint16_t *count) {
+ const double *noise_levels, const int block_mse, const int *subblock_mses,
+ const uint8_t *pred, uint32_t *accum, uint16_t *count) {
assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
if (use_planewise_strategy) { // Commonly used for high-resolution video.
// TODO(any): avx2 and sse2 version should also support high bit-depth.
if (is_frame_high_bitdepth(frame_to_filter)) {
- av1_apply_temporal_filter_planewise_c(frame_to_filter, mbd, block_size,
- mb_row, mb_col, num_planes,
- noise_levels, pred, accum, count);
+ av1_apply_temporal_filter_planewise_c(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, use_subblock, block_mse, subblock_mses, pred, accum,
+ count);
} else {
av1_apply_temporal_filter_planewise(frame_to_filter, mbd, block_size,
mb_row, mb_col, num_planes,
- noise_levels, pred, accum, count);
+ noise_levels, use_subblock, block_mse,
+ subblock_mses, pred, accum, count);
}
} else { // Commonly used for low-resolution video.
if (subblock_filter_weights[0] == 0 && subblock_filter_weights[1] == 0 &&
@@ -1014,7 +1029,8 @@
av1_apply_temporal_filter_others(
frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
use_planewise_strategy, strength, use_subblock,
- subblock_filter_weights, noise_levels, pred, accum, count);
+ subblock_filter_weights, noise_levels, block_mse, subblock_mses,
+ pred, accum, count);
}
}
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index 1d10179..07e14f7 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -50,7 +50,7 @@
vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1);
_mm256_storeu_si256((__m256i *)(dst), vsqdiff1);
- // Set zero to unitialized memory to avoid uninitialized loads later
+ // Set zero to uninitialized memory to avoid uninitialized loads later
*(uint32_t *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128());
src1 += stride, src2 += stride2;
@@ -84,7 +84,7 @@
vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
_mm256_storeu_si256((__m256i *)(dst), vres1);
_mm256_storeu_si256((__m256i *)(dst + 16), vres2);
- // Set zero to unitialized memory to avoid uninitialized loads later
+ // Set zero to uninitialized memory to avoid uninitialized loads later
*(uint32_t *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128());
src1 += stride;
@@ -130,7 +130,8 @@
static void apply_temporal_filter_planewise(
const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
const unsigned int stride2, const int block_width, const int block_height,
- const double sigma, const int decay_control, unsigned int *accumulator,
+ const double sigma, const int decay_control, const int use_subblock,
+ const int block_mse, const int *subblock_mses, unsigned int *accumulator,
uint16_t *count, uint16_t *luma_sq_error, uint16_t *chroma_sq_error,
int plane, int ss_x_shift, int ss_y_shift) {
assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
@@ -217,8 +218,15 @@
}
}
}
+
+ const double window_error = (double)(diff_sse) / num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error =
+ (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
const double scaled_diff =
- AOMMAX(-(double)(diff_sse / num_ref_pixels) / (2 * h * h), -15.0);
+ AOMMAX(-(window_error + block_error / 10) / (2 * h * h), -15.0);
const int adjusted_weight =
(int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
@@ -231,7 +239,8 @@
void av1_apply_temporal_filter_planewise_avx2(
const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
- const int num_planes, const double *noise_levels, const uint8_t *pred,
+ const int num_planes, const double *noise_levels, const int use_subblock,
+ const int block_mse, const int *subblock_mses, const uint8_t *pred,
uint32_t *accum, uint16_t *count) {
const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
if (is_high_bitdepth) {
@@ -265,9 +274,9 @@
apply_temporal_filter_planewise(
ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
- noise_levels[plane], decay_control, accum + mb_pels * plane,
- count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
- ss_x_shift, ss_y_shift);
+ noise_levels[plane], decay_control, use_subblock, block_mse,
+ subblock_mses, accum + mb_pels * plane, count + mb_pels * plane,
+ luma_sq_error, chroma_sq_error, plane, ss_x_shift, ss_y_shift);
}
if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
}
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 5d8e5e6..4fc8738 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -41,7 +41,7 @@
for (int i = 0; i < block_height; i++) {
for (int j = 0; j < block_width; j += 16) {
- // Set zero to unitialized memory to avoid uninitialized loads later
+ // Set zero to uninitialized memory to avoid uninitialized loads later
*(uint32_t *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128());
__m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
@@ -62,7 +62,7 @@
_mm_storeu_si128((__m128i *)(dst + j + 10), vres2);
}
- // Set zero to unitialized memory to avoid uninitialized loads later
+ // Set zero to uninitialized memory to avoid uninitialized loads later
*(uint32_t *)(dst + block_width + 2) =
_mm_cvtsi128_si32(_mm_setzero_si128());
@@ -105,7 +105,8 @@
static void apply_temporal_filter_planewise(
const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
const unsigned int stride2, const int block_width, const int block_height,
- const double sigma, const int decay_control, unsigned int *accumulator,
+ const double sigma, const int decay_control, const int use_subblock,
+ const int block_mse, const int *subblock_mses, unsigned int *accumulator,
uint16_t *count, uint16_t *luma_sq_error, uint16_t *chroma_sq_error,
int plane, int ss_x_shift, int ss_y_shift) {
assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
@@ -196,8 +197,14 @@
}
}
+ const double window_error = (double)(diff_sse) / num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error =
+ (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
const double scaled_diff =
- AOMMAX(-(double)(diff_sse / num_ref_pixels) / (2 * h * h), -15.0);
+ AOMMAX(-(window_error + block_error / 10) / (2 * h * h), -15.0);
const int adjusted_weight =
(int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
@@ -210,7 +217,8 @@
void av1_apply_temporal_filter_planewise_sse2(
const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
- const int num_planes, const double *noise_levels, const uint8_t *pred,
+ const int num_planes, const double *noise_levels, const int use_subblock,
+ const int block_mse, const int *subblock_mses, const uint8_t *pred,
uint32_t *accum, uint16_t *count) {
const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
if (is_high_bitdepth) {
@@ -244,9 +252,9 @@
apply_temporal_filter_planewise(
ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
- noise_levels[plane], decay_control, accum + mb_pels * plane,
- count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
- ss_x_shift, ss_y_shift);
+ noise_levels[plane], decay_control, use_subblock, block_mse,
+ subblock_mses, accum + mb_pels * plane, count + mb_pels * plane,
+ luma_sq_error, chroma_sq_error, plane, ss_x_shift, ss_y_shift);
}
if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
}
diff --git a/test/temporal_filter_planewise_test.cc b/test/temporal_filter_planewise_test.cc
index 3c44600..b19ec29 100644
--- a/test/temporal_filter_planewise_test.cc
+++ b/test/temporal_filter_planewise_test.cc
@@ -40,7 +40,8 @@
typedef void (*TemporalFilterPlanewiseFunc)(
const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
- const int num_planes, const double *noise_level, const uint8_t *pred,
+ const int num_planes, const double *noise_level, const int use_subblock,
+ const int block_mse, const int *subblock_mses, const uint8_t *pred,
uint32_t *accum, uint16_t *count);
typedef libaom_test::FuncParam<TemporalFilterPlanewiseFunc>
TemporalFilterPlanewiseFuncParam;
@@ -124,6 +125,9 @@
assert(width == 32 && height == 32);
const BLOCK_SIZE block_size = BLOCK_32X32;
+ const int use_subblock = 0;
+ const int block_mse = 0;
+ const int subblock_mses[4] = { 0, 0, 0, 0 };
const int mb_row = 0;
const int mb_col = 0;
const int num_planes = 1;
@@ -143,15 +147,18 @@
mbd->bd = 8;
params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
- sigma, src2_, accumulator_ref, count_ref);
+ sigma, use_subblock, block_mse, subblock_mses, src2_,
+ accumulator_ref, count_ref);
params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
- sigma, src2_, accumulator_mod, count_mod);
+ sigma, use_subblock, block_mse, subblock_mses, src2_,
+ accumulator_mod, count_mod);
if (run_times > 1) {
aom_usec_timer_start(&ref_timer);
for (int j = 0; j < run_times; j++) {
params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
- sigma, src2_, accumulator_ref, count_ref);
+ sigma, use_subblock, block_mse, subblock_mses, src2_,
+ accumulator_ref, count_ref);
}
aom_usec_timer_mark(&ref_timer);
const int elapsed_time_c =
@@ -160,7 +167,8 @@
aom_usec_timer_start(&test_timer);
for (int j = 0; j < run_times; j++) {
params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
- sigma, src2_, accumulator_mod, count_mod);
+ sigma, use_subblock, block_mse, subblock_mses, src2_,
+ accumulator_mod, count_mod);
}
aom_usec_timer_mark(&test_timer);
const int elapsed_time_simd =