Speed up weight calculation during highbd temporal filtering
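In the parent version, the weights used during highbd temporal
filtering were always calculated using the exp() function. In this CL,
exp() is replaced by approx_exp() when the weight_calc_level_in_tf
speed feature is nonzero (it is set to 1 for speed >= 5).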
                Instruction Count        BD-Rate Loss(%)
cpu   Testset     Reduction(%)     avg.psnr   ovr.psnr    ssim
 5    LOWRES2        2.273          0.0175     0.0476    0.0476
 5    MIDRES2        2.802          0.0673     0.0309    0.0309
 5    HDRES2         3.333          0.0116    -0.0029   -0.0029
 6    LOWRES2        1.218          0.0246     0.0398    0.0398
 6    MIDRES2        2.438          0.0379     0.0366    0.0366
 6    HDRES2         2.853          0.0406     0.0114    0.0114
STATS_CHANGED for highbd,speed=5,6
Change-Id: I3b68a8687697fbd6af634a21cf8a45be6cf94973
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index ac44a5c..0a6db02 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -405,7 +405,7 @@
add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
specialize qw/av1_apply_temporal_filter sse2 avx2 neon/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+ add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
specialize qw/av1_highbd_apply_temporal_filter sse2 avx2/;
}
}
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 95a6060..438fd0f 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1183,8 +1183,7 @@
}
if (speed >= 5) {
- // TODO(Ranjit): Enable the optimization for highbd encoding mode
- sf->hl_sf.weight_calc_level_in_tf = use_hbd ? 0 : 1;
+ sf->hl_sf.weight_calc_level_in_tf = 1;
sf->fp_sf.reduce_mv_step_param = 4;
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 7ec4781..d58f326 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -725,11 +725,12 @@
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
const int num_planes, const double *noise_levels, const MV *subblock_mvs,
const int *subblock_mses, const int q_factor, const int filter_strength,
- const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col,
num_planes, noise_levels, subblock_mvs,
- subblock_mses, q_factor, filter_strength, 0, pred,
- accum, count);
+ subblock_mses, q_factor, filter_strength,
+ tf_wgt_calc_lvl, pred, accum, count);
}
#endif // CONFIG_AV1_HIGHBITDEPTH
/*!\brief Normalizes the accumulated filtering result to produce the filtered
@@ -818,6 +819,7 @@
const int mi_h = mi_size_high_log2[block_size];
const int mi_w = mi_size_wide_log2[block_size];
const int num_planes = av1_num_planes(&cpi->common);
+ const int weight_calc_level_in_tf = cpi->sf.hl_sf.weight_calc_level_in_tf;
uint32_t *accum = tf_data->accum;
uint16_t *count = tf_data->count;
uint8_t *pred = tf_data->pred;
@@ -874,13 +876,13 @@
av1_highbd_apply_temporal_filter(
frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
noise_levels, subblock_mvs, subblock_mses, q_factor,
- filter_strength, pred, accum, count);
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
} else {
#endif // CONFIG_AV1_HIGHBITDEPTH
av1_apply_temporal_filter_c(
frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
noise_levels, subblock_mvs, subblock_mses, q_factor,
- filter_strength, 0, pred, accum, count);
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
#if CONFIG_AV1_HIGHBITDEPTH
}
#endif // CONFIG_AV1_HIGHBITDEPTH
@@ -889,14 +891,12 @@
av1_apply_temporal_filter(
frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
noise_levels, subblock_mvs, subblock_mses, q_factor,
- filter_strength, cpi->sf.hl_sf.weight_calc_level_in_tf, pred,
- accum, count);
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
} else {
av1_apply_temporal_filter_c(
frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
noise_levels, subblock_mvs, subblock_mses, q_factor,
- filter_strength, cpi->sf.hl_sf.weight_calc_level_in_tf, pred,
- accum, count);
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
}
}
}
diff --git a/av1/encoder/x86/highbd_temporal_filter_avx2.c b/av1/encoder/x86/highbd_temporal_filter_avx2.c
index 68509fa..42848d7 100644
--- a/av1/encoder/x86/highbd_temporal_filter_avx2.c
+++ b/av1/encoder/x86/highbd_temporal_filter_avx2.c
@@ -13,6 +13,7 @@
#include <immintrin.h>
#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/temporal_filter.h"
@@ -147,7 +148,8 @@
const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
const double inv_num_ref_pixels, const double decay_factor,
- const double inv_factor, const double weight_factor, double *d_factor) {
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
assert(((block_width == 16) || (block_width == 32)) &&
((block_height == 16) || (block_height == 32)));
@@ -304,28 +306,57 @@
acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
}
- for (int i = 0, k = 0; i < block_height; i++) {
- for (int j = 0; j < block_width; j++, k++) {
- const int pixel_value = frame2[i * stride2 + j];
- uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
- // Scale down the difference for high bit depth input.
- diff_sse >>= ((bd - 8) * 2);
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
- const double window_error = diff_sse * inv_num_ref_pixels;
- const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
- const double block_error = (double)subblock_mses[subblock_idx];
- const double combined_error =
- weight_factor * window_error + block_error * inv_factor;
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
- double scaled_error =
- combined_error * d_factor[subblock_idx] * decay_factor;
- scaled_error = AOMMIN(scaled_error, 7);
- const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
- count[k] += weight;
- accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
}
}
}
@@ -335,7 +366,8 @@
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
const int num_planes, const double *noise_levels, const MV *subblock_mvs,
const int *subblock_mses, const int q_factor, const int filter_strength,
- const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
@@ -424,7 +456,7 @@
ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
- weight_factor, d_factor);
+ weight_factor, d_factor, tf_wgt_calc_lvl);
plane_offset += plane_h * plane_w;
}
}
diff --git a/av1/encoder/x86/highbd_temporal_filter_sse2.c b/av1/encoder/x86/highbd_temporal_filter_sse2.c
index 1bfdaf7..f48b27b 100644
--- a/av1/encoder/x86/highbd_temporal_filter_sse2.c
+++ b/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -13,6 +13,7 @@
#include <emmintrin.h>
#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/temporal_filter.h"
@@ -95,7 +96,8 @@
const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
const double inv_num_ref_pixels, const double decay_factor,
- const double inv_factor, const double weight_factor, double *d_factor) {
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
assert(((block_width == 16) || (block_width == 32)) &&
((block_height == 16) || (block_height == 32)));
@@ -179,28 +181,57 @@
}
}
- for (int i = 0, k = 0; i < block_height; i++) {
- for (int j = 0; j < block_width; j++, k++) {
- const int pixel_value = frame2[i * stride2 + j];
- uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
- // Scale down the difference for high bit depth input.
- diff_sse >>= ((bd - 8) * 2);
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
- const double window_error = diff_sse * inv_num_ref_pixels;
- const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
- const double block_error = (double)subblock_mses[subblock_idx];
- const double combined_error =
- weight_factor * window_error + block_error * inv_factor;
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
- double scaled_error =
- combined_error * d_factor[subblock_idx] * decay_factor;
- scaled_error = AOMMIN(scaled_error, 7);
- const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
- count[k] += weight;
- accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
}
}
}
@@ -210,7 +241,8 @@
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
const int num_planes, const double *noise_levels, const MV *subblock_mvs,
const int *subblock_mses, const int q_factor, const int filter_strength,
- const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
@@ -299,7 +331,7 @@
ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
- weight_factor, d_factor);
+ weight_factor, d_factor, tf_wgt_calc_lvl);
plane_offset += plane_h * plane_w;
}
}
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 79d7e52..be92063 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -316,7 +316,7 @@
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
const int num_planes, const double *noise_level, const MV *subblock_mvs,
const int *subblock_mses, const int q_factor, const int filter_strenght,
- const uint8_t *pred, uint32_t *accum, uint16_t *count);
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
typedef libaom_test::FuncParam<HBDTemporalFilterFunc>
HBDTemporalFilterFuncParam;
@@ -328,6 +328,7 @@
virtual ~HBDTemporalFilterTest() {}
virtual void SetUp() {
params_ = GET_PARAM(0);
+ tf_wgt_calc_lvl_ = GET_PARAM(1);
rnd_.Reset(ACMRandom::DeterministicSeed());
src1_ = reinterpret_cast<uint16_t *>(
aom_memalign(16, sizeof(uint16_t) * MAX_MB_PLANE * BH * BW));
@@ -389,6 +390,7 @@
protected:
HBDTemporalFilterFuncParam params_;
+ int tf_wgt_calc_lvl_;
uint16_t *src1_;
uint16_t *src2_;
ACMRandom rnd_;
@@ -477,20 +479,20 @@
params_.ref_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
num_planes, sigma, subblock_mvs, subblock_mses, q_factor,
- filter_strength, CONVERT_TO_BYTEPTR(src2_),
- accumulator_ref, count_ref);
+ filter_strength, tf_wgt_calc_lvl_,
+ CONVERT_TO_BYTEPTR(src2_), accumulator_ref, count_ref);
params_.tst_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
num_planes, sigma, subblock_mvs, subblock_mses, q_factor,
- filter_strength, CONVERT_TO_BYTEPTR(src2_),
- accumulator_mod, count_mod);
+ filter_strength, tf_wgt_calc_lvl_,
+ CONVERT_TO_BYTEPTR(src2_), accumulator_mod, count_mod);
if (run_times > 1) {
aom_usec_timer_start(&ref_timer);
for (int j = 0; j < run_times; j++) {
params_.ref_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
num_planes, sigma, subblock_mvs, subblock_mses,
- q_factor, filter_strength, CONVERT_TO_BYTEPTR(src2_),
- accumulator_ref, count_ref);
+ q_factor, filter_strength, tf_wgt_calc_lvl_,
+ CONVERT_TO_BYTEPTR(src2_), accumulator_ref, count_ref);
}
aom_usec_timer_mark(&ref_timer);
const int elapsed_time_c =
@@ -500,8 +502,8 @@
for (int j = 0; j < run_times; j++) {
params_.tst_func(ref_frame.get(), mbd.get(), block_size, mb_row, mb_col,
num_planes, sigma, subblock_mvs, subblock_mses,
- q_factor, filter_strength, CONVERT_TO_BYTEPTR(src2_),
- accumulator_mod, count_mod);
+ q_factor, filter_strength, tf_wgt_calc_lvl_,
+ CONVERT_TO_BYTEPTR(src2_), accumulator_mod, count_mod);
}
aom_usec_timer_mark(&test_timer);
const int elapsed_time_simd =
@@ -558,7 +560,7 @@
};
INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest,
Combine(ValuesIn(HBDtemporal_filter_test_sse2),
- Range(64, 65, 4)));
+ Values(0, 1)));
#endif // HAVE_SSE2
#if HAVE_AVX2
HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = {
@@ -567,7 +569,7 @@
};
INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
Combine(ValuesIn(HBDtemporal_filter_test_avx2),
- Range(64, 65, 4)));
+ Values(0, 1)));
#endif // HAVE_AVX2
#endif // CONFIG_AV1_HIGHBITDEPTH
} // namespace