Refactor weight calculation code in apply_temporal_filter()
In apply_temporal_filter(), a few of the variables used in the
weight calculation were recomputed for every pixel even though
they are constant at the sub-block level. In this CL, these
values are computed once beforehand and then looked up by
sub-block index in both the SSE2 and AVX2 variants of the
function.
For 'good' encoding:
                  Instruction Count         BD-Rate Loss(%)
cpu  Resolution   Reduction(%)        avg.psnr  ovr.psnr  ssim
 5   LOWRES2      0.201               0.0000    0.0000    0.0000
 5   MIDRES2      0.263               0.0000    0.0000    0.0000
 5   HDRES2       0.321               0.0000    0.0000    0.0000
 6   LOWRES2      0.115               0.0000    0.0000    0.0000
 6   MIDRES2      0.243               0.0000    0.0000    0.0000
 6   HDRES2       0.296               0.0000    0.0000    0.0000
Change-Id: I486418ff972940ac5863856a7364eb5374ed2b38
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index a9c80040..32b9d4d 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -192,20 +192,24 @@
}
}
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
for (int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame2[i * stride2 + j];
uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
const double window_error = diff_sse * inv_num_ref_pixels;
- const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
- const double block_error = (double)subblock_mses[subblock_idx];
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
const double combined_error =
- weight_factor * window_error + block_error * inv_factor;
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
- double scaled_error =
- combined_error * d_factor[subblock_idx] * decay_factor;
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
scaled_error = AOMMIN(scaled_error, 7);
const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 8be7164..9bb7148 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -168,20 +168,24 @@
}
}
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
for (int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame2[i * stride2 + j];
uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
const double window_error = diff_sse * inv_num_ref_pixels;
- const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
- const double block_error = (double)subblock_mses[subblock_idx];
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
const double combined_error =
- weight_factor * window_error + block_error * inv_factor;
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
- double scaled_error =
- combined_error * d_factor[subblock_idx] * decay_factor;
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
scaled_error = AOMMIN(scaled_error, 7);
const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);