Refactor weight calculation code in apply_temporal_filter()

In apply_temporal_filter(), a few of the variables related to weight
calculation were recomputed for every pixel even though they are constant
at the sub-block level. In this CL, the relevant calculations are done
beforehand, and the precomputed values are selected by sub-block index in
both the SSE2 and AVX2 variants of the function.

For 'good' encoding,

                  Instruction Count         BD-Rate Loss(%)
cpu   Resolution    Reduction(%)     avg.psnr   ovr.psnr    ssim
 5    LOWRES2          0.201          0.0000     0.0000    0.0000
 5    MIDRES2          0.263          0.0000     0.0000    0.0000
 5    HDRES2           0.321          0.0000     0.0000    0.0000
 6    LOWRES2          0.115          0.0000     0.0000    0.0000
 6    MIDRES2          0.243          0.0000     0.0000    0.0000
 6    HDRES2           0.296          0.0000     0.0000    0.0000

Change-Id: I486418ff972940ac5863856a7364eb5374ed2b38

commit: 5ad2a273a60bd6fe8c453997ac2c8b6497b1be1c [log] [tgz]
author: Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com> Fri Jan 06 17:20:42 2023 +0530
committer: Yunqing Wang <yunqingwang@google.com> Fri Jan 06 21:47:31 2023 +0000
tree: b386615d21905024602780291cfd305961fb8722
parent: ff2d57ad60950e268367f8b4b9bdc625c96376eb [diff]
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index a9c80040..32b9d4d 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c

@@ -192,20 +192,24 @@
     }
   }
 
+  double subblock_mses_scaled[4];
+  double d_factor_decayed[4];
+  for (int idx = 0; idx < 4; idx++) {
+    subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+    d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+  }
   for (int i = 0, k = 0; i < block_height; i++) {
+    const int y_blk_raster_offset = (i >= block_height / 2) * 2;
     for (int j = 0; j < block_width; j++, k++) {
       const int pixel_value = frame2[i * stride2 + j];
       uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
 
       const double window_error = diff_sse * inv_num_ref_pixels;
-      const int subblock_idx =
-          (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error = (double)subblock_mses[subblock_idx];
+      const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
       const double combined_error =
-          weight_factor * window_error + block_error * inv_factor;
+          weight_factor * window_error + subblock_mses_scaled[subblock_idx];
 
-      double scaled_error =
-          combined_error * d_factor[subblock_idx] * decay_factor;
+      double scaled_error = combined_error * d_factor_decayed[subblock_idx];
       scaled_error = AOMMIN(scaled_error, 7);
       const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 

diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 8be7164..9bb7148 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c

@@ -168,20 +168,24 @@
     }
   }
 
+  double subblock_mses_scaled[4];
+  double d_factor_decayed[4];
+  for (int idx = 0; idx < 4; idx++) {
+    subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+    d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+  }
   for (int i = 0, k = 0; i < block_height; i++) {
+    const int y_blk_raster_offset = (i >= block_height / 2) * 2;
     for (int j = 0; j < block_width; j++, k++) {
       const int pixel_value = frame2[i * stride2 + j];
       uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
 
       const double window_error = diff_sse * inv_num_ref_pixels;
-      const int subblock_idx =
-          (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error = (double)subblock_mses[subblock_idx];
+      const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
       const double combined_error =
-          weight_factor * window_error + block_error * inv_factor;
+          weight_factor * window_error + subblock_mses_scaled[subblock_idx];
 
-      double scaled_error =
-          combined_error * d_factor[subblock_idx] * decay_factor;
+      double scaled_error = combined_error * d_factor_decayed[subblock_idx];
       scaled_error = AOMMIN(scaled_error, 7);
       const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
commit	5ad2a273a60bd6fe8c453997ac2c8b6497b1be1c	[log] [tgz]
author	Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>	Fri Jan 06 17:20:42 2023 +0530
committer	Yunqing Wang <yunqingwang@google.com>	Fri Jan 06 21:47:31 2023 +0000
tree	b386615d21905024602780291cfd305961fb8722
parent	ff2d57ad60950e268367f8b4b9bdc625c96376eb [diff]