Refactor weight calculation code in apply_temporal_filter()

In apply_temporal_filter(), a few of the variables used in
the weight calculation were recomputed for every pixel even
though they are constant at the sub-block level. In this CL,
these values are precomputed once per sub-block and then
looked up by sub-block index in both the SSE2 and AVX2
variants of the function.

For 'good' encoding,
               Instruction Count        BD-Rate Loss(%)
cpu Resolution   Reduction(%)    avg.psnr  ovr.psnr   ssim
 5   LOWRES2       0.201         0.0000     0.0000    0.0000
 5   MIDRES2       0.263         0.0000     0.0000    0.0000
 5    HDRES2       0.321         0.0000     0.0000    0.0000
 6   LOWRES2       0.115         0.0000     0.0000    0.0000
 6   MIDRES2       0.243         0.0000     0.0000    0.0000
 6    HDRES2       0.296         0.0000     0.0000    0.0000

Change-Id: I486418ff972940ac5863856a7364eb5374ed2b38
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index a9c80040..32b9d4d 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -192,20 +192,24 @@
     }
   }
 
+  double subblock_mses_scaled[4];
+  double d_factor_decayed[4];
+  for (int idx = 0; idx < 4; idx++) {
+    subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+    d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+  }
   for (int i = 0, k = 0; i < block_height; i++) {
+    const int y_blk_raster_offset = (i >= block_height / 2) * 2;
     for (int j = 0; j < block_width; j++, k++) {
       const int pixel_value = frame2[i * stride2 + j];
       uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
 
       const double window_error = diff_sse * inv_num_ref_pixels;
-      const int subblock_idx =
-          (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error = (double)subblock_mses[subblock_idx];
+      const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
       const double combined_error =
-          weight_factor * window_error + block_error * inv_factor;
+          weight_factor * window_error + subblock_mses_scaled[subblock_idx];
 
-      double scaled_error =
-          combined_error * d_factor[subblock_idx] * decay_factor;
+      double scaled_error = combined_error * d_factor_decayed[subblock_idx];
       scaled_error = AOMMIN(scaled_error, 7);
       const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 8be7164..9bb7148 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -168,20 +168,24 @@
     }
   }
 
+  double subblock_mses_scaled[4];
+  double d_factor_decayed[4];
+  for (int idx = 0; idx < 4; idx++) {
+    subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+    d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+  }
   for (int i = 0, k = 0; i < block_height; i++) {
+    const int y_blk_raster_offset = (i >= block_height / 2) * 2;
     for (int j = 0; j < block_width; j++, k++) {
       const int pixel_value = frame2[i * stride2 + j];
       uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
 
       const double window_error = diff_sse * inv_num_ref_pixels;
-      const int subblock_idx =
-          (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error = (double)subblock_mses[subblock_idx];
+      const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
       const double combined_error =
-          weight_factor * window_error + block_error * inv_factor;
+          weight_factor * window_error + subblock_mses_scaled[subblock_idx];
 
-      double scaled_error =
-          combined_error * d_factor[subblock_idx] * decay_factor;
+      double scaled_error = combined_error * d_factor_decayed[subblock_idx];
       scaled_error = AOMMIN(scaled_error, 7);
       const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);