rtc: Fix to 4x4avg in variance partition.

If 4x4avg is used in variance partition, we need
to stay 4 pixels away from the superblock boundary when
temporal filtering is enabled. This is because the
4x4avg loads 8 pixels and the temporal filter may
modify the source data, so when multi-threading is
enabled this causes a data race reported by TSan.

This is the cause of the issue in: Bug:aomedia:3331

No stats changed, as 4x4avg is currently only used
on key frames, which do not have temporal filtering
or denoising.

Change-Id: Ie1df4c25c34924fd243bab935ffd232f9c89f4b4
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 94ddfa9..6d29bb7 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -383,14 +383,16 @@
                                             int highbd_flag,
 #endif
                                             int pixels_wide, int pixels_high,
-                                            int is_key_frame) {
+                                            int is_key_frame,
+                                            int border_offset_4x4) {
   int k;
   for (k = 0; k < 4; k++) {
     int x4_idx = x8_idx + ((k & 1) << 2);
     int y4_idx = y8_idx + ((k >> 1) << 2);
     unsigned int sse = 0;
     int sum = 0;
-    if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+    if (x4_idx < pixels_wide - border_offset_4x4 &&
+        y4_idx < pixels_high - border_offset_4x4) {
       int s_avg;
       int d_avg = 128;
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1021,13 +1023,23 @@
   const int compute_minmax_variance = 0;
   const int segment_id = xd->mi[0]->segment_id;
   int pixels_wide = 128, pixels_high = 128;
-
+  int border_offset_4x4 = 0;
+  int temporal_denoising = cpi->sf.rt_sf.use_rtc_tf;
   if (is_small_sb) {
     pixels_wide = 64;
     pixels_high = 64;
   }
   if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
   if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  temporal_denoising |= cpi->oxcf.noise_sensitivity;
+#endif
+  // When temporal filtering or the temporal denoiser is enabled, the source
+  // is modified, so avoid 4x4 avg along the superblock boundary: the simd
+  // code loads 8 pixels for 4x4 avg and so can access source data outside
+  // the superblock (while it is being modified by the temporal filter).
+  // Temporal filtering is never done on key frames.
+  if (!is_key_frame && temporal_denoising) border_offset_4x4 = 4;
   for (int m = 0; m < num_64x64_blocks; m++) {
     const int x64_idx = ((m & 1) << 6);
     const int y64_idx = ((m >> 1) << 6);
@@ -1107,12 +1119,12 @@
             int x8_idx = x16_idx + ((k & 1) << 3);
             int y8_idx = y16_idx + ((k >> 1) << 3);
             VP8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
-            fill_variance_4x4avg(src, src_stride, dst, dst_stride, x8_idx,
-                                 y8_idx, vst2,
+            fill_variance_4x4avg(
+                src, src_stride, dst, dst_stride, x8_idx, y8_idx, vst2,
 #if CONFIG_AV1_HIGHBITDEPTH
-                                 xd->cur_buf->flags,
+                xd->cur_buf->flags,
 #endif
-                                 pixels_wide, pixels_high, is_key_frame);
+                pixels_wide, pixels_high, is_key_frame, border_offset_4x4);
           }
         }
       }