Enable av1_temporal_filter_planewise intrinsics

The AVX2 and SSE2 varaints of av1_temporal_filter_planewise
are updated to comply with the modified C implementation.

                   Encode Time
       cpu-used     Reduction
           5          5.69%
           4          3.50%
           3          2.64%
           2          1.66%
           1          0.88%

Change-Id: If85689b6ab3eb1f1fd55027bf5abf7bfb5b3b3a5
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index f04cd0f..93a7066 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -952,12 +952,16 @@
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
   if (use_planewise_strategy) {  // Commonly used for high-resolution video.
-    // TODO(any): avx2 and sse2 version should also support high bit-depth, and
-    // they should be changed to consider cross-plane information (see C
-    // function) before using.
-    av1_apply_temporal_filter_planewise_c(frame_to_filter, mbd, block_size,
+    // TODO(any): avx2 and sse2 version should also support high bit-depth.
+    if (is_frame_high_bitdepth(frame_to_filter)) {
+      av1_apply_temporal_filter_planewise_c(frame_to_filter, mbd, block_size,
+                                            mb_row, mb_col, num_planes,
+                                            noise_levels, pred, accum, count);
+    } else {
+      av1_apply_temporal_filter_planewise(frame_to_filter, mbd, block_size,
                                           mb_row, mb_col, num_planes,
                                           noise_levels, pred, accum, count);
+    }
   } else {  // Commonly used for low-resolution video.
     const int adj_strength = strength + 2 * (mbd->bd - 8);
     if (num_planes == 1) {
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index 93d9186..88a74a2 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -19,10 +19,10 @@
 #define SSE_STRIDE (BW + 2)
 
 DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
-  { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000 },
-  { 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000 },
-  { 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000 },
-  { 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }
+  { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+  { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+  { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+  { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
 };
 
 DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = {
@@ -104,7 +104,7 @@
     // For the last column, replicate the last element twice to the right
     v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[1]);
   }
-  return _mm256_cvtepi16_epi32(v128tmp);
+  return _mm256_cvtepu16_epi32(v128tmp);
 }
 
 static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
@@ -131,16 +131,17 @@
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
     const double sigma, const int decay_control, unsigned int *accumulator,
-    uint16_t *count) {
-  const double h = decay_control * (0.7 + log(sigma + 1.0));
-  const double beta = 1.0;
-
-  uint16_t frame_sse[SSE_STRIDE * BH];
-  uint32_t acc_5x5_sse[BH][BW];
-
+    uint16_t *count, uint16_t *luma_sq_error, uint16_t *chroma_sq_error,
+    int plane, int ss_x_shift, int ss_y_shift) {
   assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
   assert(((block_width == 32) && (block_height == 32)) ||
          ((block_width == 16) && (block_height == 16)));
+  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
+
+  uint32_t acc_5x5_sse[BH][BW];
+  const double h = decay_control * (0.7 + log(sigma + 1.0));
+  uint16_t *frame_sse =
+      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
 
   if (block_width == 32) {
     get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
@@ -200,17 +201,29 @@
       const int pixel_value = frame2[i * stride2 + j];
 
       int diff_sse = acc_5x5_sse[i][j];
-      diff_sse /= (TF_PLANEWISE_FILTER_WINDOW_LENGTH *
-                   TF_PLANEWISE_FILTER_WINDOW_LENGTH);
+      int num_ref_pixels =
+          TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH;
 
-      double scaled_diff = -diff_sse / (2 * beta * h * h);
-      // clamp the value to avoid underflow in exp()
-      if (scaled_diff < -15) scaled_diff = -15;
-      double w = exp(scaled_diff);
-      const int weight = (int)(w * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+      // Filter U-plane and V-plane using Y-plane. This is because motion
+      // search is only done on Y-plane, so the information from Y-plane will
+      // be more accurate.
+      if (plane != PLANE_TYPE_Y) {
+        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+            const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+            const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+            diff_sse += luma_sq_error[yy * SSE_STRIDE + xx];
+            ++num_ref_pixels;
+          }
+        }
+      }
+      const double scaled_diff =
+          AOMMAX(-(double)(diff_sse / num_ref_pixels) / (2 * h * h), -15.0);
+      const int adjusted_weight =
+          (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
 
-      count[k] += weight;
-      accumulator[k] += weight * pixel_value;
+      count[k] += adjusted_weight;
+      accumulator[k] += adjusted_weight * pixel_value;
     }
   }
 }
@@ -232,6 +245,12 @@
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
   const int mb_pels = mb_height * mb_width;
+  uint16_t luma_sq_error[SSE_STRIDE * BH];
+  uint16_t *chroma_sq_error =
+      (num_planes > 0)
+          ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t))
+          : NULL;
+
   for (int plane = 0; plane < num_planes; ++plane) {
     const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
     const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
@@ -239,9 +258,16 @@
     const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
 
     const uint8_t *ref = ref_frame->buffers[plane] + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+
     apply_temporal_filter_planewise(
         ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
         noise_levels[plane], decay_control, accum + mb_pels * plane,
-        count + mb_pels * plane);
+        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
+        ss_x_shift, ss_y_shift);
   }
+  if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 0cfa841..5b85dca 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -20,10 +20,14 @@
 #define SSE_STRIDE (BW + 4)
 
 DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
-  { { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, { 0xFFFF, 0x0000, 0x0000, 0x0000 } },
-  { { 0x0000, 0xFFFF, 0xFFFF, 0xFFFF }, { 0xFFFF, 0xFFFF, 0x0000, 0x0000 } },
-  { { 0x0000, 0x0000, 0xFFFF, 0xFFFF }, { 0xFFFF, 0xFFFF, 0xFFFF, 0x0000 } },
-  { { 0x0000, 0x0000, 0x0000, 0xFFFF }, { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF } }
+  { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+  { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+  { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+  { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
 };
 
 static void get_squared_error(const uint8_t *frame1, const unsigned int stride,
@@ -102,16 +106,17 @@
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
     const double sigma, const int decay_control, unsigned int *accumulator,
-    uint16_t *count) {
-  const double h = decay_control * (0.7 + log(sigma + 1.0));
-  const double beta = 1.0;
-
-  uint16_t frame_sse[SSE_STRIDE * BH];
-  uint32_t acc_5x5_sse[BH][BW];
-
+    uint16_t *count, uint16_t *luma_sq_error, uint16_t *chroma_sq_error,
+    int plane, int ss_x_shift, int ss_y_shift) {
   assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
   assert(((block_width == 32) && (block_height == 32)) ||
          ((block_width == 16) && (block_height == 16)));
+  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
+
+  uint32_t acc_5x5_sse[BH][BW];
+  const double h = decay_control * (0.7 + log(sigma + 1.0));
+  uint16_t *frame_sse =
+      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
 
   get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
                     frame_sse, SSE_STRIDE);
@@ -173,17 +178,31 @@
       const int pixel_value = frame2[i * stride2 + j];
 
       int diff_sse = acc_5x5_sse[i][j];
-      diff_sse /= (TF_PLANEWISE_FILTER_WINDOW_LENGTH *
-                   TF_PLANEWISE_FILTER_WINDOW_LENGTH);
+      int num_ref_pixels =
+          TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH;
 
-      double scaled_diff = -diff_sse / (2 * beta * h * h);
-      // clamp the value to avoid underflow in exp()
-      if (scaled_diff < -15) scaled_diff = -15;
-      double w = exp(scaled_diff);
-      const int weight = (int)(w * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+      // Filter U-plane and V-plane using Y-plane. This is because motion
+      // search is only done on Y-plane, so the information from Y-plane will
+      // be more accurate.
+      if (plane != PLANE_TYPE_Y) {
+        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+            const int yy = (i << ss_y_shift) + ii;      // Y-coord on Y-plane.
+            const int xx = (j << ss_x_shift) + jj + 2;  // X-coord on Y-plane.
+            const int ww = SSE_STRIDE;                  // Stride of Y-plane.
+            diff_sse += luma_sq_error[yy * ww + xx];
+            ++num_ref_pixels;
+          }
+        }
+      }
 
-      count[k] += weight;
-      accumulator[k] += weight * pixel_value;
+      const double scaled_diff =
+          AOMMAX(-(double)(diff_sse / num_ref_pixels) / (2 * h * h), -15.0);
+      const int adjusted_weight =
+          (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+
+      count[k] += adjusted_weight;
+      accumulator[k] += adjusted_weight * pixel_value;
     }
   }
 }
@@ -205,6 +224,12 @@
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
   const int mb_pels = mb_height * mb_width;
+  uint16_t luma_sq_error[SSE_STRIDE * BH];
+  uint16_t *chroma_sq_error =
+      (num_planes > 0)
+          ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t))
+          : NULL;
+
   for (int plane = 0; plane < num_planes; ++plane) {
     const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
     const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
@@ -212,9 +237,16 @@
     const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
 
     const uint8_t *ref = ref_frame->buffers[plane] + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+
     apply_temporal_filter_planewise(
         ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
         noise_levels[plane], decay_control, accum + mb_pels * plane,
-        count + mb_pels * plane);
+        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
+        ss_x_shift, ss_y_shift);
   }
+  if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }