Add AVX2 intrinsic for av1_estimate_noise_from_single_plane()

This CL adds AVX2 intrinsic support for
av1_estimate_noise_from_single_plane() and relevant unit test.
Module level scaling w.r.t. C is ~5.0x.

Change-Id: Ic1ac8ad200870bea97ee93791cec708c764b25dd
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index ccecc5e..9f9961a 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -404,9 +404,14 @@
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
     add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
     specialize qw/av1_apply_temporal_filter sse2 avx2 neon/;
+
+    add_proto qw/double av1_estimate_noise_from_single_plane/, "const uint8_t *src, int height, int width, int stride, int edge_thresh";
+    specialize qw/av1_estimate_noise_from_single_plane avx2/;
     if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
       add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
       specialize qw/av1_highbd_apply_temporal_filter sse2 avx2/;
+
+      add_proto qw/double av1_highbd_estimate_noise_from_single_plane/, "const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh";
     }
   }
 
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 85f2f95..a635149 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -744,9 +744,10 @@
                                !frame_params->show_existing_frame &&
                                !is_lossless_requested(&oxcf->rc_cfg);
       if (allow_kf_filtering) {
-        const double y_noise_level = av1_estimate_noise_from_single_plane(
-            frame_input->source, 0, cm->seq_params->bit_depth,
-            NOISE_ESTIMATION_EDGE_THRESHOLD);
+        double y_noise_level = 0.0;
+        av1_estimate_noise_level(
+            frame_input->source, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+            cm->seq_params->bit_depth, NOISE_ESTIMATION_EDGE_THRESHOLD);
         apply_filtering = y_noise_level > 0;
       } else {
         apply_filtering = 0;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index b89a881..24e8a0d 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4107,10 +4107,10 @@
       // No noise synthesis if source is very clean.
       // Uses a low edge threshold to focus on smooth areas.
       // Increase output noise setting a little compared to measured value.
-      cpi->oxcf.noise_level =
-          (float)(av1_estimate_noise_from_single_plane(
-                      sd, 0, cm->seq_params->bit_depth, 16) -
-                  0.1);
+      double y_noise_level = 0.0;
+      av1_estimate_noise_level(sd, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+                               cm->seq_params->bit_depth, 16);
+      cpi->oxcf.noise_level = (float)(y_noise_level - 0.1);
       cpi->oxcf.noise_level = (float)AOMMAX(0.0, cpi->oxcf.noise_level);
       if (cpi->oxcf.noise_level > 0.0) {
         cpi->oxcf.noise_level += (float)0.5;
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index c7c05fd..050c701 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -1008,11 +1008,9 @@
   const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img;
   const int num_planes = av1_num_planes(&cpi->common);
   double *noise_levels = tf_ctx->noise_levels;
-  for (int plane = 0; plane < num_planes; ++plane) {
-    noise_levels[plane] = av1_estimate_noise_from_single_plane(
-        to_filter_frame, plane, cpi->common.seq_params->bit_depth,
-        NOISE_ESTIMATION_EDGE_THRESHOLD);
-  }
+  av1_estimate_noise_level(to_filter_frame, noise_levels, AOM_PLANE_Y,
+                           num_planes - 1, cpi->common.seq_params->bit_depth,
+                           NOISE_ESTIMATION_EDGE_THRESHOLD);
   // Get quantization factor.
   const int q = av1_get_q(cpi);
   // Get correlation estimates from first-pass;
@@ -1121,21 +1119,50 @@
 
 /*!\cond */
 
-// A constant number, sqrt(pi / 2),  used for noise estimation.
-static const double SQRT_PI_BY_2 = 1.25331413732;
+double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height,
+                                              int width, int stride,
+                                              int edge_thresh) {
+  int64_t accum = 0;
+  int count = 0;
 
-double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
-                                            const int plane,
-                                            const int bit_depth,
-                                            const int edge_thresh) {
-  const int is_y_plane = (plane == 0);
-  const int height = frame->crop_heights[is_y_plane ? 0 : 1];
-  const int width = frame->crop_widths[is_y_plane ? 0 : 1];
-  const int stride = frame->strides[is_y_plane ? 0 : 1];
-  const uint8_t *src = frame->buffers[plane];
-  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-  const int is_high_bitdepth = is_frame_high_bitdepth(frame);
+  for (int i = 1; i < height - 1; ++i) {
+    for (int j = 1; j < width - 1; ++j) {
+      // Setup a small 3x3 matrix.
+      const int center_idx = i * stride + j;
+      int mat[3][3];
+      for (int ii = -1; ii <= 1; ++ii) {
+        for (int jj = -1; jj <= 1; ++jj) {
+          const int idx = center_idx + ii * stride + jj;
+          mat[ii + 1][jj + 1] = src[idx];
+        }
+      }
+      // Compute sobel gradients.
+      const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), 0);
+      // Accumulate Laplacian.
+      if (Ga < edge_thresh) {  // Only count smooth pixels.
+        const int v = 4 * mat[1][1] -
+                      2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                      (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+        accum += ROUND_POWER_OF_TWO(abs(v), 0);
+        ++count;
+      }
+    }
+  }
 
+  // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+  return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+double av1_highbd_estimate_noise_from_single_plane(const uint16_t *src16,
+                                                   int height, int width,
+                                                   const int stride,
+                                                   int bit_depth,
+                                                   int edge_thresh) {
   int64_t accum = 0;
   int count = 0;
   for (int i = 1; i < height - 1; ++i) {
@@ -1146,7 +1173,7 @@
       for (int ii = -1; ii <= 1; ++ii) {
         for (int jj = -1; jj <= 1; ++jj) {
           const int idx = center_idx + ii * stride + jj;
-          mat[ii + 1][jj + 1] = is_high_bitdepth ? src16[idx] : src[idx];
+          mat[ii + 1][jj + 1] = src16[idx];
         }
       }
       // Compute sobel gradients.
@@ -1169,6 +1196,35 @@
   // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
   return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
 }
+#endif
+
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+                              double *noise_level, int plane_from, int plane_to,
+                              int bit_depth, int edge_thresh) {
+  for (int plane = plane_from; plane <= plane_to; plane++) {
+    const bool is_uv_plane = (plane != AOM_PLANE_Y);
+    const int height = frame->crop_heights[is_uv_plane];
+    const int width = frame->crop_widths[is_uv_plane];
+    const int stride = frame->strides[is_uv_plane];
+    const uint8_t *src = frame->buffers[plane];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+    const int is_high_bitdepth = is_frame_high_bitdepth(frame);
+    if (is_high_bitdepth) {
+      noise_level[plane] = av1_highbd_estimate_noise_from_single_plane(
+          src16, height, width, stride, bit_depth, edge_thresh);
+    } else {
+      noise_level[plane] = av1_estimate_noise_from_single_plane(
+          src, height, width, stride, edge_thresh);
+    }
+#else
+    (void)bit_depth;
+    noise_level[plane] = av1_estimate_noise_from_single_plane(
+        src, height, width, stride, edge_thresh);
+#endif
+  }
+}
 
 // Initializes the members of TemporalFilterCtx
 // Inputs:
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 725bd86..8aa4731 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -33,6 +33,9 @@
 // Window size for temporal filtering.
 #define TF_WINDOW_LENGTH 5
 
+// A constant number, sqrt(pi / 2),  used for noise estimation.
+static const double SQRT_PI_BY_2 = 1.25331413732;
+
 // Hyper-parameters used to compute filtering weight. These hyper-parameters can
 // be tuned for a better performance.
 // 0. A scale factor used in temporal filtering to raise the filter weight from
@@ -268,15 +271,15 @@
 // Signal Processing, 2008, St Julians, Malta.
 // Inputs:
 //   frame: Pointer to the frame to estimate noise level from.
-//   plane: Index of the plane used for noise estimation. Commonly, 0 for
-//          Y-plane, 1 for U-plane, and 2 for V-plane.
+//   noise_level: Pointer to store the estimated noise.
+//   plane_from: Index of the starting plane used for noise estimation.
+//               Commonly, 0 for Y-plane, 1 for U-plane, and 2 for V-plane.
+//   plane_to: Index of the end plane used for noise estimation.
 //   bit_depth: Actual bit-depth instead of the encoding bit-depth of the frame.
-// Returns:
-//   The estimated noise, or -1.0 if there are too few smooth pixels.
-double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
-                                            const int plane,
-                                            const int bit_depth,
-                                            const int edge_thresh);
+//   edge_thresh: Edge threshold.
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+                              double *noise_level, int plane_from, int plane_to,
+                              int bit_depth, int edge_thresh);
 /*!\endcond */
 
 /*!\brief Does temporal filter for a given macroblock row.
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index 6dff786..752d6f3 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -30,6 +30,205 @@
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 }
 };
 
+#define CALC_X_GRADIENT(AC, GI, DF, out) \
+  out = _mm256_abs_epi16(                \
+      _mm256_add_epi16(_mm256_add_epi16(AC, GI), _mm256_slli_epi16(DF, 1)));
+
+#define CALC_Y_GRADIENT(AC, GI, BH, out) \
+  out = _mm256_abs_epi16(                \
+      _mm256_add_epi16(_mm256_sub_epi16(AC, GI), _mm256_slli_epi16(BH, 1)));
+
+double av1_estimate_noise_from_single_plane_avx2(const uint8_t *src, int height,
+                                                 int width, int stride,
+                                                 int edge_thresh) {
+  int count = 0;
+  int64_t accum = 0;
+  // w32 stores width multiple of 32.
+  const int w32 = (width - 1) & ~0x1f;
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i edge_threshold = _mm256_set1_epi16(edge_thresh);
+  __m256i num_accumulator = zero;
+  __m256i sum_accumulator = zero;
+
+  //  A | B | C
+  //  D | E | F
+  //  G | H | I
+  // g_x = (A - C) + (G - I) + 2*(D - F)
+  // g_y = (A + C) - (G + I) + 2*(B - H)
+  // v   = 4*E - 2*(D+F+B+H) + (A+C+G+I)
+
+  // Process the width multiple of 32 here.
+  for (int w = 1; w < w32; w += 32) {
+    int h = 1;
+    const int start_idx = h * stride + w;
+    const int stride_0 = start_idx - stride;
+
+    __m256i num_accum_row_lvl = zero;
+    const __m256i A = _mm256_loadu_si256((__m256i *)(&src[stride_0 - 1]));
+    const __m256i C = _mm256_loadu_si256((__m256i *)(&src[stride_0 + 1]));
+    const __m256i D = _mm256_loadu_si256((__m256i *)(&src[start_idx - 1]));
+    const __m256i F = _mm256_loadu_si256((__m256i *)(&src[start_idx + 1]));
+    __m256i B = _mm256_loadu_si256((__m256i *)(&src[stride_0]));
+    __m256i E = _mm256_loadu_si256((__m256i *)(&src[start_idx]));
+
+    const __m256i A_lo = _mm256_unpacklo_epi8(A, zero);
+    const __m256i A_hi = _mm256_unpackhi_epi8(A, zero);
+    const __m256i C_lo = _mm256_unpacklo_epi8(C, zero);
+    const __m256i C_hi = _mm256_unpackhi_epi8(C, zero);
+    const __m256i D_lo = _mm256_unpacklo_epi8(D, zero);
+    const __m256i D_hi = _mm256_unpackhi_epi8(D, zero);
+    const __m256i F_lo = _mm256_unpacklo_epi8(F, zero);
+    const __m256i F_hi = _mm256_unpackhi_epi8(F, zero);
+
+    __m256i sub_AC_lo = _mm256_sub_epi16(A_lo, C_lo);
+    __m256i sub_AC_hi = _mm256_sub_epi16(A_hi, C_hi);
+    __m256i sum_AC_lo = _mm256_add_epi16(A_lo, C_lo);
+    __m256i sum_AC_hi = _mm256_add_epi16(A_hi, C_hi);
+    __m256i sub_DF_lo = _mm256_sub_epi16(D_lo, F_lo);
+    __m256i sub_DF_hi = _mm256_sub_epi16(D_hi, F_hi);
+    __m256i sum_DF_lo = _mm256_add_epi16(D_lo, F_lo);
+    __m256i sum_DF_hi = _mm256_add_epi16(D_hi, F_hi);
+
+    for (; h < height - 1; h++) {
+      __m256i sum_GI_lo, sub_GI_lo, sum_GI_hi, sub_GI_hi, gx_lo, gy_lo, gx_hi,
+          gy_hi;
+      const int k = h * stride + w;
+      const __m256i G = _mm256_loadu_si256((__m256i *)(&src[k + stride - 1]));
+      const __m256i H = _mm256_loadu_si256((__m256i *)(&src[k + stride]));
+      const __m256i I = _mm256_loadu_si256((__m256i *)(&src[k + stride + 1]));
+
+      const __m256i B_lo = _mm256_unpacklo_epi8(B, zero);
+      const __m256i B_hi = _mm256_unpackhi_epi8(B, zero);
+      const __m256i G_lo = _mm256_unpacklo_epi8(G, zero);
+      const __m256i G_hi = _mm256_unpackhi_epi8(G, zero);
+      const __m256i I_lo = _mm256_unpacklo_epi8(I, zero);
+      const __m256i I_hi = _mm256_unpackhi_epi8(I, zero);
+      const __m256i H_lo = _mm256_unpacklo_epi8(H, zero);
+      const __m256i H_hi = _mm256_unpackhi_epi8(H, zero);
+
+      sub_GI_lo = _mm256_sub_epi16(G_lo, I_lo);
+      sub_GI_hi = _mm256_sub_epi16(G_hi, I_hi);
+      sum_GI_lo = _mm256_add_epi16(G_lo, I_lo);
+      sum_GI_hi = _mm256_add_epi16(G_hi, I_hi);
+      const __m256i sub_BH_lo = _mm256_sub_epi16(B_lo, H_lo);
+      const __m256i sub_BH_hi = _mm256_sub_epi16(B_hi, H_hi);
+
+      CALC_X_GRADIENT(sub_AC_lo, sub_GI_lo, sub_DF_lo, gx_lo)
+      CALC_Y_GRADIENT(sum_AC_lo, sum_GI_lo, sub_BH_lo, gy_lo)
+
+      const __m256i ga_lo = _mm256_add_epi16(gx_lo, gy_lo);
+
+      CALC_X_GRADIENT(sub_AC_hi, sub_GI_hi, sub_DF_hi, gx_hi)
+      CALC_Y_GRADIENT(sum_AC_hi, sum_GI_hi, sub_BH_hi, gy_hi)
+
+      const __m256i ga_hi = _mm256_add_epi16(gx_hi, gy_hi);
+
+      __m256i cmp_lo = _mm256_cmpgt_epi16(edge_threshold, ga_lo);
+      __m256i cmp_hi = _mm256_cmpgt_epi16(edge_threshold, ga_hi);
+      const __m256i comp_reg = _mm256_add_epi16(cmp_lo, cmp_hi);
+
+      // v = 4*E -2*(D+F+B+H) + (A+C+G+I)
+      if (_mm256_movemask_epi8(comp_reg) != 0) {
+        const __m256i sum_BH_lo = _mm256_add_epi16(B_lo, H_lo);
+        const __m256i sum_BH_hi = _mm256_add_epi16(B_hi, H_hi);
+
+        // 2*(D+F+B+H)
+        const __m256i sum_DFBH_lo =
+            _mm256_slli_epi16(_mm256_add_epi16(sum_DF_lo, sum_BH_lo), 1);
+        // (A+C+G+I)
+        const __m256i sum_ACGI_lo = _mm256_add_epi16(sum_AC_lo, sum_GI_lo);
+        const __m256i sum_DFBH_hi =
+            _mm256_slli_epi16(_mm256_add_epi16(sum_DF_hi, sum_BH_hi), 1);
+        const __m256i sum_ACGI_hi = _mm256_add_epi16(sum_AC_hi, sum_GI_hi);
+
+        // Convert E register values from 8bit to 16bit
+        const __m256i E_lo = _mm256_unpacklo_epi8(E, zero);
+        const __m256i E_hi = _mm256_unpackhi_epi8(E, zero);
+
+        // 4*E - 2*(D+F+B+H)+ (A+C+G+I)
+        const __m256i var_lo_0 = _mm256_abs_epi16(_mm256_add_epi16(
+            _mm256_sub_epi16(_mm256_slli_epi16(E_lo, 2), sum_DFBH_lo),
+            sum_ACGI_lo));
+        const __m256i var_hi_0 = _mm256_abs_epi16(_mm256_add_epi16(
+            _mm256_sub_epi16(_mm256_slli_epi16(E_hi, 2), sum_DFBH_hi),
+            sum_ACGI_hi));
+        cmp_lo = _mm256_srli_epi16(cmp_lo, 15);
+        cmp_hi = _mm256_srli_epi16(cmp_hi, 15);
+        const __m256i var_lo = _mm256_mullo_epi16(var_lo_0, cmp_lo);
+        const __m256i var_hi = _mm256_mullo_epi16(var_hi_0, cmp_hi);
+
+        num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_lo);
+        num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_hi);
+
+        sum_accumulator = _mm256_add_epi32(sum_accumulator,
+                                           _mm256_unpacklo_epi16(var_lo, zero));
+        sum_accumulator = _mm256_add_epi32(sum_accumulator,
+                                           _mm256_unpackhi_epi16(var_lo, zero));
+        sum_accumulator = _mm256_add_epi32(sum_accumulator,
+                                           _mm256_unpacklo_epi16(var_hi, zero));
+        sum_accumulator = _mm256_add_epi32(sum_accumulator,
+                                           _mm256_unpackhi_epi16(var_hi, zero));
+      }
+      sub_AC_lo = sub_DF_lo;
+      sub_AC_hi = sub_DF_hi;
+      sub_DF_lo = sub_GI_lo;
+      sub_DF_hi = sub_GI_hi;
+      sum_AC_lo = sum_DF_lo;
+      sum_AC_hi = sum_DF_hi;
+      sum_DF_lo = sum_GI_lo;
+      sum_DF_hi = sum_GI_hi;
+      B = E;
+      E = H;
+    }
+    const __m256i num_0 = _mm256_unpacklo_epi16(num_accum_row_lvl, zero);
+    const __m256i num_1 = _mm256_unpackhi_epi16(num_accum_row_lvl, zero);
+    num_accumulator =
+        _mm256_add_epi32(num_accumulator, _mm256_add_epi32(num_0, num_1));
+  }
+
+  // Process the remaining width here.
+  for (int h = 1; h < height - 1; ++h) {
+    for (int w = w32 + 1; w < width - 1; ++w) {
+      const int k = h * stride + w;
+
+      // Compute sobel gradients
+      const int g_x = (src[k - stride - 1] - src[k - stride + 1]) +
+                      (src[k + stride - 1] - src[k + stride + 1]) +
+                      2 * (src[k - 1] - src[k + 1]);
+      const int g_y = (src[k - stride - 1] - src[k + stride - 1]) +
+                      (src[k - stride + 1] - src[k + stride + 1]) +
+                      2 * (src[k - stride] - src[k + stride]);
+      const int ga = abs(g_x) + abs(g_y);
+
+      if (ga < edge_thresh) {
+        // Find Laplacian
+        const int v =
+            4 * src[k] -
+            2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) +
+            (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] +
+             src[k + stride + 1]);
+        accum += abs(v);
+        ++count;
+      }
+    }
+  }
+
+  // s0 s1 n0 n1 s2 s3 n2 n3
+  __m256i sum_avx = _mm256_hadd_epi32(sum_accumulator, num_accumulator);
+  __m128i sum_avx_lo = _mm256_castsi256_si128(sum_avx);
+  __m128i sum_avx_hi = _mm256_extractf128_si256(sum_avx, 1);
+  // s0+s2 s1+s3 n0+n2 n1+n3
+  __m128i sum_avx_1 = _mm_add_epi32(sum_avx_lo, sum_avx_hi);
+  // s0+s2+s1+s3 n0+n2+n1+n3
+  __m128i result = _mm_add_epi32(_mm_srli_si128(sum_avx_1, 4), sum_avx_1);
+
+  accum += _mm_cvtsi128_si32(result);
+  count += _mm_extract_epi32(result, 2);
+
+  // If very few smooth pels, return -1 since the estimate is unreliable.
+  return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
 static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 72e421e..0089023 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -308,6 +308,97 @@
                                  Values(0, 1)));
 #endif  // HAVE_NEON
 
+typedef double (*EstimateNoiseFunc)(const uint8_t *src, int height, int width,
+                                    int stride, int edge_thresh);
+
+typedef std::tuple<EstimateNoiseFunc, EstimateNoiseFunc, int, int>
+    EstimateNoiseWithParam;
+
+class EstimateNoiseTest
+    : public ::testing::TestWithParam<EstimateNoiseWithParam> {
+ public:
+  virtual ~EstimateNoiseTest() {}
+  virtual void SetUp() {
+    ref_func = GET_PARAM(0);
+    tst_func = GET_PARAM(1);
+    width_ = GET_PARAM(2);
+    height_ = GET_PARAM(3);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src1_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(8, sizeof(uint8_t) * width_ * height_));
+    GenRandomData(width_ * height_);
+    ASSERT_NE(src1_, nullptr);
+  }
+
+  virtual void TearDown() { aom_free(src1_); }
+
+  void RunTest(int run_times) {
+    stride_ = width_;
+
+    for (int i = 0; i < run_times; i++) {
+      double ref_out = ref_func(src1_, height_, width_, stride_,
+                                NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+      double tst_out = tst_func(src1_, height_, width_, stride_,
+                                NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+      EXPECT_EQ(ref_out, tst_out);
+    }
+  }
+
+  void SpeedTest(int run_times) {
+    stride_ = width_;
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; i++) {
+      ref_func(src1_, height_, width_, stride_,
+               NOISE_ESTIMATION_EDGE_THRESHOLD);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; i++) {
+      tst_func(src1_, height_, width_, stride_,
+               NOISE_ESTIMATION_EDGE_THRESHOLD);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("(%3.2f)\n", time1 / time2);
+  }
+
+  void GenRandomData(int size) {
+    for (int ii = 0; ii < size; ii++) src1_[ii] = rnd_.Rand8();
+  }
+
+ protected:
+  EstimateNoiseFunc ref_func;
+  EstimateNoiseFunc tst_func;
+  ACMRandom rnd_;
+  uint8_t *src1_;
+  int width_;
+  int height_;
+  int stride_;
+};
+
+TEST_P(EstimateNoiseTest, RandomValues) { RunTest(1); }
+
+TEST_P(EstimateNoiseTest, DISABLED_Speed) { SpeedTest(2000); }
+
+#if HAVE_AVX2
+// Width and height for which av1_estimate_noise_from_single_plane() will be
+// tested.
+int width[] = { 3840, 1920, 1280, 800, 640, 360, 357 };
+int height[] = { 2160, 1080, 720, 600, 480, 240, 237 };
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, EstimateNoiseTest,
+    ::testing::Combine(
+        ::testing::Values(av1_estimate_noise_from_single_plane_c),
+        ::testing::Values(av1_estimate_noise_from_single_plane_avx2),
+        ::testing::ValuesIn(width), ::testing::ValuesIn(height)));
+#endif  // HAVE_AVX2
+
 #if CONFIG_AV1_HIGHBITDEPTH
 
 typedef void (*HBDTemporalFilterFunc)(