Add AVX2 intrinsic for av1_estimate_noise_from_single_plane()
This CL adds AVX2 intrinsic support for
av1_estimate_noise_from_single_plane() and relevant unit test.
Module level scaling w.r.t. C is ~5.0x.
Change-Id: Ic1ac8ad200870bea97ee93791cec708c764b25dd
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index ccecc5e..9f9961a 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -404,9 +404,14 @@
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
specialize qw/av1_apply_temporal_filter sse2 avx2 neon/;
+
+ add_proto qw/double av1_estimate_noise_from_single_plane/, "const uint8_t *src, int height, int width, int stride, int edge_thresh";
+ specialize qw/av1_estimate_noise_from_single_plane avx2/;
if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
specialize qw/av1_highbd_apply_temporal_filter sse2 avx2/;
+
+ add_proto qw/double av1_highbd_estimate_noise_from_single_plane/, "const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh";
}
}
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 85f2f95..a635149 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -744,9 +744,10 @@
!frame_params->show_existing_frame &&
!is_lossless_requested(&oxcf->rc_cfg);
if (allow_kf_filtering) {
- const double y_noise_level = av1_estimate_noise_from_single_plane(
- frame_input->source, 0, cm->seq_params->bit_depth,
- NOISE_ESTIMATION_EDGE_THRESHOLD);
+ double y_noise_level = 0.0;
+ av1_estimate_noise_level(
+ frame_input->source, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+ cm->seq_params->bit_depth, NOISE_ESTIMATION_EDGE_THRESHOLD);
apply_filtering = y_noise_level > 0;
} else {
apply_filtering = 0;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index b89a881..24e8a0d 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4107,10 +4107,10 @@
// No noise synthesis if source is very clean.
// Uses a low edge threshold to focus on smooth areas.
// Increase output noise setting a little compared to measured value.
- cpi->oxcf.noise_level =
- (float)(av1_estimate_noise_from_single_plane(
- sd, 0, cm->seq_params->bit_depth, 16) -
- 0.1);
+ double y_noise_level = 0.0;
+ av1_estimate_noise_level(sd, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+ cm->seq_params->bit_depth, 16);
+ cpi->oxcf.noise_level = (float)(y_noise_level - 0.1);
cpi->oxcf.noise_level = (float)AOMMAX(0.0, cpi->oxcf.noise_level);
if (cpi->oxcf.noise_level > 0.0) {
cpi->oxcf.noise_level += (float)0.5;
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index c7c05fd..050c701 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -1008,11 +1008,9 @@
const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img;
const int num_planes = av1_num_planes(&cpi->common);
double *noise_levels = tf_ctx->noise_levels;
- for (int plane = 0; plane < num_planes; ++plane) {
- noise_levels[plane] = av1_estimate_noise_from_single_plane(
- to_filter_frame, plane, cpi->common.seq_params->bit_depth,
- NOISE_ESTIMATION_EDGE_THRESHOLD);
- }
+ av1_estimate_noise_level(to_filter_frame, noise_levels, AOM_PLANE_Y,
+ num_planes - 1, cpi->common.seq_params->bit_depth,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
// Get quantization factor.
const int q = av1_get_q(cpi);
// Get correlation estimates from first-pass;
@@ -1121,21 +1119,50 @@
/*!\cond */
-// A constant number, sqrt(pi / 2), used for noise estimation.
-static const double SQRT_PI_BY_2 = 1.25331413732;
+double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height,
+ int width, int stride,
+ int edge_thresh) {
+ int64_t accum = 0;
+ int count = 0;
-double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
- const int plane,
- const int bit_depth,
- const int edge_thresh) {
- const int is_y_plane = (plane == 0);
- const int height = frame->crop_heights[is_y_plane ? 0 : 1];
- const int width = frame->crop_widths[is_y_plane ? 0 : 1];
- const int stride = frame->strides[is_y_plane ? 0 : 1];
- const uint8_t *src = frame->buffers[plane];
- const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
- const int is_high_bitdepth = is_frame_high_bitdepth(frame);
+ for (int i = 1; i < height - 1; ++i) {
+ for (int j = 1; j < width - 1; ++j) {
+ // Setup a small 3x3 matrix.
+ const int center_idx = i * stride + j;
+ int mat[3][3];
+ for (int ii = -1; ii <= 1; ++ii) {
+ for (int jj = -1; jj <= 1; ++jj) {
+ const int idx = center_idx + ii * stride + jj;
+ mat[ii + 1][jj + 1] = src[idx];
+ }
+ }
+ // Compute sobel gradients.
+ const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), 0);
+ // Accumulate Laplacian.
+ if (Ga < edge_thresh) { // Only count smooth pixels.
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ accum += ROUND_POWER_OF_TWO(abs(v), 0);
+ ++count;
+ }
+ }
+ }
+ // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+ return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+double av1_highbd_estimate_noise_from_single_plane(const uint16_t *src16,
+ int height, int width,
+ const int stride,
+ int bit_depth,
+ int edge_thresh) {
int64_t accum = 0;
int count = 0;
for (int i = 1; i < height - 1; ++i) {
@@ -1146,7 +1173,7 @@
for (int ii = -1; ii <= 1; ++ii) {
for (int jj = -1; jj <= 1; ++jj) {
const int idx = center_idx + ii * stride + jj;
- mat[ii + 1][jj + 1] = is_high_bitdepth ? src16[idx] : src[idx];
+ mat[ii + 1][jj + 1] = src16[idx];
}
}
// Compute sobel gradients.
@@ -1169,6 +1196,35 @@
// Return -1.0 (unreliable estimation) if there are too few smooth pixels.
return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
}
+#endif
+
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+ double *noise_level, int plane_from, int plane_to,
+ int bit_depth, int edge_thresh) {
+ for (int plane = plane_from; plane <= plane_to; plane++) {
+ const bool is_uv_plane = (plane != AOM_PLANE_Y);
+ const int height = frame->crop_heights[is_uv_plane];
+ const int width = frame->crop_widths[is_uv_plane];
+ const int stride = frame->strides[is_uv_plane];
+ const uint8_t *src = frame->buffers[plane];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const int is_high_bitdepth = is_frame_high_bitdepth(frame);
+ if (is_high_bitdepth) {
+ noise_level[plane] = av1_highbd_estimate_noise_from_single_plane(
+ src16, height, width, stride, bit_depth, edge_thresh);
+ } else {
+ noise_level[plane] = av1_estimate_noise_from_single_plane(
+ src, height, width, stride, edge_thresh);
+ }
+#else
+ (void)bit_depth;
+ noise_level[plane] = av1_estimate_noise_from_single_plane(
+ src, height, width, stride, edge_thresh);
+#endif
+ }
+}
// Initializes the members of TemporalFilterCtx
// Inputs:
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 725bd86..8aa4731 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -33,6 +33,9 @@
// Window size for temporal filtering.
#define TF_WINDOW_LENGTH 5
+// A constant number, sqrt(pi / 2), used for noise estimation.
+static const double SQRT_PI_BY_2 = 1.25331413732;
+
// Hyper-parameters used to compute filtering weight. These hyper-parameters can
// be tuned for a better performance.
// 0. A scale factor used in temporal filtering to raise the filter weight from
@@ -268,15 +271,15 @@
// Signal Processing, 2008, St Julians, Malta.
// Inputs:
// frame: Pointer to the frame to estimate noise level from.
-// plane: Index of the plane used for noise estimation. Commonly, 0 for
-// Y-plane, 1 for U-plane, and 2 for V-plane.
+// noise_level: Pointer to store the estimated noise.
+// plane_from: Index of the starting plane used for noise estimation.
+// Commonly, 0 for Y-plane, 1 for U-plane, and 2 for V-plane.
+// plane_to: Index of the end plane used for noise estimation.
// bit_depth: Actual bit-depth instead of the encoding bit-depth of the frame.
-// Returns:
-// The estimated noise, or -1.0 if there are too few smooth pixels.
-double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
- const int plane,
- const int bit_depth,
- const int edge_thresh);
+// edge_thresh: Edge threshold.
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+ double *noise_level, int plane_from, int plane_to,
+ int bit_depth, int edge_thresh);
/*!\endcond */
/*!\brief Does temporal filter for a given macroblock row.
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index 6dff786..752d6f3 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -30,6 +30,205 @@
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 }
};
+#define CALC_X_GRADIENT(AC, GI, DF, out) \
+ out = _mm256_abs_epi16( \
+ _mm256_add_epi16(_mm256_add_epi16(AC, GI), _mm256_slli_epi16(DF, 1)));
+
+#define CALC_Y_GRADIENT(AC, GI, BH, out) \
+ out = _mm256_abs_epi16( \
+ _mm256_add_epi16(_mm256_sub_epi16(AC, GI), _mm256_slli_epi16(BH, 1)));
+
+double av1_estimate_noise_from_single_plane_avx2(const uint8_t *src, int height,
+ int width, int stride,
+ int edge_thresh) {
+ int count = 0;
+ int64_t accum = 0;
+ // w32 stores width multiple of 32.
+ const int w32 = (width - 1) & ~0x1f;
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i edge_threshold = _mm256_set1_epi16(edge_thresh);
+ __m256i num_accumulator = zero;
+ __m256i sum_accumulator = zero;
+
+ // A | B | C
+ // D | E | F
+ // G | H | I
+ // g_x = (A - C) + (G - I) + 2*(D - F)
+ // g_y = (A + C) - (G + I) + 2*(B - H)
+ // v = 4*E - 2*(D+F+B+H) + (A+C+G+I)
+
+ // Process the width multiple of 32 here.
+ for (int w = 1; w < w32; w += 32) {
+ int h = 1;
+ const int start_idx = h * stride + w;
+ const int stride_0 = start_idx - stride;
+
+ __m256i num_accum_row_lvl = zero;
+ const __m256i A = _mm256_loadu_si256((__m256i *)(&src[stride_0 - 1]));
+ const __m256i C = _mm256_loadu_si256((__m256i *)(&src[stride_0 + 1]));
+ const __m256i D = _mm256_loadu_si256((__m256i *)(&src[start_idx - 1]));
+ const __m256i F = _mm256_loadu_si256((__m256i *)(&src[start_idx + 1]));
+ __m256i B = _mm256_loadu_si256((__m256i *)(&src[stride_0]));
+ __m256i E = _mm256_loadu_si256((__m256i *)(&src[start_idx]));
+
+ const __m256i A_lo = _mm256_unpacklo_epi8(A, zero);
+ const __m256i A_hi = _mm256_unpackhi_epi8(A, zero);
+ const __m256i C_lo = _mm256_unpacklo_epi8(C, zero);
+ const __m256i C_hi = _mm256_unpackhi_epi8(C, zero);
+ const __m256i D_lo = _mm256_unpacklo_epi8(D, zero);
+ const __m256i D_hi = _mm256_unpackhi_epi8(D, zero);
+ const __m256i F_lo = _mm256_unpacklo_epi8(F, zero);
+ const __m256i F_hi = _mm256_unpackhi_epi8(F, zero);
+
+ __m256i sub_AC_lo = _mm256_sub_epi16(A_lo, C_lo);
+ __m256i sub_AC_hi = _mm256_sub_epi16(A_hi, C_hi);
+ __m256i sum_AC_lo = _mm256_add_epi16(A_lo, C_lo);
+ __m256i sum_AC_hi = _mm256_add_epi16(A_hi, C_hi);
+ __m256i sub_DF_lo = _mm256_sub_epi16(D_lo, F_lo);
+ __m256i sub_DF_hi = _mm256_sub_epi16(D_hi, F_hi);
+ __m256i sum_DF_lo = _mm256_add_epi16(D_lo, F_lo);
+ __m256i sum_DF_hi = _mm256_add_epi16(D_hi, F_hi);
+
+ for (; h < height - 1; h++) {
+ __m256i sum_GI_lo, sub_GI_lo, sum_GI_hi, sub_GI_hi, gx_lo, gy_lo, gx_hi,
+ gy_hi;
+ const int k = h * stride + w;
+ const __m256i G = _mm256_loadu_si256((__m256i *)(&src[k + stride - 1]));
+ const __m256i H = _mm256_loadu_si256((__m256i *)(&src[k + stride]));
+ const __m256i I = _mm256_loadu_si256((__m256i *)(&src[k + stride + 1]));
+
+ const __m256i B_lo = _mm256_unpacklo_epi8(B, zero);
+ const __m256i B_hi = _mm256_unpackhi_epi8(B, zero);
+ const __m256i G_lo = _mm256_unpacklo_epi8(G, zero);
+ const __m256i G_hi = _mm256_unpackhi_epi8(G, zero);
+ const __m256i I_lo = _mm256_unpacklo_epi8(I, zero);
+ const __m256i I_hi = _mm256_unpackhi_epi8(I, zero);
+ const __m256i H_lo = _mm256_unpacklo_epi8(H, zero);
+ const __m256i H_hi = _mm256_unpackhi_epi8(H, zero);
+
+ sub_GI_lo = _mm256_sub_epi16(G_lo, I_lo);
+ sub_GI_hi = _mm256_sub_epi16(G_hi, I_hi);
+ sum_GI_lo = _mm256_add_epi16(G_lo, I_lo);
+ sum_GI_hi = _mm256_add_epi16(G_hi, I_hi);
+ const __m256i sub_BH_lo = _mm256_sub_epi16(B_lo, H_lo);
+ const __m256i sub_BH_hi = _mm256_sub_epi16(B_hi, H_hi);
+
+ CALC_X_GRADIENT(sub_AC_lo, sub_GI_lo, sub_DF_lo, gx_lo)
+ CALC_Y_GRADIENT(sum_AC_lo, sum_GI_lo, sub_BH_lo, gy_lo)
+
+ const __m256i ga_lo = _mm256_add_epi16(gx_lo, gy_lo);
+
+ CALC_X_GRADIENT(sub_AC_hi, sub_GI_hi, sub_DF_hi, gx_hi)
+ CALC_Y_GRADIENT(sum_AC_hi, sum_GI_hi, sub_BH_hi, gy_hi)
+
+ const __m256i ga_hi = _mm256_add_epi16(gx_hi, gy_hi);
+
+ __m256i cmp_lo = _mm256_cmpgt_epi16(edge_threshold, ga_lo);
+ __m256i cmp_hi = _mm256_cmpgt_epi16(edge_threshold, ga_hi);
+ const __m256i comp_reg = _mm256_add_epi16(cmp_lo, cmp_hi);
+
+ // v = 4*E -2*(D+F+B+H) + (A+C+G+I)
+ if (_mm256_movemask_epi8(comp_reg) != 0) {
+ const __m256i sum_BH_lo = _mm256_add_epi16(B_lo, H_lo);
+ const __m256i sum_BH_hi = _mm256_add_epi16(B_hi, H_hi);
+
+ // 2*(D+F+B+H)
+ const __m256i sum_DFBH_lo =
+ _mm256_slli_epi16(_mm256_add_epi16(sum_DF_lo, sum_BH_lo), 1);
+ // (A+C+G+I)
+ const __m256i sum_ACGI_lo = _mm256_add_epi16(sum_AC_lo, sum_GI_lo);
+ const __m256i sum_DFBH_hi =
+ _mm256_slli_epi16(_mm256_add_epi16(sum_DF_hi, sum_BH_hi), 1);
+ const __m256i sum_ACGI_hi = _mm256_add_epi16(sum_AC_hi, sum_GI_hi);
+
+ // Convert E register values from 8bit to 16bit
+ const __m256i E_lo = _mm256_unpacklo_epi8(E, zero);
+ const __m256i E_hi = _mm256_unpackhi_epi8(E, zero);
+
+ // 4*E - 2*(D+F+B+H)+ (A+C+G+I)
+ const __m256i var_lo_0 = _mm256_abs_epi16(_mm256_add_epi16(
+ _mm256_sub_epi16(_mm256_slli_epi16(E_lo, 2), sum_DFBH_lo),
+ sum_ACGI_lo));
+ const __m256i var_hi_0 = _mm256_abs_epi16(_mm256_add_epi16(
+ _mm256_sub_epi16(_mm256_slli_epi16(E_hi, 2), sum_DFBH_hi),
+ sum_ACGI_hi));
+ cmp_lo = _mm256_srli_epi16(cmp_lo, 15);
+ cmp_hi = _mm256_srli_epi16(cmp_hi, 15);
+ const __m256i var_lo = _mm256_mullo_epi16(var_lo_0, cmp_lo);
+ const __m256i var_hi = _mm256_mullo_epi16(var_hi_0, cmp_hi);
+
+ num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_lo);
+ num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_hi);
+
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpacklo_epi16(var_lo, zero));
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpackhi_epi16(var_lo, zero));
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpacklo_epi16(var_hi, zero));
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpackhi_epi16(var_hi, zero));
+ }
+ sub_AC_lo = sub_DF_lo;
+ sub_AC_hi = sub_DF_hi;
+ sub_DF_lo = sub_GI_lo;
+ sub_DF_hi = sub_GI_hi;
+ sum_AC_lo = sum_DF_lo;
+ sum_AC_hi = sum_DF_hi;
+ sum_DF_lo = sum_GI_lo;
+ sum_DF_hi = sum_GI_hi;
+ B = E;
+ E = H;
+ }
+ const __m256i num_0 = _mm256_unpacklo_epi16(num_accum_row_lvl, zero);
+ const __m256i num_1 = _mm256_unpackhi_epi16(num_accum_row_lvl, zero);
+ num_accumulator =
+ _mm256_add_epi32(num_accumulator, _mm256_add_epi32(num_0, num_1));
+ }
+
+ // Process the remaining width here.
+ for (int h = 1; h < height - 1; ++h) {
+ for (int w = w32 + 1; w < width - 1; ++w) {
+ const int k = h * stride + w;
+
+ // Compute sobel gradients
+ const int g_x = (src[k - stride - 1] - src[k - stride + 1]) +
+ (src[k + stride - 1] - src[k + stride + 1]) +
+ 2 * (src[k - 1] - src[k + 1]);
+ const int g_y = (src[k - stride - 1] - src[k + stride - 1]) +
+ (src[k - stride + 1] - src[k + stride + 1]) +
+ 2 * (src[k - stride] - src[k + stride]);
+ const int ga = abs(g_x) + abs(g_y);
+
+ if (ga < edge_thresh) {
+ // Find Laplacian
+ const int v =
+ 4 * src[k] -
+ 2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) +
+ (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] +
+ src[k + stride + 1]);
+ accum += abs(v);
+ ++count;
+ }
+ }
+ }
+
+ // s0 s1 n0 n1 s2 s3 n2 n3
+ __m256i sum_avx = _mm256_hadd_epi32(sum_accumulator, num_accumulator);
+ __m128i sum_avx_lo = _mm256_castsi256_si128(sum_avx);
+ __m128i sum_avx_hi = _mm256_extractf128_si256(sum_avx, 1);
+ // s0+s2 s1+s3 n0+n2 n1+n3
+ __m128i sum_avx_1 = _mm_add_epi32(sum_avx_lo, sum_avx_hi);
+ // s0+s2+s1+s3 n0+n2+n1+n3
+ __m128i result = _mm_add_epi32(_mm_srli_si128(sum_avx_1, 4), sum_avx_1);
+
+ accum += _mm_cvtsi128_si32(result);
+ count += _mm_extract_epi32(result, 2);
+
+ // If very few smooth pels, return -1 since the estimate is unreliable.
+ return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
const unsigned int stride2, const int block_width, const int block_height,
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 72e421e..0089023 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -308,6 +308,97 @@
Values(0, 1)));
#endif // HAVE_NEON
+typedef double (*EstimateNoiseFunc)(const uint8_t *src, int height, int width,
+ int stride, int edge_thresh);
+
+typedef std::tuple<EstimateNoiseFunc, EstimateNoiseFunc, int, int>
+ EstimateNoiseWithParam;
+
+class EstimateNoiseTest
+ : public ::testing::TestWithParam<EstimateNoiseWithParam> {
+ public:
+ virtual ~EstimateNoiseTest() {}
+ virtual void SetUp() {
+ ref_func = GET_PARAM(0);
+ tst_func = GET_PARAM(1);
+ width_ = GET_PARAM(2);
+ height_ = GET_PARAM(3);
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src1_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(8, sizeof(uint8_t) * width_ * height_));
+ GenRandomData(width_ * height_);
+ ASSERT_NE(src1_, nullptr);
+ }
+
+ virtual void TearDown() { aom_free(src1_); }
+
+ void RunTest(int run_times) {
+ stride_ = width_;
+
+ for (int i = 0; i < run_times; i++) {
+ double ref_out = ref_func(src1_, height_, width_, stride_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+ double tst_out = tst_func(src1_, height_, width_, stride_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+ EXPECT_EQ(ref_out, tst_out);
+ }
+ }
+
+ void SpeedTest(int run_times) {
+ stride_ = width_;
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; i++) {
+ ref_func(src1_, height_, width_, stride_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; i++) {
+ tst_func(src1_, height_, width_, stride_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ printf("(%3.2f)\n", time1 / time2);
+ }
+
+ void GenRandomData(int size) {
+ for (int ii = 0; ii < size; ii++) src1_[ii] = rnd_.Rand8();
+ }
+
+ protected:
+ EstimateNoiseFunc ref_func;
+ EstimateNoiseFunc tst_func;
+ ACMRandom rnd_;
+ uint8_t *src1_;
+ int width_;
+ int height_;
+ int stride_;
+};
+
+TEST_P(EstimateNoiseTest, RandomValues) { RunTest(1); }
+
+TEST_P(EstimateNoiseTest, DISABLED_Speed) { SpeedTest(2000); }
+
+#if HAVE_AVX2
+// Width and height for which av1_estimate_noise_from_single_plane() will be
+// tested.
+int width[] = { 3840, 1920, 1280, 800, 640, 360, 357 };
+int height[] = { 2160, 1080, 720, 600, 480, 240, 237 };
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, EstimateNoiseTest,
+ ::testing::Combine(
+ ::testing::Values(av1_estimate_noise_from_single_plane_c),
+ ::testing::Values(av1_estimate_noise_from_single_plane_avx2),
+ ::testing::ValuesIn(width), ::testing::ValuesIn(height)));
+#endif // HAVE_AVX2
+
#if CONFIG_AV1_HIGHBITDEPTH
typedef void (*HBDTemporalFilterFunc)(