Add compute_stats SSE4_1 and AVX2 code
1. Add compute_stats unit test
2. Add compute_stats SSE4_1 code
3. Add compute_stats AVX2 code
Encoder speedup is about 0.9% with no RD change.
test sequence: BasketballDrill_832x480_50.y4m
test command line: ./aomenc --cpu-used=1 --psnr -D \
  -q --end-usage=vbr --target-bitrate=800 --limit=20 \
  BasketballDrill_832x480_50.y4m -o test.webm
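
compute_stats builds the normal-equation statistics for the Wiener filter
search (a plain-text sketch of the definitions in compute_stats_c):
  M[k]    = sum((dgd_k - avg) * (src - avg))
  H[k][l] = sum((dgd_k - avg) * (dgd_l - avg))
summed over the restoration window, where dgd_k is the degraded frame
sampled at window offset k and avg is the window mean of dgd; H is
symmetric, so only its upper triangle needs to be computed directly.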
Change-Id: Ic0799997c1075a139869b45ba84af00c5475964a
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 4c4f542..e3e100c 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -268,7 +268,8 @@
"${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
- "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c")
+ "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
@@ -276,7 +277,8 @@
"${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
- "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c")
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index fa8b349..c1eb1d0 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -76,12 +76,12 @@
specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
+
# directional intra predictor functions
add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
-
# FILTER_INTRA predictor functions
add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
specialize qw/av1_filter_intra_predictor sse4_1/;
@@ -251,6 +251,8 @@
add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
specialize qw/av1_get_crc32c_value sse4_2/;
+ add_proto qw/void compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, double *M, double *H";
+ specialize qw/compute_stats sse4_1 avx2/;
}
# end encoder functions
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index aec37d8..b97f514 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -120,6 +120,7 @@
// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
#define WIENER_FILT_PREC_BITS 7
#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index d1ac8de..3b4831b 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -15,6 +15,7 @@
#include <math.h>
#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/binary_codes_writer.h"
@@ -22,7 +23,6 @@
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"
-
#include "av1/common/onyxc_int.h"
#include "av1/common/quant_common.h"
#include "av1/common/restoration.h"
@@ -588,22 +588,9 @@
if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
}
-static double find_average(const uint8_t *src, int h_start, int h_end,
- int v_start, int v_end, int stride) {
- uint64_t sum = 0;
- double avg = 0;
- int i, j;
- aom_clear_system_state();
- for (i = v_start; i < v_end; i++)
- for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
- avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
- return avg;
-}
-
-static void compute_stats(int wiener_win, const uint8_t *dgd,
- const uint8_t *src, int h_start, int h_end,
- int v_start, int v_end, int dgd_stride,
- int src_stride, double *M, double *H) {
+void compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M, double *H) {
int i, j, k, l;
double Y[WIENER_WIN2];
const int wiener_win2 = wiener_win * wiener_win;
@@ -626,8 +613,7 @@
assert(idx == wiener_win2);
for (k = 0; k < wiener_win2; ++k) {
M[k] += Y[k] * X;
- H[k * wiener_win2 + k] += Y[k] * Y[k];
- for (l = k + 1; l < wiener_win2; ++l) {
+ for (l = k; l < wiener_win2; ++l) {
// H is a symmetric matrix, so we only need to fill out the upper
// triangle here. We can copy it down to the lower triangle outside
// the (i, j) loops.
diff --git a/av1/encoder/pickrst.h b/av1/encoder/pickrst.h
index 179b89f..e92f8c6 100644
--- a/av1/encoder/pickrst.h
+++ b/av1/encoder/pickrst.h
@@ -16,10 +16,27 @@
#endif
#include "av1/encoder/encoder.h"
+#include "aom_ports/system_state.h"
struct yv12_buffer_config;
struct AV1_COMP;
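+
+// Shuffle mask expanding bytes 0..8 of a 16-byte load into the overlapping
+// pairs (0,1), (1,2), ..., (7,8). After widening to 16 bits, one madd
+// against a broadcast (D1, D2) pixel pair updates 8 running sums of H.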
+static const uint8_t g_shuffle_stats_data[16] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+};
+
+static INLINE double find_average(const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int stride) {
+ uint64_t sum = 0;
+ double avg = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = v_start; i < v_end; i++)
+ for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
+ avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
+ return avg;
+}
+
void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
#ifdef __cplusplus
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
new file mode 100644
index 0000000..cd970c0
--- /dev/null
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m256i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
+ const __m256i dst0 = yy_loadu_256(dst);
+ const __m256i r0 = _mm256_add_epi32(dst0, d0);
+ yy_storeu_256(dst, r0);
+}
+
+static INLINE void acc_stat_win7_one_line_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint8_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m256i kl =
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
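+// M and H are accumulated in 32-bit lanes and flushed to the 64-bit
+// arrays every 64 rows: each column step adds at most 2 * 255 * 255 per
+// lane, so with blocks up to 384 pixels wide a 64-row chunk stays safely
+// below INT32_MAX.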
+static INLINE void compute_stats_win7_opt_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win7_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
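+ // Fold the mean back in using
+ //   sum((a - avg) * (b - avg))
+ //     = sum(a * b) - avg * (sum(a) + sum(b)) + count * avg * avg,
+ // so the hot loops above accumulate raw integer products only. Each
+ // H_int64 row stores the 8-lane groups, hence the [n * 8 + m] transpose
+ // on readout below.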
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_win5_one_line_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint8_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m256i kl =
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win5_opt_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win5_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+void compute_stats_avx2(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else {
+ compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 0000000..383aeef
--- /dev/null
+++ b/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
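+// SSE4.1 counterpart of the AVX2 accumulator: the 8 widened byte pairs no
+// longer fit in one 128-bit register, so the low and high halves are
+// multiply-accumulated separately with two _mm_madd_epi16 calls.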
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m128i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
+ const __m128i d1 =
+ _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
+ const __m128i dst0 = xx_loadu_128(dst);
+ const __m128i dst1 = xx_loadu_128(dst + 4);
+ const __m128i r0 = _mm_add_epi32(dst0, d0);
+ const __m128i r1 = _mm_add_epi32(dst1, d1);
+ xx_storeu_128(dst, r0);
+ xx_storeu_128(dst + 4, r1);
+}
+
+static INLINE void acc_stat_win7_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ const int wiener_win = 7;
+ int j, k, l;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
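+// As in the AVX2 path, 32-bit sums are flushed into the 64-bit arrays
+// every 64 rows to keep the per-lane accumulation below INT32_MAX.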
+static INLINE void compute_stats_win7_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win7_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_win5_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ const int wiener_win = WIENER_WIN_CHROMA;
+ int j, k, l;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win5_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win5_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+void compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else {
+ compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
diff --git a/test/test.cmake b/test/test.cmake
index 7b58488..322e42b 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -71,6 +71,7 @@
"${AOM_ROOT}/test/qm_test.cc"
"${AOM_ROOT}/test/resize_test.cc"
"${AOM_ROOT}/test/scalability_test.cc"
+ "${AOM_ROOT}/test/wiener_test.cc"
"${AOM_ROOT}/test/y4m_test.cc"
"${AOM_ROOT}/test/y4m_video_source.h"
"${AOM_ROOT}/test/yuv_video_source.h")
diff --git a/test/wiener_test.cc b/test/wiener_test.cc
new file mode 100644
index 0000000..139271f
--- /dev/null
+++ b/test/wiener_test.cc
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/pickrst.h"
+
+#define MAX_WIENER_BLOCK 384
+#define MAX_DATA_BLOCK (MAX_WIENER_BLOCK + WIENER_WIN)
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
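+// Scalar models of the SIMD accumulation order: integer products are
+// summed first, the mean is folded in afterwards, and H uses the same
+// stride-8 intermediate layout as the intrinsics. Running these as the
+// "C" instantiation validates the restructured math against
+// compute_stats_c.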
+static void compute_stats_win5_opt_c(const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride,
+ double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t H_int[WIENER_WIN_CHROMA * WIENER_WIN_CHROMA][WIENER_WIN_CHROMA * 8] =
+ { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[i * src_stride + j];
+ const uint8_t X2 = src[i * src_stride + j + 1];
+ sumX += X1 + X2;
+
+ const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+ int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijkl[0];
+ const uint8_t D2 = dgd_ijkl[1];
+ sumY[k][l] += D1 + D2;
+ M_int[l][k] += D1 * X1 + D2 * X2;
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
+ D2 * dgd_ij[n + dgd_stride * m + 1];
+ }
+ }
+ }
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ M[l * wiener_win + k] =
+ M_int[l][k] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
+ H_int[(l * wiener_win + k)][n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+static void compute_stats_win7_opt_c(const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride,
+ double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t H_int[WIENER_WIN * WIENER_WIN][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[i * src_stride + j];
+ const uint8_t X2 = src[i * src_stride + j + 1];
+ sumX += X1 + X2;
+
+ const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+ int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijkl[0];
+ const uint8_t D2 = dgd_ijkl[1];
+ sumY[k][l] += D1 + D2;
+ M_int[l][k] += D1 * X1 + D2 * X2;
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
+ D2 * dgd_ij[n + dgd_stride * m + 1];
+ }
+ }
+ }
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ M[l * wiener_win + k] =
+ M_int[l][k] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
+ H_int[(l * wiener_win + k)][n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
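+// Dispatcher mirroring compute_stats_{sse4_1,avx2}: 7x7 (luma) and 5x5
+// (chroma) windows take the restructured paths; any other size falls back
+// to compute_stats_c.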
+void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_c(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_c(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else {
+ compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
+
+static const int kIterations = 100;
+static const double min_error = 0.01;
+typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H);
+
+typedef libaom_test::FuncParam<compute_stats_Func> TestFuncs;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+typedef ::testing::tuple<const compute_stats_Func> WienerTestParam;
+
+class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
+ public:
+ virtual void SetUp() { target_func_ = GET_PARAM(0); }
+ void runWienerTest(const int32_t wiener_win, int32_t run_times);
+ void runWienerTest_ExtremeValues(const int32_t wiener_win);
+
+ private:
+ compute_stats_Func target_func_;
+ libaom_test::ACMRandom rng_;
+};
+
+void WienerTest::runWienerTest(const int32_t wiener_win, int32_t run_times) {
+ const int32_t wiener_halfwin = wiener_win >> 1;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+ DECLARE_ALIGNED(32, uint8_t, dgd_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, uint8_t, src_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, double, M_ref[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, M_test[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_test[WIENER_WIN2 * WIENER_WIN2]);
+ const int h_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+ int h_end =
+ run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+ const int v_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+ int v_end =
+ run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+ const int dgd_stride = h_end;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int iters = run_times == 1 ? kIterations : 2;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_buf[i] = rng_.Rand8();
+ src_buf[i] = rng_.Rand8();
+ }
+ uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+ uint8_t *src = src_buf;
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_ref, H_ref);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_test, H_test);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 10) {
+ printf("win %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, h_end, v_end, time1,
+ time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ int failed = 0;
+ for (int i = 0; i < wiener_win2; ++i) {
+ if (fabs(M_ref[i] - M_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d M iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, M_ref[i], M_test[i]);
+ break;
+ }
+ }
+ for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+ if (fabs(H_ref[i] - H_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d H iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, H_ref[i], H_test[i]);
+ break;
+ }
+ }
+ ASSERT_EQ(failed, 0);
+ }
+}
+
+void WienerTest::runWienerTest_ExtremeValues(const int32_t wiener_win) {
+ const int32_t wiener_halfwin = wiener_win >> 1;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+ DECLARE_ALIGNED(32, uint8_t, dgd_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, uint8_t, src_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, double, M_ref[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, M_test[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_test[WIENER_WIN2 * WIENER_WIN2]);
+ const int h_start = 16;
+ const int h_end = MAX_WIENER_BLOCK;
+ const int v_start = 16;
+ const int v_end = MAX_WIENER_BLOCK;
+ const int dgd_stride = h_end;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int iters = 1;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_buf[i] = 255;
+ src_buf[i] = 255;
+ }
+ uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+ uint8_t *src = src_buf;
+
+ compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_ref, H_ref);
+
+ target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_test, H_test);
+
+ int failed = 0;
+ for (int i = 0; i < wiener_win2; ++i) {
+ if (fabs(M_ref[i] - M_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d M iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, M_ref[i], M_test[i]);
+ break;
+ }
+ }
+ for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+ if (fabs(H_ref[i] - H_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d H iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, H_ref[i], H_test[i]);
+ break;
+ }
+ }
+ ASSERT_EQ(failed, 0);
+ }
+}
+
+TEST_P(WienerTest, RandomValues) {
+ runWienerTest(WIENER_WIN, 1);
+ runWienerTest(WIENER_WIN_CHROMA, 1);
+}
+
+TEST_P(WienerTest, ExtremeValues) {
+ runWienerTest_ExtremeValues(WIENER_WIN);
+ runWienerTest_ExtremeValues(WIENER_WIN_CHROMA);
+}
+
+TEST_P(WienerTest, DISABLED_Speed) {
+ runWienerTest(WIENER_WIN, 200);
+ runWienerTest(WIENER_WIN_CHROMA, 200);
+}
+
+INSTANTIATE_TEST_CASE_P(C, WienerTest, ::testing::Values(compute_stats_opt_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, WienerTest,
+ ::testing::Values(compute_stats_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, WienerTest,
+ ::testing::Values(compute_stats_avx2));
+#endif // HAVE_AVX2
+
+} // namespace