Add high bit-depth SSE2 temporal filter. Update the cmake file so that the SSE2 high bit-depth code is removed from low bit-depth (LBD) builds. 5.4x performance compared to the C code. Change-Id: I72f29aa6ca5916c9b4774c2ec1abe0a877b8cadf
diff --git a/av1/av1.cmake b/av1/av1.cmake index 66e3798..ed4265f 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake
@@ -341,12 +341,14 @@ "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c" "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c" "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c") if(NOT CONFIG_AV1_HIGHBITDEPTH) list( REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE2 - "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c") + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c") endif() list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c")
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 24171de..fc2720e 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -354,6 +354,10 @@ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count"; specialize qw/av1_apply_temporal_filter sse2 avx2/; + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count"; + specialize qw/av1_highbd_apply_temporal_filter sse2/; + } } add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c index 112cd32..d98439c 100644 --- a/av1/encoder/temporal_filter.c +++ b/av1/encoder/temporal_filter.c
@@ -626,7 +626,20 @@ aom_free(square_diff); } - +#if CONFIG_AV1_HIGHBITDEPTH +// Calls High bit-depth temporal filter +void av1_highbd_apply_temporal_filter_c( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + const uint8_t *pred, uint32_t *accum, uint16_t *count) { + av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col, + num_planes, noise_levels, subblock_mvs, + subblock_mses, q_factor, filter_strength, pred, + accum, count); +} +#endif // CONFIG_AV1_HIGHBITDEPTH /*!\brief Normalizes the accumulated filtering result to produce the filtered * frame * @@ -774,7 +787,6 @@ break; } } - // Setup. mbd->block_ref_scale_factors[0] = scale; mbd->block_ref_scale_factors[1] = scale; @@ -835,17 +847,36 @@ // only supports 32x32 block size, 5x5 filtering window, 8-bit // encoding, and the case when the video is not with `YUV 4:2:2` // format. 
- if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5 && - !is_frame_high_bitdepth(frame_to_filter) && !is_yuv422_format) { - av1_apply_temporal_filter(frame_to_filter, mbd, block_size, mb_row, - mb_col, num_planes, noise_levels, - subblock_mvs, subblock_mses, q_factor, - filter_strength, pred, accum, count); - } else { - av1_apply_temporal_filter_c( - frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, - noise_levels, subblock_mvs, subblock_mses, q_factor, - filter_strength, pred, accum, count); + if (is_frame_high_bitdepth(frame_to_filter)) { // for high bit-depth +#if CONFIG_AV1_HIGHBITDEPTH + if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5 && + !is_yuv422_format) { + av1_highbd_apply_temporal_filter( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, pred, accum, count); + } else { +#endif // CONFIG_AV1_HIGHBITDEPTH + av1_apply_temporal_filter_c( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, pred, accum, count); +#if CONFIG_AV1_HIGHBITDEPTH + } +#endif // CONFIG_AV1_HIGHBITDEPTH + } else { // for 8-bit + if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5 && + !is_yuv422_format) { + av1_apply_temporal_filter( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, pred, accum, count); + } else { + av1_apply_temporal_filter_c( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, pred, accum, count); + } } } }
diff --git a/av1/encoder/x86/highbd_temporal_filter_sse2.c b/av1/encoder/x86/highbd_temporal_filter_sse2.c new file mode 100644 index 0000000..8e6e43d --- /dev/null +++ b/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding for 4 samples
+#define SSE_STRIDE (BW + 4)
+
+// Lane masks for xx_mask_and_hadd(). Viewing a pair of vectors as 8 32-bit
+// lanes, entry [i] keeps lanes i..i+4: the 5 horizontally adjacent values that
+// make up the window of the i-th output column in a group of 4 columns.
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+  { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+  { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+  { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+  { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
+};
+
+// Computes the squared difference of every sample pair of two high bit-depth
+// blocks. `frame1` and `frame2` are converted with CONVERT_TO_SHORTPTR() and
+// read as 16-bit samples, 8 per iteration. The 32-bit results are written to
+// `frame_sse` starting 2 entries into each row; the 2 leading entries of a
+// row are not written here.
+static void get_squared_error(const uint8_t *frame1, const unsigned int stride,
+                              const uint8_t *frame2, const unsigned int stride2,
+                              const int block_width, const int block_height,
+                              uint32_t *frame_sse,
+                              const unsigned int dst_stride) {
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(frame1);
+  const uint16_t *src2 = CONVERT_TO_SHORTPTR(frame2);
+  uint32_t *dst = frame_sse;
+
+  for (int i = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j += 8) {
+      __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
+      __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j));
+
+      // Signed 16-bit difference, then the low and high halves of its square.
+      __m128i vdiff = _mm_sub_epi16(vsrc1, vsrc2);
+      __m128i vmullo = _mm_mullo_epi16(vdiff, vdiff);
+      __m128i vmullh = _mm_mulhi_epi16(vdiff, vdiff);
+
+      // Interleave the halves to form the eight 32-bit squared errors.
+      __m128i vres1 = _mm_unpacklo_epi16(vmullo, vmullh);
+      __m128i vres2 = _mm_unpackhi_epi16(vmullo, vmullh);
+
+      _mm_storeu_si128((__m128i *)(dst + j + 2), vres1);
+      _mm_storeu_si128((__m128i *)(dst + j + 6), vres2);
+    }
+
+    src1 += stride;
+    src2 += stride2;
+    dst += dst_stride;
+  }
+}
+
+// Loads the 8 entries at `src` into dstvec[0] and dstvec[1]. For the first
+// group of columns (col == 0) the two leading entries are replaced by copies
+// of the third one; for the last group (col >= block_width - 4) the two
+// trailing entries are replaced by copies of the sixth one.
+static void xx_load_and_pad(uint32_t *src, __m128i *dstvec, int col,
+                            int block_width) {
+  __m128i vtmp1 = _mm_loadu_si128((__m128i *)src);
+  __m128i vtmp2 = _mm_loadu_si128((__m128i *)(src + 4));
+  // For the first column, replicate the first element twice to the left
+  dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA);
+  // For the last column, replicate the last element twice to the right
+  dstvec[1] = (col < block_width - 4) ? vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54);
+}
+
+// Returns the sum of the 5 lanes that sse_bytemask_2x4[i] selects out of the 8
+// 32-bit lanes of (vsum1, vsum2).
+static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
+  __m128i veca, vecb;
+  // Mask and obtain the required 5 values inside the vector
+  veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]);
+  vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]);
+  // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+  veca = _mm_add_epi32(veca, vecb);
+  // B = [A2+B2, A3+B3, 0, 0]
+  vecb = _mm_srli_si128(veca, 8);
+  // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+  veca = _mm_add_epi32(veca, vecb);
+  // B = [A1+B1+A3+B3, 0, 0, 0]
+  vecb = _mm_srli_si128(veca, 4);
+  // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+  veca = _mm_add_epi32(veca, vecb);
+  return _mm_cvtsi128_si32(veca);
+}
+
+// Applies the temporal filter to one plane of one block. `frame1` is the
+// block in the frame to filter and `frame2` the prediction; both are high
+// bit-depth buffers. For every sample the squared error is summed over a 5x5
+// window (plus the co-located luma errors for chroma planes), combined with
+// the sub-block MSE and motion vector length and turned into a weight, which
+// is added to `count` while weight * prediction sample is added to
+// `accumulator`. `luma_sq_error` is written for the luma plane and read back
+// for the chroma planes; `chroma_sq_error` is scratch space for chroma planes.
+static void highbd_apply_temporal_filter(
+    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    const int min_frame_size, const double sigma, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    unsigned int *accumulator, uint16_t *count, uint32_t *luma_sq_error,
+    uint32_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift,
+    int bd) {
+  assert(((block_width == 32) && (block_height == 32)) ||
+         ((block_width == 16) && (block_height == 16)));
+  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
+
+  uint32_t acc_5x5_sse[BH][BW];
+  uint32_t *frame_sse =
+      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
+
+  get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
+                    frame_sse, SSE_STRIDE);
+
+  __m128i vsrc[5][2];
+
+  const double n_decay = 0.5 + log(2 * sigma + 5.0);
+  const double q_decay =
+      CLIP(pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2), 1e-5, 1);
+  const double s_decay =
+      CLIP(pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2), 1e-5, 1);
+
+  // Traverse 4 columns at a time
+  // First and last columns will require padding
+  for (int col = 0; col < block_width; col += 4) {
+    uint32_t *src = frame_sse + col;
+
+    // Load and pad(for first and last col) 3 rows from the top
+    for (int i = 2; i < 5; i++) {
+      xx_load_and_pad(src, vsrc[i], col, block_width);
+      src += SSE_STRIDE;
+    }
+
+    // Padding for top 2 rows
+    vsrc[0][0] = vsrc[2][0];
+    vsrc[0][1] = vsrc[2][1];
+    vsrc[1][0] = vsrc[2][0];
+    vsrc[1][1] = vsrc[2][1];
+
+    for (int row = 0; row < block_height; row++) {
+      // Column-wise sum of the 5 rows currently held in the window.
+      __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+      __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+      __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+      __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+      __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+      __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+      __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+      __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+      // Slide the window down by one row.
+      vsrc[0][0] = vsrc[1][0];
+      vsrc[0][1] = vsrc[1][1];
+      vsrc[1][0] = vsrc[2][0];
+      vsrc[1][1] = vsrc[2][1];
+      vsrc[2][0] = vsrc[3][0];
+      vsrc[2][1] = vsrc[3][1];
+      vsrc[3][0] = vsrc[4][0];
+      vsrc[3][1] = vsrc[4][1];
+
+      // Row `row + 3` enters the window next. Once the last row of the block
+      // has been loaded, vsrc[4] is left untouched so that the last row is
+      // replicated as padding for the bottom 2 rows.
+      if (row < block_height - 3) {
+        xx_load_and_pad(src, vsrc[4], col, block_width);
+        src += SSE_STRIDE;
+      }
+
+      acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+      acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+      acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+      acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+    }
+  }
+
+  uint16_t *frame2s = CONVERT_TO_SHORTPTR(frame2);
+
+  for (int i = 0, k = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2s[i * stride2 + j];
+
+      int diff_sse = acc_5x5_sse[i][j];
+      int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH;
+
+      // Filter U-plane and V-plane using Y-plane. This is because motion
+      // search is only done on Y-plane, so the information from Y-plane will
+      // be more accurate.
+      if (plane != PLANE_TYPE_Y) {
+        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+            const int yy = (i << ss_y_shift) + ii;      // Y-coord on Y-plane.
+            const int xx = (j << ss_x_shift) + jj + 2;  // X-coord on Y-plane.
+            const int ww = SSE_STRIDE;                  // Stride of Y-plane.
+            diff_sse += luma_sq_error[yy * ww + xx];
+            ++num_ref_pixels;
+          }
+        }
+      }
+
+      // Scale down the difference for high bit depth input.
+      // NOTE(review): the shift is (bd - 8)^2, i.e. 4 for 10-bit but 16 for
+      // 12-bit, whereas a squared error grows by 2 * (bd - 8) bits. Confirm
+      // against the C reference before changing it.
+      diff_sse >>= (bd - 8) * (bd - 8);
+
+      const double window_error = (double)(diff_sse) / num_ref_pixels;
+      const int subblock_idx =
+          (i >= block_height / 2) * 2 + (j >= block_width / 2);
+      const double block_error = (double)subblock_mses[subblock_idx];
+      const double combined_error =
+          (TF_WINDOW_BLOCK_BALANCE_WEIGHT * window_error + block_error) /
+          (TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) / TF_SEARCH_ERROR_NORM_WEIGHT;
+
+      const MV mv = subblock_mvs[subblock_idx];
+      const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+      const double distance_threshold =
+          (double)AOMMAX(min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD, 1);
+      const double d_factor = AOMMAX(distance / distance_threshold, 1);
+
+      const double scaled_error =
+          AOMMIN(combined_error * d_factor / n_decay / q_decay / s_decay, 7);
+      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+  }
+}
+
+// SSE2 implementation of av1_highbd_apply_temporal_filter(). Runs
+// highbd_apply_temporal_filter() on each of the `num_planes` planes of the
+// block at (mb_row, mb_col), reading the frame to filter from
+// `frame_to_filter` and the prediction from `pred`, and adding the results to
+// `accum` and `count`. Only BLOCK_32X32 with a 5x5 window is supported.
+void av1_highbd_apply_temporal_filter_sse2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(is_high_bitdepth && "Only support high bit-depth frames here!");
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Squared error scratch buffers. Both live on the stack so that no
+  // allocation, which could fail, is needed for every filtered block.
+  uint32_t luma_sq_error[SSE_STRIDE * BH];
+  uint32_t chroma_sq_error[SSE_STRIDE * BH];
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+
+    highbd_apply_temporal_filter(
+        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
+        min_frame_size, noise_levels[plane], subblock_mvs, subblock_mses,
+        q_factor, filter_strength, accum + mb_pels * plane,
+        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
+        ss_x_shift, ss_y_shift, mbd->bd);
+  }
+}
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc index 503f715..1badff1 100644 --- a/test/temporal_filter_test.cc +++ b/test/temporal_filter_test.cc
@@ -234,6 +234,222 @@ Combine(ValuesIn(temporal_filter_test_sse2), Range(64, 65, 4))); #endif // HAVE_SSE2 +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*HBDTemporalFilterFunc)( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_level, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strenght, + const uint8_t *pred, uint32_t *accum, uint16_t *count); +typedef libaom_test::FuncParam<HBDTemporalFilterFunc> + HBDTemporalFilterFuncParam; + +typedef std::tuple<HBDTemporalFilterFuncParam, int> HBDTemporalFilterWithParam; + +class HBDTemporalFilterTest + : public ::testing::TestWithParam<HBDTemporalFilterWithParam> { + public: + virtual ~HBDTemporalFilterTest() {} + virtual void SetUp() { + params_ = GET_PARAM(0); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src1_ = reinterpret_cast<uint16_t *>(aom_memalign(16, 256 * 256)); + src2_ = reinterpret_cast<uint16_t *>(aom_memalign(16, 256 * 256)); + + ASSERT_TRUE(src1_ != NULL); + ASSERT_TRUE(src2_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src1_); + aom_free(src2_); + } + void RunTest(int isRandom, int width, int height, int run_times, int bd); + + void GenRandomData(int width, int height, int stride, int stride2, int bd) { + if (bd == 10) { + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src1_[ii * stride + jj] = rnd_.Rand16() & 0x3FF; + src2_[ii * stride2 + jj] = rnd_.Rand16() & 0x3FF; + } + } + } else { + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src1_[ii * stride + jj] = rnd_.Rand16() & 0xFFF; + src2_[ii * stride2 + jj] = rnd_.Rand16() & 0xFFF; + } + } + } + } + + void GenExtremeData(int width, int height, int stride, uint16_t *data, + int stride2, uint16_t *data2, uint16_t val, int bd) { + if (bd == 10) { + for (int ii = 0; ii < height; ii++) { + 
for (int jj = 0; jj < width; jj++) { + data[ii * stride + jj] = val; + data2[ii * stride2 + jj] = (1023 - val); + } + } + } else { + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + data[ii * stride + jj] = val; + data2[ii * stride2 + jj] = (4095 - val); + } + } + } + } + + protected: + HBDTemporalFilterFuncParam params_; + uint16_t *src1_; + uint16_t *src2_; + ACMRandom rnd_; +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HBDTemporalFilterTest); + +void HBDTemporalFilterTest::RunTest(int isRandom, int width, int height, + int run_times, int BD) { + aom_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + const int stride = width; + const int stride2 = width; + if (isRandom) { + GenRandomData(width, height, stride, stride2, BD); + } else { + const int msb = BD; + const uint16_t limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src1_, stride2, src2_, limit, BD); + } else { + GenExtremeData(width, height, stride, src1_, stride2, src2_, 0, BD); + } + } + double sigma[1] = { 2.1002103677063437 }; + DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]); + DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]); + memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0])); + memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0])); + DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]); + DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]); + memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0])); + memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0])); + + assert(width == 32 && height == 32); + const BLOCK_SIZE block_size = BLOCK_32X32; + const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } }; + const int subblock_mses[4] = { 15, 16, 17, 18 }; + const int q_factor = 12; + const int filter_strength = 5; + const int mb_row = 0; + const int mb_col = 0; + const int num_planes = 1; + YV12_BUFFER_CONFIG *ref_frame = + (YV12_BUFFER_CONFIG 
*)malloc(sizeof(YV12_BUFFER_CONFIG)); + ref_frame->y_crop_height = 360; + ref_frame->y_crop_width = 540; + ref_frame->heights[0] = height; + ref_frame->strides[0] = stride; + DECLARE_ALIGNED(16, uint16_t, src[1024 * 3]); + ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src); + ref_frame->buffers[0] = ref_frame->buffer_alloc; + ref_frame->flags = YV12_FLAG_HIGHBITDEPTH; // Only Hihgbd bit-depth test. + memcpy(src, src1_, 1024 * 3 * sizeof(uint16_t)); + + MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD)); + mbd->plane[0].subsampling_y = 0; + mbd->plane[0].subsampling_x = 0; + mbd->bd = BD; + + params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, subblock_mvs, subblock_mses, q_factor, + filter_strength, CONVERT_TO_BYTEPTR(src2_), + accumulator_ref, count_ref); + params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, subblock_mvs, subblock_mses, q_factor, + filter_strength, CONVERT_TO_BYTEPTR(src2_), + accumulator_mod, count_mod); + + if (run_times > 1) { + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, subblock_mvs, subblock_mses, q_factor, + filter_strength, CONVERT_TO_BYTEPTR(src2_), + accumulator_ref, count_ref); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, subblock_mvs, subblock_mses, q_factor, + filter_strength, CONVERT_TO_BYTEPTR(src2_), + accumulator_mod, count_mod); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%f\t width=%d\t height=%d \n", + elapsed_time_c, elapsed_time_simd, + (float)((float)elapsed_time_c / 
(float)elapsed_time_simd), width, + height); + + } else { + for (int i = 0, l = 0; i < height; i++) { + for (int j = 0; j < width; j++, l++) { + EXPECT_EQ(accumulator_ref[l], accumulator_mod[l]) + << "Error:" << k << " SSE Sum Test [" << width << "x" << height + << "] C accumulator does not match optimized accumulator."; + EXPECT_EQ(count_ref[l], count_mod[l]) + << "Error:" << k << " SSE Sum Test [" << width << "x" << height + << "] C count does not match optimized count."; + } + } + } + + free(ref_frame); + free(mbd); + } +} + +TEST_P(HBDTemporalFilterTest, OperationCheck) { + for (int height = 32; height <= 32; height = height * 2) { + RunTest(1, height, height, 1, 10); // GenRandomData + } +} + +TEST_P(HBDTemporalFilterTest, ExtremeValues) { + for (int height = 32; height <= 32; height = height * 2) { + RunTest(0, height, height, 1, 10); + } +} + +TEST_P(HBDTemporalFilterTest, DISABLED_Speed) { + for (int height = 32; height <= 32; height = height * 2) { + RunTest(1, height, height, 100000, 10); + } +} +#if HAVE_SSE2 +HBDTemporalFilterFuncParam HBDtemporal_filter_test_sse2[] = { + HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c, + &av1_highbd_apply_temporal_filter_sse2) +}; +INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest, + Combine(ValuesIn(HBDtemporal_filter_test_sse2), + Range(64, 65, 4))); +#endif // HAVE_SSE2 +#endif // CONFIG_AV1_HIGHBITDEPTH } // namespace #endif