|  | /* | 
|  | * Copyright (c) 2025, Alliance for Open Media. All rights reserved | 
|  | * | 
|  | * This source code is subject to the terms of the BSD 3-Clause Clear License | 
|  | * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear | 
|  | * License was not distributed with this source code in the LICENSE file, you | 
|  | * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the | 
|  | * Alliance for Open Media Patent License 1.0 was not distributed with this | 
|  | * source code in the PATENTS file, you can obtain it at | 
|  | * aomedia.org/license/patent-license/. | 
|  | */ | 
|  |  | 
|  | #include <math.h> | 
|  | #include <stdlib.h> | 
|  | #include <string.h> | 
|  |  | 
|  | #include "third_party/googletest/src/googletest/include/gtest/gtest.h" | 
|  | #include "test/register_state_check.h" | 
|  | #include "test/function_equivalence_test.h" | 
|  |  | 
|  | #include "config/aom_config.h" | 
|  | #include "config/aom_dsp_rtcd.h" | 
|  | #include "config/av1_rtcd.h" | 
|  |  | 
|  | #include "aom/aom_integer.h" | 
|  | #include "aom_ports/aom_timer.h" | 
|  | #include "av1/common/enums.h" | 
|  | #include "av1/common/intra_dip.h" | 
|  | #include "av1/common/intra_matrix.h" | 
|  |  | 
|  | using libaom_test::FunctionEquivalenceTest; | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | template <typename F, typename T> | 
|  | class IntraMatrixTest : public FunctionEquivalenceTest<F> { | 
|  | protected: | 
|  | static const int kIterations = 1000000; | 
|  | static const int kBufSize = 8 * 8; | 
|  |  | 
|  | virtual ~IntraMatrixTest() {} | 
|  |  | 
|  | virtual void Execute(T *dip_tst) = 0; | 
|  |  | 
|  | void Common() { | 
|  | dip_ref_ = &dip_ref_data_[0]; | 
|  | dip_tst_ = &dip_tst_data_[0]; | 
|  |  | 
|  | Execute(dip_tst_); | 
|  |  | 
|  | for (int r = 0; r < kBufSize; ++r) { | 
|  | ASSERT_EQ(dip_ref_[r], dip_tst_[r]); | 
|  | } | 
|  | } | 
|  |  | 
|  | T dip_arr_[DIP_ROWS * DIP_COLS]; | 
|  | T dip_feat_[DIP_COLS]; | 
|  |  | 
|  | T dip_ref_data_[kBufSize]; | 
|  | T dip_tst_data_[kBufSize]; | 
|  |  | 
|  | T *dip_ref_; | 
|  | T *dip_tst_; | 
|  | }; | 
|  |  | 
|  | ////////////////////////////////////////////////////////////////////////////// | 
|  | // High bit-depth version | 
|  | ////////////////////////////////////////////////////////////////////////////// | 
|  |  | 
|  | typedef void (*IMHB)(const uint16_t *A, const uint16_t *B, uint16_t *C, int bd); | 
|  | typedef libaom_test::FuncParam<IMHB> IntraMatrixTestFuncsHBD; | 
|  |  | 
|  | class IntraMatrixTestHB : public IntraMatrixTest<IMHB, uint16_t> { | 
|  | protected: | 
|  | void Execute(uint16_t *dip_tst) { | 
|  | params_.ref_func(dip_arr_, dip_feat_, dip_ref_, bit_depth_); | 
|  | ASM_REGISTER_STATE_CHECK( | 
|  | params_.tst_func(dip_arr_, dip_feat_, dip_tst, bit_depth_)); | 
|  | } | 
|  | int bit_depth_; | 
|  | }; | 
|  |  | 
|  | TEST_P(IntraMatrixTestHB, RandomValues) { | 
|  | for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { | 
|  | switch (rng_(3)) { | 
|  | case 0: bit_depth_ = 8; break; | 
|  | case 1: bit_depth_ = 10; break; | 
|  | default: bit_depth_ = 12; break; | 
|  | } | 
|  | const int hi = 1 << bit_depth_; | 
|  |  | 
|  | for (int i = 0; i < 16; ++i) { | 
|  | dip_feat_[i] = rng_(hi); | 
|  | } | 
|  | int mode = iter % INTRA_DIP_MODE_CNT; | 
|  | for (int r = 0; r < DIP_ROWS; ++r) { | 
|  | for (int c = 0; c < DIP_FEATURES; ++c) { | 
|  | dip_arr_[r * DIP_COLS + c] = av1_intra_matrix_weights[mode][r][c]; | 
|  | } | 
|  | } | 
|  |  | 
|  | Common(); | 
|  | } | 
|  | } | 
|  |  | 
#if HAVE_AVX2
// Reference/tested pairs for AVX2 builds: the C function is the reference and
// the AVX2 function is the one under test.
const IntraMatrixTestFuncsHBD kIntraMatrixAvx2Funcs[] = {
  IntraMatrixTestFuncsHBD(av1_dip_matrix_multiplication_c,
                          av1_dip_matrix_multiplication_avx2),
};

INSTANTIATE_TEST_SUITE_P(AVX2, IntraMatrixTestHB,
                         ::testing::ValuesIn(kIntraMatrixAvx2Funcs));
#endif  // HAVE_AVX2
|  |  | 
|  | // Speed tests | 
|  |  | 
|  | TEST_P(IntraMatrixTestHB, DISABLED_Speed) { | 
|  | const int test_count = 10000000; | 
|  | bit_depth_ = 12; | 
|  | const int hi = 1 << bit_depth_; | 
|  | for (int i = 0; i < 16; ++i) { | 
|  | dip_feat_[i] = rng_(hi); | 
|  | } | 
|  | for (int r = 0; r < 64; ++r) { | 
|  | for (int c = 0; c < 11; ++c) { | 
|  | dip_arr_[r * 16 + c] = av1_intra_matrix_weights[0][r][c]; | 
|  | } | 
|  | } | 
|  | dip_tst_ = &dip_tst_data_[0]; | 
|  | for (int iter = 0; iter < test_count; ++iter) { | 
|  | ASM_REGISTER_STATE_CHECK( | 
|  | params_.tst_func(dip_arr_, dip_feat_, dip_tst_, bit_depth_)); | 
|  | } | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | ////////////////////////////////////////////////////////////////////////////// | 
|  | // ResampleOutputTest | 
|  | ////////////////////////////////////////////////////////////////////////////// | 
|  |  | 
|  | typedef void (*ResampleOutputFunc)(uint16_t *dst, int dst_stride, | 
|  | const uint16_t *above_row, | 
|  | const uint16_t *left_col, | 
|  | uint16_t *ml_output, int bw_log2, | 
|  | int bh_log2, int transpose); | 
|  |  | 
|  | typedef libaom_test::FuncParam<ResampleOutputFunc> ResampleOutputTestFuncs; | 
|  |  | 
|  | class ResampleOutputTest : public FunctionEquivalenceTest<ResampleOutputFunc> { | 
|  | protected: | 
|  | static const int kMaxWidth = 64; | 
|  | static const int kMaxHeight = 64; | 
|  | static const int kBufSize = kMaxWidth * kMaxHeight; | 
|  | static const int kMlOutputSize = 8 * 8; | 
|  | static const int kContextSize = kMaxWidth + kMaxHeight + 1; | 
|  |  | 
|  | ResampleOutputTest() { | 
|  | dst_ref_ = &dst_ref_data_[0]; | 
|  | dst_tst_ = &dst_tst_data_[0]; | 
|  | above_row_ = &context_data_[1]; | 
|  | left_col_ = &context_data_[kMaxWidth + 2]; | 
|  | ml_output_ = &ml_output_data_[0]; | 
|  | } | 
|  |  | 
|  | virtual ~ResampleOutputTest() {} | 
|  |  | 
|  | int get_log2(int val) { | 
|  | switch (val) { | 
|  | case 4: return 2; | 
|  | case 8: return 3; | 
|  | case 16: return 4; | 
|  | case 32: return 5; | 
|  | case 64: return 6; | 
|  | default: EXPECT_TRUE(false) << "Invalid block size"; return 0; | 
|  | } | 
|  | } | 
|  |  | 
|  | void RunCorrectnessTest() { | 
|  | const int block_sizes[] = { 8, 16, 32, 64 }; | 
|  | for (int bw : block_sizes) { | 
|  | for (int bh : block_sizes) { | 
|  | // Data-driven intra prediction only applies to blocks with w*h >= 128. | 
|  | if (bw * bh < 128) continue; | 
|  | for (int transpose = 0; transpose < 2; ++transpose) { | 
|  | const int bw_log2 = get_log2(bw); | 
|  | const int bh_log2 = get_log2(bh); | 
|  | const int dst_stride = kMaxWidth; | 
|  | const int bit_depth = 12; | 
|  | const int hi = (1 << bit_depth) - 1; | 
|  |  | 
|  | for (int i = 0; i < kContextSize; ++i) { | 
|  | context_data_[i] = rng_(hi); | 
|  | } | 
|  | for (int i = 0; i < kMlOutputSize; ++i) { | 
|  | // The range of ml_output is clipped to the corresponding bitdepth. | 
|  | // i.e. v = clip_pixel_highbd(v, bit_depth); | 
|  | // See av1_dip_matrix_mulplication. | 
|  | ml_output_data_[i] = rng_(hi); | 
|  | } | 
|  | // The top-left corner is shared between above_row[-1] and | 
|  | // left_col[-1] | 
|  | above_row_[-1] = context_data_[0]; | 
|  | left_col_[-1] = context_data_[0]; | 
|  |  | 
|  | params_.ref_func(dst_ref_, dst_stride, above_row_, left_col_, | 
|  | ml_output_, bw_log2, bh_log2, transpose); | 
|  | ASM_REGISTER_STATE_CHECK( | 
|  | params_.tst_func(dst_tst_, dst_stride, above_row_, left_col_, | 
|  | ml_output_, bw_log2, bh_log2, transpose)); | 
|  |  | 
|  | for (int r = 0; r < bh; ++r) { | 
|  | for (int c = 0; c < bw; ++c) { | 
|  | ASSERT_EQ(dst_ref_[r * dst_stride + c], | 
|  | dst_tst_[r * dst_stride + c]) | 
|  | << "Mismatch at (" << c << ", " << r << ") for block size " | 
|  | << bw << "x" << bh << " (transpose=" << transpose << ")"; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | void RunSpeedTest() { | 
|  | const int block_sizes[] = { 8, 16, 32, 64 }; | 
|  | for (int bw : block_sizes) { | 
|  | for (int bh : block_sizes) { | 
|  | // Data-driven intra prediction only applies to blocks with w*h >= 128. | 
|  | if (bw * bh < 128) continue; | 
|  | for (int transpose = 0; transpose < 2; ++transpose) { | 
|  | const int bw_log2 = get_log2(bw); | 
|  | const int bh_log2 = get_log2(bh); | 
|  | const int dst_stride = kMaxWidth; | 
|  | const int bit_depth = 12; | 
|  | const int hi = (1 << bit_depth) - 1; | 
|  | const int kIterations = 100000; | 
|  |  | 
|  | for (int i = 0; i < kContextSize; ++i) { | 
|  | context_data_[i] = rng_(hi); | 
|  | } | 
|  | for (int i = 0; i < kMlOutputSize; ++i) { | 
|  | // The range of ml_output is clipped to the corresponding bitdepth. | 
|  | // i.e. v = clip_pixel_highbd(v, bit_depth); | 
|  | // See av1_dip_matrix_mulplication. | 
|  | ml_output_data_[i] = rng_(hi); | 
|  | } | 
|  | above_row_[-1] = context_data_[0]; | 
|  | left_col_[-1] = context_data_[0]; | 
|  |  | 
|  | aom_usec_timer ref_timer, tst_timer; | 
|  |  | 
|  | aom_usec_timer_start(&ref_timer); | 
|  | for (int i = 0; i < kIterations; ++i) { | 
|  | params_.ref_func(dst_ref_, dst_stride, above_row_, left_col_, | 
|  | ml_output_, bw_log2, bh_log2, transpose); | 
|  | } | 
|  | aom_usec_timer_mark(&ref_timer); | 
|  | const double ref_time = | 
|  | static_cast<double>(aom_usec_timer_elapsed(&ref_timer)); | 
|  |  | 
|  | aom_usec_timer_start(&tst_timer); | 
|  | for (int i = 0; i < kIterations; ++i) { | 
|  | params_.tst_func(dst_tst_, dst_stride, above_row_, left_col_, | 
|  | ml_output_, bw_log2, bh_log2, transpose); | 
|  | } | 
|  | aom_usec_timer_mark(&tst_timer); | 
|  | const double tst_time = | 
|  | static_cast<double>(aom_usec_timer_elapsed(&tst_timer)); | 
|  |  | 
|  | printf( | 
|  | "Block %2dx%2d (T=%d): C time = %7.2f us, SIMD time = %7.2f us, " | 
|  | "Speedup = %4.2fx\n", | 
|  | bw, bh, transpose, ref_time, tst_time, ref_time / tst_time); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | uint16_t dst_ref_data_[kBufSize]; | 
|  | uint16_t dst_tst_data_[kBufSize]; | 
|  | uint16_t context_data_[kContextSize]; | 
|  | uint16_t ml_output_data_[kMlOutputSize]; | 
|  |  | 
|  | uint16_t *dst_ref_; | 
|  | uint16_t *dst_tst_; | 
|  | uint16_t *above_row_; | 
|  | uint16_t *left_col_; | 
|  | uint16_t *ml_output_; | 
|  | }; | 
|  |  | 
|  | TEST_P(ResampleOutputTest, Correctness) { RunCorrectnessTest(); } | 
|  |  | 
|  | TEST_P(ResampleOutputTest, DISABLED_Speed) { RunSpeedTest(); } | 
|  |  | 
#if HAVE_AVX2
// Reference/tested pairs for AVX2 builds: the C function is the reference and
// the AVX2 function is the one under test.
const ResampleOutputTestFuncs kResampleOutputAvx2Funcs[] = {
  ResampleOutputTestFuncs(resample_output_c, resample_output_avx2),
};

INSTANTIATE_TEST_SUITE_P(AVX2, ResampleOutputTest,
                         ::testing::ValuesIn(kResampleOutputAvx2Funcs));
#endif  // HAVE_AVX2