|  | /* | 
|  | * Copyright (c) 2021, Alliance for Open Media. All rights reserved | 
|  | * | 
|  | * This source code is subject to the terms of the BSD 3-Clause Clear License | 
|  | * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear | 
|  | * License was not distributed with this source code in the LICENSE file, you | 
|  | * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the | 
|  | * Alliance for Open Media Patent License 1.0 was not distributed with this | 
|  | * source code in the PATENTS file, you can obtain it at | 
|  | * aomedia.org/license/patent-license/. | 
|  | */ | 
|  |  | 
|  | #include <set> | 
|  | #include <vector> | 
|  | #include "config/av1_rtcd.h" | 
|  | #include "config/aom_dsp_rtcd.h" | 
|  | #include "test/acm_random.h" | 
|  | #include "test/clear_system_state.h" | 
|  | #include "third_party/googletest/src/googletest/include/gtest/gtest.h" | 
|  |  | 
|  | #include "aom_ports/aom_timer.h" | 
|  | #include "av1/common/reconinter.h" | 
|  | #include "av1/common/mvref_common.h" | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | class BlockSize { | 
|  | public: | 
|  | BlockSize(int w, int h) : width_(w), height_(h) { | 
|  | n_ = (w <= 8 && h <= 8) ? OF_MIN_BSIZE : OF_BSIZE; | 
|  | } | 
|  |  | 
|  | int Width() const { return width_; } | 
|  | int Height() const { return height_; } | 
|  | int OptFlowBlkSize() const { return n_; } | 
|  |  | 
|  | bool operator<(const BlockSize &other) const { | 
|  | if (Width() == other.Width()) { | 
|  | return Height() < other.Height(); | 
|  | } | 
|  | return Width() < other.Width(); | 
|  | } | 
|  |  | 
|  | bool operator==(const BlockSize &other) const { | 
|  | return Width() == other.Width() && Height() == other.Height(); | 
|  | } | 
|  |  | 
|  | private: | 
|  | int width_; | 
|  | int height_; | 
|  | int n_; | 
|  | }; | 
|  |  | 
|  | // Block size / bit depth / test function used to parameterize the tests. | 
|  | template <typename T> | 
|  | class TestParam { | 
|  | public: | 
|  | TestParam(const BlockSize &block, int bd, T test_func) | 
|  | : block_(block), bd_(bd), test_func_(test_func) {} | 
|  |  | 
|  | const BlockSize &Block() const { return block_; } | 
|  | int BitDepth() const { return bd_; } | 
|  | T TestFunction() const { return test_func_; } | 
|  |  | 
|  | bool operator==(const TestParam &other) const { | 
|  | return Block() == other.Block() && BitDepth() == other.BitDepth() && | 
|  | TestFunction() == other.TestFunction(); | 
|  | } | 
|  |  | 
|  | private: | 
|  | BlockSize block_; | 
|  | int bd_; | 
|  | T test_func_; | 
|  | }; | 
|  |  | 
|  | template <typename T> | 
|  | std::ostream &operator<<(std::ostream &os, const TestParam<T> &test_arg) { | 
|  | return os << "TestParam { width:" << test_arg.Block().Width() | 
|  | << " height:" << test_arg.Block().Height() | 
|  | << " bd:" << test_arg.BitDepth() << " }"; | 
|  | } | 
|  |  | 
|  | // AV1OptFlowTest is the base class that all optical flow tests should derive | 
|  | // from. | 
|  | template <typename T> | 
|  | class AV1OptFlowTest : public ::testing::TestWithParam<TestParam<T>> { | 
|  | public: | 
|  | virtual ~AV1OptFlowTest() { TearDown(); } | 
|  |  | 
|  | virtual void SetUp() override { | 
|  | rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); | 
|  | } | 
|  |  | 
|  | virtual void TearDown() override { libaom_test::ClearSystemState(); } | 
|  |  | 
|  | // Check that two 8-bit output buffers are identical. | 
|  | void AssertOutputEq(const int *ref, const int *test, int n) { | 
|  | ASSERT_TRUE(ref != test) << "Buffers must be at different memory locations"; | 
|  | for (int idx = 0; idx < n; ++idx) { | 
|  | ASSERT_EQ(ref[idx], test[idx]) << "Mismatch at index " << idx; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Check that two 16-bit output buffers are identical. | 
|  | void AssertOutputBufferEq(const int16_t *ref, const int16_t *test, int width, | 
|  | int height, int stride) { | 
|  | ASSERT_TRUE(ref != test) << "Buffers must be in different memory locations"; | 
|  | for (int row = 0; row < height; ++row) { | 
|  | for (int col = 0; col < width; ++col) { | 
|  | ASSERT_EQ(ref[row * stride + col], test[row * stride + col]) | 
|  | << width << "x" << height << " Pixel mismatch at (" << col << ", " | 
|  | << row << ")"; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | uint8_t RandomFrameIdx(int max_bit_range) { | 
|  | const int max_val = (1 << max_bit_range) - 1; | 
|  | uint8_t rand_val = rnd_.Rand8() & max_val; | 
|  | return rand_val; | 
|  | } | 
|  |  | 
|  | int8_t RelativeDistExtreme(int max_bit_range) { | 
|  | return Rand8SingedExtremes(max_bit_range); | 
|  | } | 
|  |  | 
|  | void RandomInput8(uint8_t *p, const TestParam<T> ¶m) { | 
|  | EXPECT_EQ(8, param.BitDepth()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
|  | const int bw = param.Block().Width(); | 
|  | const int bh = param.Block().Height(); | 
|  | Randomize(p, bw * bh); | 
|  | } | 
|  |  | 
|  | void Randomize9Signed(int16_t *p, int size) { | 
|  | for (int i = 0; i < size; ++i) { | 
|  | p[i] = rnd_.Rand9Signed(); | 
|  | } | 
|  | } | 
|  |  | 
|  | void RandomInput9(int16_t *p, const TestParam<T> ¶m) { | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
|  | const int bw = param.Block().Width(); | 
|  | const int bh = param.Block().Height(); | 
|  | Randomize9Signed(p, bw * bh); | 
|  | } | 
|  |  | 
|  | void RandomInput16(uint16_t *p, const TestParam<T> ¶m, | 
|  | int max_bit_range) { | 
|  | EXPECT_GE(12, param.BitDepth()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
|  | const int bw = param.Block().Width(); | 
|  | const int bh = param.Block().Height(); | 
|  | Randomize(p, bw * bh, max_bit_range); | 
|  | } | 
|  |  | 
|  | void RandomInput16(int16_t *p, const TestParam<T> ¶m, int max_bit_range) { | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
|  | const int bw = param.Block().Width(); | 
|  | const int bh = param.Block().Height(); | 
|  | Randomize(p, bw * bh, max_bit_range); | 
|  | } | 
|  |  | 
|  | void RandomInput8Extreme(uint8_t *p, const TestParam<T> ¶m) { | 
|  | EXPECT_EQ(8, param.BitDepth()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
|  | const int bw = param.Block().Width(); | 
|  | const int bh = param.Block().Height(); | 
|  | RandomizeExtreme(p, bw * bh); | 
|  | } | 
|  |  | 
|  | void RandomInput9Extreme(int16_t *p, const TestParam<T> ¶m, | 
|  | int max_bit_range) { | 
|  | EXPECT_GE(12, param.BitDepth()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
|  | const int bw = param.Block().Width(); | 
|  | const int bh = param.Block().Height(); | 
|  | Randomize9Extreme(p, bw * bh, max_bit_range); | 
|  | } | 
|  |  | 
|  | void RandomInput16Extreme(uint16_t *p, const TestParam<T> ¶m, | 
|  | int max_bit_range) { | 
|  | EXPECT_GE(12, param.BitDepth()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
|  | const int bw = param.Block().Width(); | 
|  | const int bh = param.Block().Height(); | 
|  | RandomizeExtreme(p, bw * bh, max_bit_range); | 
|  | } | 
|  |  | 
|  | void RandomInput16Extreme(int16_t *p, const TestParam<T> ¶m, | 
|  | int max_bit_range) { | 
|  | EXPECT_GE(12, param.BitDepth()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
|  | EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
|  | const int bw = param.Block().Width(); | 
|  | const int bh = param.Block().Height(); | 
|  | RandomizeExtreme(p, bw * bh, max_bit_range); | 
|  | } | 
|  |  | 
|  | private: | 
|  | void Randomize(uint8_t *p, int size) { | 
|  | for (int i = 0; i < size; ++i) { | 
|  | p[i] = rnd_.Rand8(); | 
|  | } | 
|  | } | 
|  |  | 
|  | void Randomize(uint16_t *p, int size, int max_bit_range) { | 
|  | assert(max_bit_range <= 16); | 
|  | for (int i = 0; i < size; ++i) { | 
|  | p[i] = rnd_.Rand16() & ((1 << max_bit_range) - 1); | 
|  | } | 
|  | } | 
|  |  | 
|  | void Randomize(int16_t *p, int size, int max_bit_range) { | 
|  | assert(max_bit_range <= 16); | 
|  | for (int i = 0; i < size; ++i) { | 
|  | p[i] = (rnd_.Rand16() & ((1 << max_bit_range) - 1)) - | 
|  | (1 << (max_bit_range - 1)); | 
|  | } | 
|  | } | 
|  |  | 
|  | int RandBool() { | 
|  | const uint32_t value = rnd_.Rand8(); | 
|  | // There's a bit more entropy in the upper bits of this implementation. | 
|  | return (value >> 7) & 0x1; | 
|  | } | 
|  |  | 
|  | uint8_t Rand8Extremes() { return static_cast<uint8_t>(RandBool() ? 255 : 0); } | 
|  |  | 
|  | int8_t Rand8SingedExtremes(int max_bit_range) { | 
|  | const int max_val = (1 << max_bit_range) - 1; | 
|  | const int half_max_val = 1 << (max_bit_range - 1); | 
|  | uint8_t r_u8 = Rand8Extremes() & max_val; | 
|  | return static_cast<int8_t>(r_u8 - half_max_val); | 
|  | } | 
|  |  | 
|  | int16_t Rand9SingedExtremes(int max_bit_range) { | 
|  | const int half_max_val = 1 << (max_bit_range - 1); | 
|  | uint16_t r_u16 = Rand16Extremes(max_bit_range); | 
|  | return static_cast<int16_t>(r_u16 - half_max_val); | 
|  | } | 
|  |  | 
|  | uint16_t Rand16Extremes(int max_bit_range) { | 
|  | const int max_val = (1 << max_bit_range) - 1; | 
|  | return static_cast<uint16_t>(RandBool() ? max_val : 0); | 
|  | } | 
|  |  | 
|  | int16_t Rand16SingedExtremes(int max_bit_range) { | 
|  | const int half_max_val = 1 << (max_bit_range - 1); | 
|  | uint16_t r_u16 = Rand16Extremes(max_bit_range); | 
|  | return static_cast<int16_t>(r_u16 - half_max_val); | 
|  | } | 
|  |  | 
|  | void RandomizeExtreme(uint8_t *p, int size) { | 
|  | for (int i = 0; i < size; ++i) { | 
|  | p[i] = Rand8Extremes(); | 
|  | } | 
|  | } | 
|  |  | 
|  | void RandomizeExtreme(uint16_t *p, int size, int max_bit_range) { | 
|  | for (int i = 0; i < size; ++i) { | 
|  | p[i] = Rand16Extremes(max_bit_range); | 
|  | } | 
|  | } | 
|  |  | 
|  | void RandomizeExtreme(int16_t *p, int size, int max_bit_range) { | 
|  | for (int i = 0; i < size; ++i) { | 
|  | p[i] = Rand16SingedExtremes(max_bit_range); | 
|  | } | 
|  | } | 
|  |  | 
|  | void Randomize9Extreme(int16_t *p, int size, int max_bit_range) { | 
|  | for (int i = 0; i < size; ++i) { | 
|  | p[i] = Rand9SingedExtremes(max_bit_range); | 
|  | } | 
|  | } | 
|  |  | 
|  | libaom_test::ACMRandom rnd_; | 
|  | }; | 
|  |  | 
|  | // a function to generate test parameters for just luma block sizes. | 
|  | template <typename T> | 
|  | std::vector<TestParam<T>> GetOptFlowTestParams( | 
|  | std::initializer_list<int> bit_depths, T test_func) { | 
|  | std::set<BlockSize> sizes; | 
|  | for (int bsize = BLOCK_8X8; bsize < BLOCK_SIZES_ALL; ++bsize) { | 
|  | const int w = block_size_wide[bsize]; | 
|  | const int h = block_size_high[bsize]; | 
|  | if (w < 8 || h < 8) continue; | 
|  | sizes.insert(BlockSize(w, h)); | 
|  | } | 
|  | std::vector<TestParam<T>> result; | 
|  | for (int bit_depth : bit_depths) { | 
|  | for (const auto &block : sizes) { | 
|  | result.push_back(TestParam<T>(block, bit_depth, test_func)); | 
|  | } | 
|  | } | 
|  | return result; | 
|  | } | 
|  |  | 
|  | template <typename T> | 
|  | std::vector<TestParam<T>> GetOptFlowHighbdTestParams(T test_func) { | 
|  | return GetOptFlowTestParams({ 8, 10, 12 }, test_func); | 
|  | } | 
|  |  | 
|  | template <typename T> | 
|  | ::testing::internal::ParamGenerator<TestParam<T>> BuildOptFlowHighbdParams( | 
|  | T test_func) { | 
|  | return ::testing::ValuesIn(GetOptFlowHighbdTestParams(test_func)); | 
|  | } | 
|  |  | 
|  | #if OPFL_BICUBIC_GRAD | 
// Pointer to a bicubic gradient interpolation kernel: reads |pred_src| and
// writes the |x_grad| and |y_grad| outputs for a blk_width x blk_height block.
using bicubic_grad_interp_highbd = void (*)(const int16_t *pred_src,
                                            int16_t *x_grad, int16_t *y_grad,
                                            const int blk_width,
                                            const int blk_height);
|  |  | 
|  | class AV1OptFlowBiCubicGradHighbdTest | 
|  | : public AV1OptFlowTest<bicubic_grad_interp_highbd> { | 
|  | public: | 
|  | AV1OptFlowBiCubicGradHighbdTest() { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw = block.Width(); | 
|  | const int bh = block.Height(); | 
|  |  | 
|  | pred_src_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | x_grad_ref_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | y_grad_ref_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | x_grad_test_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | y_grad_test_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  |  | 
|  | memset(x_grad_ref_, 0, bw * bh * sizeof(int16_t)); | 
|  | memset(y_grad_ref_, 0, bw * bh * sizeof(int16_t)); | 
|  | memset(x_grad_test_, 0, bw * bh * sizeof(int16_t)); | 
|  | memset(y_grad_test_, 0, bw * bh * sizeof(int16_t)); | 
|  | } | 
|  |  | 
|  | ~AV1OptFlowBiCubicGradHighbdTest() { | 
|  | aom_free(pred_src_); | 
|  | aom_free(x_grad_ref_); | 
|  | aom_free(y_grad_ref_); | 
|  | aom_free(x_grad_test_); | 
|  | aom_free(y_grad_test_); | 
|  | } | 
|  |  | 
|  | void Run(const int is_speed) { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bd = GetParam().BitDepth(); | 
|  | const int bw_log2 = block.Width() >> MI_SIZE_LOG2; | 
|  | const int bh_log2 = block.Height() >> MI_SIZE_LOG2; | 
|  | const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); | 
|  |  | 
|  | for (int count = 0; count < numIter; count++) { | 
|  | RandomInput16(pred_src_, GetParam(), bd); | 
|  | TestBicubicGradHighbd(pred_src_, x_grad_ref_, y_grad_ref_, x_grad_test_, | 
|  | y_grad_test_, is_speed); | 
|  | } | 
|  | if (is_speed) return; | 
|  |  | 
|  | for (int count = 0; count < numIter; count++) { | 
|  | RandomInput16Extreme((uint16_t *)pred_src_, GetParam(), bd); | 
|  | TestBicubicGradHighbd(pred_src_, x_grad_ref_, y_grad_ref_, x_grad_test_, | 
|  | y_grad_test_, 0); | 
|  | } | 
|  | } | 
|  |  | 
|  | private: | 
|  | void TestBicubicGradHighbd(int16_t *pred_src, int16_t *x_grad_ref, | 
|  | int16_t *y_grad_ref, int16_t *x_grad_test, | 
|  | int16_t *y_grad_test, int is_speed) { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw = block.Width(); | 
|  | const int bh = block.Height(); | 
|  |  | 
|  | bicubic_grad_interp_highbd ref_func = | 
|  | av1_bicubic_grad_interpolation_highbd_c; | 
|  | bicubic_grad_interp_highbd test_func = GetParam().TestFunction(); | 
|  | if (is_speed) | 
|  | BicubicGradHighbdSpeed(ref_func, test_func, pred_src, x_grad_ref, | 
|  | y_grad_ref, x_grad_test, y_grad_test, bw, bh); | 
|  | else | 
|  | BicubicGradHighbd(ref_func, test_func, pred_src, x_grad_ref, y_grad_ref, | 
|  | x_grad_test, y_grad_test, bw, bh); | 
|  | } | 
|  |  | 
|  | void BicubicGradHighbd(bicubic_grad_interp_highbd ref_func, | 
|  | bicubic_grad_interp_highbd test_func, | 
|  | const int16_t *pred_src, int16_t *x_grad_ref, | 
|  | int16_t *y_grad_ref, int16_t *x_grad_test, | 
|  | int16_t *y_grad_test, const int bw, const int bh) { | 
|  | ref_func(pred_src, x_grad_ref, y_grad_ref, bw, bh); | 
|  | test_func(pred_src, x_grad_test, y_grad_test, bw, bh); | 
|  |  | 
|  | AssertOutputBufferEq(x_grad_ref, x_grad_test, bw, bh, bw); | 
|  | AssertOutputBufferEq(y_grad_ref, y_grad_test, bw, bh, bw); | 
|  | } | 
|  |  | 
|  | void BicubicGradHighbdSpeed(bicubic_grad_interp_highbd ref_func, | 
|  | bicubic_grad_interp_highbd test_func, | 
|  | int16_t *pred_src, int16_t *x_grad_ref, | 
|  | int16_t *y_grad_ref, int16_t *x_grad_test, | 
|  | int16_t *y_grad_test, const int bw, | 
|  | const int bh) { | 
|  | const int bw_log2 = bw >> MI_SIZE_LOG2; | 
|  | const int bh_log2 = bh >> MI_SIZE_LOG2; | 
|  |  | 
|  | const int numIter = 2097152 / (bw_log2 * bh_log2); | 
|  | aom_usec_timer timer_ref; | 
|  | aom_usec_timer timer_test; | 
|  |  | 
|  | aom_usec_timer_start(&timer_ref); | 
|  | for (int count = 0; count < numIter; count++) | 
|  | ref_func(pred_src, x_grad_ref, y_grad_ref, bw, bh); | 
|  | aom_usec_timer_mark(&timer_ref); | 
|  |  | 
|  | aom_usec_timer_start(&timer_test); | 
|  | for (int count = 0; count < numIter; count++) | 
|  | test_func(pred_src, x_grad_test, y_grad_test, bw, bh); | 
|  | aom_usec_timer_mark(&timer_test); | 
|  |  | 
|  | const int total_time_ref = | 
|  | static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); | 
|  | const int total_time_test = | 
|  | static_cast<int>(aom_usec_timer_elapsed(&timer_test)); | 
|  |  | 
|  | printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref, | 
|  | total_time_test, | 
|  | (static_cast<float>(total_time_ref) / | 
|  | static_cast<float>(total_time_test))); | 
|  | } | 
|  |  | 
|  | int16_t *pred_src_; | 
|  | int16_t *x_grad_ref_; | 
|  | int16_t *y_grad_ref_; | 
|  | int16_t *x_grad_test_; | 
|  | int16_t *y_grad_test_; | 
|  | }; | 
|  | TEST_P(AV1OptFlowBiCubicGradHighbdTest, CheckOutput) { Run(0); } | 
|  | TEST_P(AV1OptFlowBiCubicGradHighbdTest, DISABLED_Speed) { Run(1); } | 
|  |  | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | C, AV1OptFlowBiCubicGradHighbdTest, | 
|  | BuildOptFlowHighbdParams(av1_bicubic_grad_interpolation_highbd_c)); | 
|  |  | 
|  | #if HAVE_SSE4_1 | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | SSE4_1, AV1OptFlowBiCubicGradHighbdTest, | 
|  | BuildOptFlowHighbdParams(av1_bicubic_grad_interpolation_highbd_sse4_1)); | 
|  | #endif | 
|  |  | 
|  | #if HAVE_AVX2 | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | AVX2, AV1OptFlowBiCubicGradHighbdTest, | 
|  | BuildOptFlowHighbdParams(av1_bicubic_grad_interpolation_highbd_avx2)); | 
|  | #endif | 
|  | #endif  // OPFL_BICUBIC_GRAD | 
|  |  | 
// Pointer to an optical-flow MV refinement kernel. The five extra position
// arguments exist only when CONFIG_E191_OFS_PRED_RES_HANDLE is enabled.
using opfl_mv_refinement = int (*)(const int16_t *pdiff, int pstride,
                                   const int16_t *gx, const int16_t *gy,
                                   int gstride, int bw, int bh, int n, int d0,
                                   int d1, int grad_prec_bits,
                                   int mv_prec_bits,
#if CONFIG_E191_OFS_PRED_RES_HANDLE
                                   int mi_x, int mi_y, int mi_cols,
                                   int mi_rows, int build_for_decode,
#endif  // CONFIG_E191_OFS_PRED_RES_HANDLE
                                   int *vx0, int *vy0, int *vx1, int *vy1);
|  |  | 
|  | class AV1OptFlowRefineTest : public AV1OptFlowTest<opfl_mv_refinement> { | 
|  | public: | 
|  | AV1OptFlowRefineTest() { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw = block.Width(); | 
|  | const int bh = block.Height(); | 
|  |  | 
|  | input_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(*input_)); | 
|  | gx_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(*gx_)); | 
|  | gy_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(*gy_)); | 
|  | } | 
|  |  | 
|  | ~AV1OptFlowRefineTest() { | 
|  | aom_free(input_); | 
|  | aom_free(gx_); | 
|  | aom_free(gy_); | 
|  | } | 
|  |  | 
|  | void RunTest(const int is_speed) { | 
|  | OrderHintInfo oh_info; | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bd = GetParam().BitDepth(); | 
|  | const int bw_log2 = block.Width() >> MI_SIZE_LOG2; | 
|  | const int bh_log2 = block.Height() >> MI_SIZE_LOG2; | 
|  | const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); | 
|  | const int oh_start_bits = is_speed ? kMaxOrderHintBits : 1; | 
|  |  | 
|  | oh_info.enable_order_hint = 1; | 
|  | for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; oh_bits++) { | 
|  | for (int count = 0; count < numIter;) { | 
|  | const int cur_frm_idx = RandomFrameIdx(oh_bits); | 
|  | const int ref0_frm_idx = RandomFrameIdx(oh_bits); | 
|  | const int ref1_frm_idx = RandomFrameIdx(oh_bits); | 
|  |  | 
|  | oh_info.order_hint_bits_minus_1 = oh_bits - 1; | 
|  | const int d0 = get_relative_dist(&oh_info, cur_frm_idx, ref0_frm_idx); | 
|  | const int d1 = get_relative_dist(&oh_info, cur_frm_idx, ref1_frm_idx); | 
|  | if (!d0 || !d1) continue; | 
|  |  | 
|  | // Here, the input corresponds to 'd0*p0 - d1*p1' (where P0 and P1 can | 
|  | // be 12 bits, d0 and d1 can be >=5 bits) and gx, gy are gradients of | 
|  | // input. Due to the clamping of these value to [INT16_MIN, INT16_MAX], | 
|  | // testing of the same is required. Hence, populating the input_, gx_ | 
|  | // and gy_ buffers as per the requirement. | 
|  | RandomInput16(input_, GetParam(), AOMMIN(16, bd + 1)); | 
|  | RandomInput16(gx_, GetParam(), AOMMIN(16, bd + 6)); | 
|  | RandomInput16(gy_, GetParam(), AOMMIN(16, bd + 6)); | 
|  |  | 
|  | TestOptFlowRefine(input_, gx_, gy_, is_speed, d0, d1); | 
|  | count++; | 
|  | } | 
|  | } | 
|  | if (is_speed) return; | 
|  |  | 
|  | // Extreme value test | 
|  | for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; | 
|  | oh_bits += kMaxOrderHintBits - 1) { | 
|  | for (int count = 0; count < numIter;) { | 
|  | const int d0 = RelativeDistExtreme(oh_bits); | 
|  | const int d1 = RelativeDistExtreme(oh_bits); | 
|  | if (!d0 || !d1) continue; | 
|  |  | 
|  | RandomInput16Extreme(input_, GetParam(), AOMMIN(16, bd + 1)); | 
|  | RandomInput16Extreme(gx_, GetParam(), AOMMIN(16, bd + 6)); | 
|  | RandomInput16Extreme(gy_, GetParam(), AOMMIN(16, bd + 6)); | 
|  |  | 
|  | TestOptFlowRefine(input_, gx_, gy_, 0, d0, d1); | 
|  | count++; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | private: | 
|  | void TestOptFlowRefine(int16_t *input, int16_t *gx, int16_t *gy, | 
|  | const int is_speed, int d0, int d1) { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw = block.Width(); | 
|  | const int bh = block.Height(); | 
|  | const int n = block.OptFlowBlkSize(); | 
|  |  | 
|  | opfl_mv_refinement ref_func = av1_opfl_mv_refinement_nxn_c; | 
|  | opfl_mv_refinement test_func = GetParam().TestFunction(); | 
|  |  | 
|  | if (is_speed) | 
|  | OptFlowRefineSpeed(ref_func, test_func, input, gx, gy, bw, bh, n, d0, d1); | 
|  | else | 
|  | OptFlowRefine(ref_func, test_func, input, gx, gy, bw, bh, n, d0, d1); | 
|  | } | 
|  |  | 
|  | void OptFlowRefine(opfl_mv_refinement ref_func, opfl_mv_refinement test_func, | 
|  | const int16_t *input, const int16_t *gx, const int16_t *gy, | 
|  | int bw, int bh, int n, int d0, int d1) { | 
|  | int ref_out[4 * N_OF_OFFSETS] = { 0 }; | 
|  | int test_out[4 * N_OF_OFFSETS] = { 0 }; | 
|  | const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2; | 
|  | const int mv_prec_bits = MV_REFINE_PREC_BITS; | 
|  | int stride = bw; | 
|  | int gstride = bw; | 
|  |  | 
|  | int n_blocks_ref = ref_func( | 
|  | input, stride, gx, gy, gstride, bw, bh, n, d0, d1, grad_prec_bits, | 
|  | mv_prec_bits, | 
|  | #if CONFIG_E191_OFS_PRED_RES_HANDLE | 
|  | 0, 0, 0, 0, 0, | 
|  | #endif  // CONFIG_E191_OFS_PRED_RES_HANDLE | 
|  | &ref_out[kVX_0 * N_OF_OFFSETS], &ref_out[kVY_0 * N_OF_OFFSETS], | 
|  | &ref_out[kVX_1 * N_OF_OFFSETS], &ref_out[kVY_1 * N_OF_OFFSETS]); | 
|  | int n_blocks = test_func( | 
|  | input, stride, gx, gy, gstride, bw, bh, n, d0, d1, grad_prec_bits, | 
|  | mv_prec_bits, | 
|  | #if CONFIG_E191_OFS_PRED_RES_HANDLE | 
|  | 0, 0, 0, 0, 0, | 
|  | #endif  // CONFIG_E191_OFS_PRED_RES_HANDLE | 
|  | &test_out[kVX_0 * N_OF_OFFSETS], &test_out[kVY_0 * N_OF_OFFSETS], | 
|  | &test_out[kVX_1 * N_OF_OFFSETS], &test_out[kVY_1 * N_OF_OFFSETS]); | 
|  |  | 
|  | ASSERT_EQ(n_blocks_ref, n_blocks) << "Mismatch of subblock numbers"; | 
|  | AssertOutputEq(&ref_out[kVX_0 * N_OF_OFFSETS], | 
|  | &test_out[kVX_0 * N_OF_OFFSETS], n_blocks); | 
|  | AssertOutputEq(&ref_out[kVY_0 * N_OF_OFFSETS], | 
|  | &test_out[kVY_0 * N_OF_OFFSETS], n_blocks); | 
|  | AssertOutputEq(&ref_out[kVX_1 * N_OF_OFFSETS], | 
|  | &test_out[kVX_1 * N_OF_OFFSETS], n_blocks); | 
|  | AssertOutputEq(&ref_out[kVY_1 * N_OF_OFFSETS], | 
|  | &test_out[kVY_1 * N_OF_OFFSETS], n_blocks); | 
|  | } | 
|  |  | 
|  | void OptFlowRefineSpeed(opfl_mv_refinement ref_func, | 
|  | opfl_mv_refinement test_func, const int16_t *input, | 
|  | const int16_t *gx, const int16_t *gy, int bw, int bh, | 
|  | int n, int d0, int d1) { | 
|  | int ref_out[4 * N_OF_OFFSETS] = { 0 }; | 
|  | int test_out[4 * N_OF_OFFSETS] = { 0 }; | 
|  | const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2; | 
|  | const int mv_prec_bits = MV_REFINE_PREC_BITS; | 
|  | const int bw_log2 = bw >> MI_SIZE_LOG2; | 
|  | const int bh_log2 = bh >> MI_SIZE_LOG2; | 
|  | int stride = bw; | 
|  | int gstride = bw; | 
|  |  | 
|  | const int numIter = 2097152 / (bw_log2 * bh_log2); | 
|  | aom_usec_timer timer_ref; | 
|  | aom_usec_timer timer_test; | 
|  |  | 
|  | aom_usec_timer_start(&timer_ref); | 
|  | for (int count = 0; count < numIter; count++) { | 
|  | ref_func(input, stride, gx, gy, gstride, bw, bh, n, d0, d1, | 
|  | grad_prec_bits, mv_prec_bits, | 
|  | #if CONFIG_E191_OFS_PRED_RES_HANDLE | 
|  | 0, 0, 0, 0, 0, | 
|  | #endif  // CONFIG_E191_OFS_PRED_RES_HANDLE | 
|  | &ref_out[kVX_0 * N_OF_OFFSETS], &ref_out[kVY_0 * N_OF_OFFSETS], | 
|  | &ref_out[kVX_1 * N_OF_OFFSETS], &ref_out[kVY_1 * N_OF_OFFSETS]); | 
|  | } | 
|  | aom_usec_timer_mark(&timer_ref); | 
|  |  | 
|  | aom_usec_timer_start(&timer_test); | 
|  | for (int count = 0; count < numIter; count++) { | 
|  | test_func( | 
|  | input, stride, gx, gy, gstride, bw, bh, n, d0, d1, grad_prec_bits, | 
|  | mv_prec_bits, | 
|  | #if CONFIG_E191_OFS_PRED_RES_HANDLE | 
|  | 0, 0, 0, 0, 0, | 
|  | #endif  // CONFIG_E191_OFS_PRED_RES_HANDLE | 
|  | &test_out[kVX_0 * N_OF_OFFSETS], &test_out[kVY_0 * N_OF_OFFSETS], | 
|  | &test_out[kVX_1 * N_OF_OFFSETS], &test_out[kVY_1 * N_OF_OFFSETS]); | 
|  | } | 
|  | aom_usec_timer_mark(&timer_test); | 
|  |  | 
|  | const int total_time_ref = | 
|  | static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); | 
|  | const int total_time_test = | 
|  | static_cast<int>(aom_usec_timer_elapsed(&timer_test)); | 
|  |  | 
|  | printf( | 
|  | "Block size: %dx%d \t ref_time = %d \t simd_time = %d \t Gain = %4.2f " | 
|  | "\n", | 
|  | bw, bh, total_time_ref, total_time_test, | 
|  | (static_cast<float>(total_time_ref) / | 
|  | static_cast<float>(total_time_test))); | 
|  | } | 
|  |  | 
|  | static constexpr int kVX_0 = 0; | 
|  | static constexpr int kVX_1 = 1; | 
|  | static constexpr int kVY_0 = 2; | 
|  | static constexpr int kVY_1 = 3; | 
|  | static constexpr int kMaxOrderHintBits = 8; | 
|  | static constexpr int kSubpelGradDeltaBits = 3; | 
|  | int16_t *input_; | 
|  | int16_t *gx_; | 
|  | int16_t *gy_; | 
|  | }; | 
|  | TEST_P(AV1OptFlowRefineTest, CheckOutput) { RunTest(0); } | 
|  | TEST_P(AV1OptFlowRefineTest, DISABLED_Speed) { RunTest(1); } | 
|  |  | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | C, AV1OptFlowRefineTest, | 
|  | BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_c)); | 
|  |  | 
|  | #if HAVE_SSE4_1 | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | SSE4_1, AV1OptFlowRefineTest, | 
|  | BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_sse4_1)); | 
|  | #endif | 
|  |  | 
|  | #if HAVE_AVX2 | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | AVX2, AV1OptFlowRefineTest, | 
|  | BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_avx2)); | 
|  | #endif | 
|  |  | 
|  | #if OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD | 
// Pointer to a prediction-copy kernel: reads the two unsigned source buffers
// and writes two signed destination buffers for a bw x bh block, given the
// distances d0/d1 and a |centered| flag.
using pred_buffer_copy_highbd = void (*)(const uint16_t *src1,
                                         const uint16_t *src2, int16_t *dst1,
                                         int16_t *dst2, int bw, int bh, int d0,
                                         int d1, int centered);
|  |  | 
|  | class AV1OptFlowCopyPredHighbdTest | 
|  | : public AV1OptFlowTest<pred_buffer_copy_highbd> { | 
|  | public: | 
|  | AV1OptFlowCopyPredHighbdTest() { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw = block.Width(); | 
|  | const int bh = block.Height(); | 
|  |  | 
|  | src_buf1_ = (uint16_t *)aom_memalign(16, bw * bh * sizeof(*src_buf1_)); | 
|  | src_buf2_ = (uint16_t *)aom_memalign(16, bw * bh * sizeof(*src_buf2_)); | 
|  | dst_buf1_ref_ = | 
|  | (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf1_ref_)); | 
|  | dst_buf2_ref_ = | 
|  | (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf2_ref_)); | 
|  | dst_buf1_test_ = | 
|  | (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf1_test_)); | 
|  | dst_buf2_test_ = | 
|  | (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf2_test_)); | 
|  |  | 
|  | memset(dst_buf2_ref_, 0, bw * bh * sizeof(*dst_buf2_ref_)); | 
|  | memset(dst_buf2_test_, 0, bw * bh * sizeof(*dst_buf2_test_)); | 
|  | } | 
|  |  | 
|  | ~AV1OptFlowCopyPredHighbdTest() { | 
|  | aom_free(src_buf1_); | 
|  | aom_free(src_buf2_); | 
|  | aom_free(dst_buf1_ref_); | 
|  | aom_free(dst_buf2_ref_); | 
|  | aom_free(dst_buf1_test_); | 
|  | aom_free(dst_buf2_test_); | 
|  | } | 
|  |  | 
|  | void Run(const int is_speed) { | 
|  | OrderHintInfo oh_info; | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw_log2 = block.Width() >> MI_SIZE_LOG2; | 
|  | const int bh_log2 = block.Height() >> MI_SIZE_LOG2; | 
|  | const int bd = GetParam().BitDepth(); | 
|  | const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); | 
|  | const int oh_start_bits = is_speed ? kMaxOrderHintBits : 1; | 
|  |  | 
|  | oh_info.enable_order_hint = 1; | 
|  | for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; oh_bits++) { | 
|  | for (int count = 0; count < numIter;) { | 
|  | const int cur_frm_idx = RandomFrameIdx(oh_bits); | 
|  | const int ref0_frm_idx = RandomFrameIdx(oh_bits); | 
|  | const int ref1_frm_idx = RandomFrameIdx(oh_bits); | 
|  |  | 
|  | oh_info.order_hint_bits_minus_1 = oh_bits - 1; | 
|  | const int d0 = get_relative_dist(&oh_info, cur_frm_idx, ref0_frm_idx); | 
|  | const int d1 = get_relative_dist(&oh_info, cur_frm_idx, ref1_frm_idx); | 
|  | if (!d0 || !d1) continue; | 
|  |  | 
|  | RandomInput16(src_buf1_, GetParam(), bd); | 
|  | RandomInput16(src_buf2_, GetParam(), bd); | 
|  | TestCopyPredArray(src_buf1_, src_buf2_, dst_buf1_ref_, dst_buf2_ref_, | 
|  | dst_buf1_test_, dst_buf2_test_, d0, d1, is_speed); | 
|  | count++; | 
|  | } | 
|  | } | 
|  | if (is_speed) return; | 
|  |  | 
|  | // Extreme value test | 
|  | for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; | 
|  | oh_bits += kMaxOrderHintBits - 1) { | 
|  | for (int count = 0; count < numIter;) { | 
|  | const int d0 = RelativeDistExtreme(oh_bits); | 
|  | const int d1 = RelativeDistExtreme(oh_bits); | 
|  | if (!d0 || !d1) continue; | 
|  |  | 
|  | RandomInput16Extreme(src_buf1_, GetParam(), bd); | 
|  | RandomInput16Extreme(src_buf2_, GetParam(), bd); | 
|  | TestCopyPredArray(src_buf1_, src_buf2_, dst_buf1_ref_, dst_buf2_ref_, | 
|  | dst_buf1_test_, dst_buf2_test_, d0, d1, 0); | 
|  | count++; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | private: | 
|  | void TestCopyPredArray(uint16_t *src_buf1, uint16_t *src_buf2, | 
|  | int16_t *dst_buf1_ref, int16_t *dst_buf2_ref, | 
|  | int16_t *dst_buf1_test, int16_t *dst_buf2_test, int d0, | 
|  | int d1, int is_speed) { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw = block.Width(); | 
|  | const int bh = block.Height(); | 
|  |  | 
|  | pred_buffer_copy_highbd ref_func = av1_copy_pred_array_highbd_c; | 
|  | pred_buffer_copy_highbd test_func = GetParam().TestFunction(); | 
|  | if (is_speed) | 
|  | CopyPredArraySpeed(ref_func, test_func, src_buf1, src_buf2, dst_buf1_ref, | 
|  | dst_buf2_ref, dst_buf1_test, dst_buf2_test, d0, d1, bw, | 
|  | bh); | 
|  | else | 
|  | CopyPredArray(ref_func, test_func, src_buf1, src_buf2, dst_buf1_ref, | 
|  | dst_buf2_ref, dst_buf1_test, dst_buf2_test, d0, d1, bw, bh); | 
|  | } | 
|  |  | 
|  | void CopyPredArray(pred_buffer_copy_highbd ref_func, | 
|  | pred_buffer_copy_highbd test_func, | 
|  | const uint16_t *src_buf1, uint16_t *src_buf2, | 
|  | int16_t *dst_buf1_ref, int16_t *dst_buf2_ref, | 
|  | int16_t *dst_buf1_test, int16_t *dst_buf2_test, | 
|  | const int d0, const int d1, const int bw, const int bh) { | 
|  | ref_func(src_buf1, src_buf2, dst_buf1_ref, dst_buf2_ref, bw, bh, d0, d1, 0); | 
|  | test_func(src_buf1, src_buf2, dst_buf1_test, dst_buf2_test, bw, bh, d0, d1, | 
|  | 0); | 
|  |  | 
|  | AssertOutputBufferEq(dst_buf1_ref, dst_buf1_test, bw, bh, bw); | 
|  | AssertOutputBufferEq(dst_buf2_ref, dst_buf2_test, bw, bh, bw); | 
|  | } | 
|  |  | 
|  | void CopyPredArraySpeed(pred_buffer_copy_highbd ref_func, | 
|  | pred_buffer_copy_highbd test_func, | 
|  | const uint16_t *src_buf1, uint16_t *src_buf2, | 
|  | int16_t *dst_buf1_ref, int16_t *dst_buf2_ref, | 
|  | int16_t *dst_buf1_test, int16_t *dst_buf2_test, | 
|  | const int d0, const int d1, const int bw, | 
|  | const int bh) { | 
|  | const int bw_log2 = bw >> MI_SIZE_LOG2; | 
|  | const int bh_log2 = bh >> MI_SIZE_LOG2; | 
|  | printf("bw=%d, bh=%d\n", bw, bh); | 
|  | const int numIter = 2097152 / (bw_log2 * bh_log2); | 
|  | aom_usec_timer timer_ref; | 
|  | aom_usec_timer timer_test; | 
|  |  | 
|  | aom_usec_timer_start(&timer_ref); | 
|  | for (int count = 0; count < numIter; count++) | 
|  | ref_func(src_buf1, src_buf2, dst_buf1_ref, dst_buf2_ref, bw, bh, d0, d1, | 
|  | 0); | 
|  | aom_usec_timer_mark(&timer_ref); | 
|  |  | 
|  | aom_usec_timer_start(&timer_test); | 
|  | for (int count = 0; count < numIter; count++) | 
|  | test_func(src_buf1, src_buf2, dst_buf1_test, dst_buf2_test, bw, bh, d0, | 
|  | d1, 0); | 
|  | aom_usec_timer_mark(&timer_test); | 
|  |  | 
|  | const int total_time_ref = | 
|  | static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); | 
|  | const int total_time_test = | 
|  | static_cast<int>(aom_usec_timer_elapsed(&timer_test)); | 
|  |  | 
|  | printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref, | 
|  | total_time_test, | 
|  | (static_cast<float>(total_time_ref) / | 
|  | static_cast<float>(total_time_test))); | 
|  | } | 
|  |  | 
|  | uint16_t *src_buf1_; | 
|  | uint16_t *src_buf2_; | 
|  | int16_t *dst_buf1_ref_; | 
|  | int16_t *dst_buf2_ref_; | 
|  | int16_t *dst_buf1_test_; | 
|  | int16_t *dst_buf2_test_; | 
|  | static constexpr int kMaxOrderHintBits = 8; | 
|  | }; | 
|  |  | 
|  | TEST_P(AV1OptFlowCopyPredHighbdTest, CheckOutput) { Run(0); } | 
|  | TEST_P(AV1OptFlowCopyPredHighbdTest, DISABLED_Speed) { Run(1); } | 
|  |  | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | C, AV1OptFlowCopyPredHighbdTest, | 
|  | BuildOptFlowHighbdParams(av1_copy_pred_array_highbd_c)); | 
|  |  | 
|  | #if HAVE_SSE4_1 | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | SSE4_1, AV1OptFlowCopyPredHighbdTest, | 
|  | BuildOptFlowHighbdParams(av1_copy_pred_array_highbd_sse4_1)); | 
|  | #endif | 
|  | #endif  // OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD | 
|  |  | 
|  | #if CONFIG_AFFINE_REFINEMENT | 
// Pointer type for av1_calc_affine_autocorrelation_matrix implementations
// (the C reference and any optimized variant registered with the test). The
// x_offset / y_offset pair is part of the signature only when
// CONFIG_AFFINE_REFINEMENT_SB is enabled.
using calc_affine_autocorrelation_matrix = void (*)(
    const int16_t *pdiff, int pstride, const int16_t *gx, const int16_t *gy,
    int gstride, int bw, int bh,
#if CONFIG_AFFINE_REFINEMENT_SB
    int x_offset, int y_offset,
#endif  // CONFIG_AFFINE_REFINEMENT_SB
    int32_t *mat_a, int32_t *vec_b);
|  |  | 
|  | class AV1CalcAffineAutocorrelationMatrixTest | 
|  | : public AV1OptFlowTest<calc_affine_autocorrelation_matrix> { | 
|  | public: | 
|  | AV1CalcAffineAutocorrelationMatrixTest() { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw = block.Width(); | 
|  | const int bh = block.Height(); | 
|  |  | 
|  | gx_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | gy_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | pdiff_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | } | 
|  |  | 
|  | ~AV1CalcAffineAutocorrelationMatrixTest() { | 
|  | aom_free(gx_); | 
|  | aom_free(gy_); | 
|  | aom_free(pdiff_); | 
|  | } | 
|  |  | 
|  | void RunTest(const int is_speed) { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bd = GetParam().BitDepth(); | 
|  | const int bw_log2 = block.Width() >> MI_SIZE_LOG2; | 
|  | const int bh_log2 = block.Height() >> MI_SIZE_LOG2; | 
|  | const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); | 
|  |  | 
|  | for (int count = 0; count < numIter; count++) { | 
|  | RandomInput16(gx_, GetParam(), 16); | 
|  | RandomInput16(gy_, GetParam(), 16); | 
|  | RandomInput16(pdiff_, GetParam(), bd + 1); | 
|  |  | 
|  | TestCalcAffineAutoCorrelationMatrix(pdiff_, gx_, gy_, is_speed); | 
|  | } | 
|  | if (is_speed) return; | 
|  |  | 
|  | // Extreme value test | 
|  | for (int count = 0; count < numIter; count++) { | 
|  | RandomInput16Extreme(gx_, GetParam(), 16); | 
|  | RandomInput16Extreme(gy_, GetParam(), 16); | 
|  | RandomInput16Extreme(pdiff_, GetParam(), bd + 1); | 
|  |  | 
|  | TestCalcAffineAutoCorrelationMatrix(pdiff_, gx_, gy_, is_speed); | 
|  | } | 
|  | } | 
|  |  | 
|  | private: | 
|  | void TestCalcAffineAutoCorrelationMatrix(const int16_t *pdiff, int16_t *gx, | 
|  | int16_t *gy, const int is_speed) { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw = block.Width(); | 
|  | const int bh = block.Height(); | 
|  | int pstride = bw; | 
|  | int gstride = bw; | 
|  |  | 
|  | calc_affine_autocorrelation_matrix ref_func = | 
|  | av1_calc_affine_autocorrelation_matrix_c; | 
|  | calc_affine_autocorrelation_matrix test_func = GetParam().TestFunction(); | 
|  |  | 
|  | if (is_speed) | 
|  | CalcAffineAutoCorrelationMatrixSpeed(ref_func, test_func, pdiff, pstride, | 
|  | gx, gy, gstride, bw, bh); | 
|  | else | 
|  | AffineAutoCorrelationMatrix(ref_func, test_func, pdiff, pstride, gx, gy, | 
|  | gstride, bw, bh); | 
|  | } | 
|  |  | 
|  | void AffineAutoCorrelationMatrix(calc_affine_autocorrelation_matrix ref_func, | 
|  | calc_affine_autocorrelation_matrix test_func, | 
|  | const int16_t *pdiff, int pstride, | 
|  | const int16_t *gx, const int16_t *gy, | 
|  | int gstride, int bw, int bh) { | 
|  | DECLARE_ALIGNED(32, int32_t, mat_ref[16]); | 
|  | DECLARE_ALIGNED(32, int32_t, mat_test[16]); | 
|  | DECLARE_ALIGNED(32, int32_t, vec_ref[4]); | 
|  | DECLARE_ALIGNED(32, int32_t, vec_test[4]); | 
|  | memset(mat_ref, 0, sizeof(mat_ref)); | 
|  | memset(mat_test, 0, sizeof(mat_test)); | 
|  | memset(vec_ref, 0, sizeof(vec_ref)); | 
|  | memset(vec_test, 0, sizeof(vec_test)); | 
|  | ref_func(pdiff, pstride, gx, gy, gstride, bw, bh, | 
|  | #if CONFIG_AFFINE_REFINEMENT_SB | 
|  | 0, 0, | 
|  | #endif  // CONFIG_AFFINE_REFINEMENT_SB | 
|  | mat_ref, vec_ref); | 
|  | test_func(pdiff, pstride, gx, gy, gstride, bw, bh, | 
|  | #if CONFIG_AFFINE_REFINEMENT_SB | 
|  | 0, 0, | 
|  | #endif  // CONFIG_AFFINE_REFINEMENT_SB | 
|  | mat_test, vec_test); | 
|  |  | 
|  | int failed = 0; | 
|  | for (int i = 0; i < 16; ++i) { | 
|  | if (mat_ref[i] != mat_test[i]) { | 
|  | failed = 1; | 
|  | printf("Mat [%4d] ref %d test %d \n", i, mat_ref[i], mat_test[i]); | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | for (int i = 0; i < 4; ++i) { | 
|  | if (vec_ref[i] != vec_test[i]) { | 
|  | failed = 1; | 
|  | printf("Vec [%4d] ref %d test %d \n", i, vec_ref[i], vec_test[i]); | 
|  | break; | 
|  | } | 
|  | } | 
|  | ASSERT_EQ(failed, 0); | 
|  | } | 
|  |  | 
|  | void CalcAffineAutoCorrelationMatrixSpeed( | 
|  | calc_affine_autocorrelation_matrix ref_func, | 
|  | calc_affine_autocorrelation_matrix test_func, const int16_t *pdiff, | 
|  | int pstride, const int16_t *gx, const int16_t *gy, int gstride, int bw, | 
|  | int bh) { | 
|  | DECLARE_ALIGNED(32, int32_t, mat_ref[16]); | 
|  | DECLARE_ALIGNED(32, int32_t, mat_test[16]); | 
|  | DECLARE_ALIGNED(32, int32_t, vec_ref[4]); | 
|  | DECLARE_ALIGNED(32, int32_t, vec_test[4]); | 
|  | memset(mat_ref, 0, sizeof(mat_ref)); | 
|  | memset(mat_test, 0, sizeof(mat_test)); | 
|  | memset(vec_ref, 0, sizeof(vec_ref)); | 
|  | memset(vec_test, 0, sizeof(vec_test)); | 
|  | const int knumIter = 1000000; | 
|  | aom_usec_timer timer_ref; | 
|  | aom_usec_timer timer_test; | 
|  |  | 
|  | aom_usec_timer_start(&timer_ref); | 
|  | for (int count = 0; count < knumIter; count++) { | 
|  | ref_func(pdiff, pstride, gx, gy, gstride, bw, bh, | 
|  | #if CONFIG_AFFINE_REFINEMENT_SB | 
|  | 0, 0, | 
|  | #endif  // CONFIG_AFFINE_REFINEMENT_SB | 
|  | mat_ref, vec_ref); | 
|  | } | 
|  | aom_usec_timer_mark(&timer_ref); | 
|  |  | 
|  | aom_usec_timer_start(&timer_test); | 
|  | for (int count = 0; count < knumIter; count++) { | 
|  | test_func(pdiff, pstride, gx, gy, gstride, bw, bh, | 
|  | #if CONFIG_AFFINE_REFINEMENT_SB | 
|  | 0, 0, | 
|  | #endif  // CONFIG_AFFINE_REFINEMENT_SB | 
|  | mat_test, vec_test); | 
|  | } | 
|  | aom_usec_timer_mark(&timer_test); | 
|  |  | 
|  | const int total_time_ref = | 
|  | static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); | 
|  | const int total_time_test = | 
|  | static_cast<int>(aom_usec_timer_elapsed(&timer_test)); | 
|  |  | 
|  | printf( | 
|  | "Block size: %dx%d, ref_time = %d \t simd_time = %d \t Scaling = %4.2f " | 
|  | "\n", | 
|  | bw, bh, total_time_ref, total_time_test, | 
|  | (static_cast<float>(total_time_ref) / | 
|  | static_cast<float>(total_time_test))); | 
|  | } | 
|  |  | 
|  | int16_t *gx_; | 
|  | int16_t *gy_; | 
|  | int16_t *pdiff_; | 
|  | }; | 
|  |  | 
|  | TEST_P(AV1CalcAffineAutocorrelationMatrixTest, CheckOutput) { RunTest(0); } | 
|  | TEST_P(AV1CalcAffineAutocorrelationMatrixTest, DISABLED_Speed) { RunTest(1); } | 
|  |  | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | C, AV1CalcAffineAutocorrelationMatrixTest, | 
|  | BuildOptFlowHighbdParams(av1_calc_affine_autocorrelation_matrix_c)); | 
|  |  | 
|  | #if HAVE_AVX2 | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | AVX2, AV1CalcAffineAutocorrelationMatrixTest, | 
|  | BuildOptFlowHighbdParams(av1_calc_affine_autocorrelation_matrix_avx2)); | 
|  | #endif  // HAVE_AVX2 | 
|  |  | 
// Pointer type for av1_avg_pooling_pdiff_gradients implementations (the C
// reference and any optimized variant registered with the test). All three
// buffers are passed as non-const pointers.
using av1_avg_pooling_pdiff_gradients_fun = void (*)(
    int16_t *pdiff, const int pstride, int16_t *gx, int16_t *gy,
    const int gstride, const int bw, const int bh, const int n);
|  |  | 
|  | class AV1AvgPoolingPdiffGradientTest | 
|  | : public AV1OptFlowTest<av1_avg_pooling_pdiff_gradients_fun> { | 
|  | public: | 
|  | AV1AvgPoolingPdiffGradientTest() { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw = block.Width(); | 
|  | const int bh = block.Height(); | 
|  |  | 
|  | gx_avg1_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | gy_avg1_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | pdiff_avg1_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | gx_avg2_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | gy_avg2_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | pdiff_avg2_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); | 
|  | } | 
|  |  | 
|  | ~AV1AvgPoolingPdiffGradientTest() { | 
|  | aom_free(gx_avg1_); | 
|  | aom_free(gy_avg1_); | 
|  | aom_free(pdiff_avg1_); | 
|  | aom_free(gx_avg2_); | 
|  | aom_free(gy_avg2_); | 
|  | aom_free(pdiff_avg2_); | 
|  | } | 
|  |  | 
|  | void RunTest(const int is_speed) { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bd = GetParam().BitDepth(); | 
|  | const int bw_log2 = block.Width() >> MI_SIZE_LOG2; | 
|  | const int bh_log2 = block.Height() >> MI_SIZE_LOG2; | 
|  | const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); | 
|  |  | 
|  | // AVX2 version only supports avg pooling from larger size to 16x16 | 
|  | if (block.Width() <= 8 || block.Height() <= 8) return; | 
|  |  | 
|  | for (int count = 0; count < numIter;) { | 
|  | RandomInput16(gx_avg1_, GetParam(), 16); | 
|  | RandomInput16(gy_avg1_, GetParam(), 16); | 
|  | RandomInput16(pdiff_avg1_, GetParam(), bd + 1); | 
|  | memcpy(gx_avg2_, gx_avg1_, | 
|  | sizeof(int16_t) * block.Width() * block.Height()); | 
|  | memcpy(gy_avg2_, gy_avg1_, | 
|  | sizeof(int16_t) * block.Width() * block.Height()); | 
|  | memcpy(pdiff_avg2_, pdiff_avg1_, | 
|  | sizeof(int16_t) * block.Width() * block.Height()); | 
|  |  | 
|  | TestAvgPoolingPdiffGrad(pdiff_avg1_, gx_avg1_, gy_avg1_, pdiff_avg2_, | 
|  | gx_avg2_, gy_avg2_, is_speed); | 
|  | count++; | 
|  | } | 
|  | if (is_speed) return; | 
|  |  | 
|  | // Extreme value test | 
|  | for (int count = 0; count < numIter; count++) { | 
|  | RandomInput16Extreme(gx_avg1_, GetParam(), 16); | 
|  | RandomInput16Extreme(gy_avg1_, GetParam(), 16); | 
|  | RandomInput16Extreme(pdiff_avg1_, GetParam(), bd + 1); | 
|  | memcpy(gx_avg2_, gx_avg1_, | 
|  | sizeof(int16_t) * block.Width() * block.Height()); | 
|  | memcpy(gy_avg2_, gy_avg1_, | 
|  | sizeof(int16_t) * block.Width() * block.Height()); | 
|  | memcpy(pdiff_avg2_, pdiff_avg1_, | 
|  | sizeof(int16_t) * block.Width() * block.Height()); | 
|  |  | 
|  | TestAvgPoolingPdiffGrad(pdiff_avg1_, gx_avg1_, gy_avg1_, pdiff_avg2_, | 
|  | gx_avg2_, gy_avg2_, is_speed); | 
|  | } | 
|  | } | 
|  |  | 
|  | private: | 
|  | void TestAvgPoolingPdiffGrad(int16_t *pdiff_avg1, int16_t *gx_avg1, | 
|  | int16_t *gy_avg1, int16_t *pdiff_avg2, | 
|  | int16_t *gx_avg2, int16_t *gy_avg2, | 
|  | const int is_speed) { | 
|  | const BlockSize &block = GetParam().Block(); | 
|  | const int bw = block.Width(); | 
|  | const int bh = block.Height(); | 
|  | int pstride = bw; | 
|  | int gstride = bw; | 
|  |  | 
|  | av1_avg_pooling_pdiff_gradients_fun ref_func = | 
|  | av1_avg_pooling_pdiff_gradients_c; | 
|  | av1_avg_pooling_pdiff_gradients_fun test_func = GetParam().TestFunction(); | 
|  |  | 
|  | if (is_speed) | 
|  | AvgPoolingPdiffGradSpeed(ref_func, test_func, pstride, gstride, bw, bh, | 
|  | pdiff_avg1, gx_avg1, gy_avg1, pdiff_avg2, | 
|  | gx_avg2, gy_avg2); | 
|  | else | 
|  | AvgPoolingPdiffGrad(ref_func, test_func, pstride, gstride, bw, bh, | 
|  | pdiff_avg1, gx_avg1, gy_avg1, pdiff_avg2, gx_avg2, | 
|  | gy_avg2); | 
|  | } | 
|  |  | 
|  | void AvgPoolingPdiffGrad(av1_avg_pooling_pdiff_gradients_fun ref_func, | 
|  | av1_avg_pooling_pdiff_gradients_fun test_func, | 
|  | int pstride, int gstride, int bw, int bh, | 
|  | int16_t *pdiff_avg1, int16_t *gx_avg1, | 
|  | int16_t *gy_avg1, int16_t *pdiff_avg2, | 
|  | int16_t *gx_avg2, int16_t *gy_avg2) { | 
|  | int n = AOMMIN(AOMMIN(bw, bh), 16); | 
|  | ref_func(pdiff_avg1, pstride, gx_avg1, gy_avg1, gstride, bw, bh, n); | 
|  | test_func(pdiff_avg2, pstride, gx_avg2, gy_avg2, gstride, bw, bh, n); | 
|  | AssertOutputBufferEq(pdiff_avg1, pdiff_avg2, n, n, bw); | 
|  | AssertOutputBufferEq(gx_avg1, gx_avg2, n, n, bw); | 
|  | AssertOutputBufferEq(gy_avg1, gy_avg2, n, n, bw); | 
|  | } | 
|  |  | 
|  | void AvgPoolingPdiffGradSpeed(av1_avg_pooling_pdiff_gradients_fun ref_func, | 
|  | av1_avg_pooling_pdiff_gradients_fun test_func, | 
|  | int pstride, int gstride, int bw, int bh, | 
|  | int16_t *pdiff_avg1, int16_t *gx_avg1, | 
|  | int16_t *gy_avg1, int16_t *pdiff_avg2, | 
|  | int16_t *gx_avg2, int16_t *gy_avg2) { | 
|  | int n = AOMMIN(AOMMIN(bw, bh), 16); | 
|  |  | 
|  | const int numIter = 1000000; | 
|  | aom_usec_timer timer_ref; | 
|  | aom_usec_timer timer_test; | 
|  |  | 
|  | aom_usec_timer_start(&timer_ref); | 
|  | for (int count = 0; count < numIter; count++) { | 
|  | ref_func(pdiff_avg1, pstride, gx_avg1, gy_avg1, gstride, bw, bh, n); | 
|  | } | 
|  | aom_usec_timer_mark(&timer_ref); | 
|  |  | 
|  | aom_usec_timer_start(&timer_test); | 
|  | for (int count = 0; count < numIter; count++) { | 
|  | test_func(pdiff_avg2, pstride, gx_avg2, gy_avg2, gstride, bw, bh, n); | 
|  | } | 
|  | aom_usec_timer_mark(&timer_test); | 
|  |  | 
|  | const int total_time_ref = | 
|  | static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); | 
|  | const int total_time_test = | 
|  | static_cast<int>(aom_usec_timer_elapsed(&timer_test)); | 
|  |  | 
|  | printf( | 
|  | "Block size: %dx%d, C time = %d \t SIMD time = %d \t Scaling = %4.2f " | 
|  | "\n", | 
|  | bw, bh, total_time_ref, total_time_test, | 
|  | (static_cast<float>(total_time_ref) / | 
|  | static_cast<float>(total_time_test))); | 
|  | } | 
|  |  | 
|  | int16_t *gx_avg1_; | 
|  | int16_t *gy_avg1_; | 
|  | int16_t *pdiff_avg1_; | 
|  | int16_t *gx_avg2_; | 
|  | int16_t *gy_avg2_; | 
|  | int16_t *pdiff_avg2_; | 
|  | }; | 
|  |  | 
|  | TEST_P(AV1AvgPoolingPdiffGradientTest, CheckOutput) { RunTest(0); } | 
|  | TEST_P(AV1AvgPoolingPdiffGradientTest, DISABLED_Speed) { RunTest(1); } | 
|  |  | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | C, AV1AvgPoolingPdiffGradientTest, | 
|  | BuildOptFlowHighbdParams(av1_avg_pooling_pdiff_gradients_c)); | 
|  |  | 
|  | #if HAVE_AVX2 | 
|  | INSTANTIATE_TEST_SUITE_P( | 
|  | AVX2, AV1AvgPoolingPdiffGradientTest, | 
|  | BuildOptFlowHighbdParams(av1_avg_pooling_pdiff_gradients_avx2)); | 
|  | #endif  // HAVE_AVX2 | 
|  | #endif  // CONFIG_AFFINE_REFINEMENT | 
|  | }  // namespace |