| /* |
| * Copyright (c) 2021, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 3-Clause Clear License |
| * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear |
| * License was not distributed with this source code in the LICENSE file, you |
| * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the |
| * Alliance for Open Media Patent License 1.0 was not distributed with this |
| * source code in the PATENTS file, you can obtain it at |
| * aomedia.org/license/patent-license/. |
| */ |
| |
| #include <set> |
| #include <vector> |
| #include "config/av1_rtcd.h" |
| #include "config/aom_dsp_rtcd.h" |
| #include "test/acm_random.h" |
| #include "test/clear_system_state.h" |
| #include "third_party/googletest/src/googletest/include/gtest/gtest.h" |
| |
| #include "aom_ports/aom_timer.h" |
| #include "av1/common/reconinter.h" |
| #include "av1/common/mvref_common.h" |
| |
| #if CONFIG_OPTFLOW_REFINEMENT |
| namespace { |
| |
| class BlockSize { |
| public: |
| BlockSize(int w, int h) : width_(w), height_(h) { |
| n_ = (w <= 8 && h <= 8) ? OF_MIN_BSIZE : OF_BSIZE; |
| } |
| |
| int Width() const { return width_; } |
| int Height() const { return height_; } |
| int OptFlowBlkSize() const { return n_; } |
| |
| bool operator<(const BlockSize &other) const { |
| if (Width() == other.Width()) { |
| return Height() < other.Height(); |
| } |
| return Width() < other.Width(); |
| } |
| |
| bool operator==(const BlockSize &other) const { |
| return Width() == other.Width() && Height() == other.Height(); |
| } |
| |
| private: |
| int width_; |
| int height_; |
| int n_; |
| }; |
| |
| // Block size / bit depth / test function used to parameterize the tests. |
| template <typename T> |
| class TestParam { |
| public: |
| TestParam(const BlockSize &block, int bd, T test_func) |
| : block_(block), bd_(bd), test_func_(test_func) {} |
| |
| const BlockSize &Block() const { return block_; } |
| int BitDepth() const { return bd_; } |
| T TestFunction() const { return test_func_; } |
| |
| bool operator==(const TestParam &other) const { |
| return Block() == other.Block() && BitDepth() == other.BitDepth() && |
| TestFunction() == other.TestFunction(); |
| } |
| |
| private: |
| BlockSize block_; |
| int bd_; |
| T test_func_; |
| }; |
| |
| template <typename T> |
| std::ostream &operator<<(std::ostream &os, const TestParam<T> &test_arg) { |
| return os << "TestParam { width:" << test_arg.Block().Width() |
| << " height:" << test_arg.Block().Height() |
| << " bd:" << test_arg.BitDepth() << " }"; |
| } |
| |
| // AV1OptFlowTest is the base class that all optical flow tests should derive |
| // from. |
| template <typename T> |
| class AV1OptFlowTest : public ::testing::TestWithParam<TestParam<T>> { |
| public: |
| virtual ~AV1OptFlowTest() { TearDown(); } |
| |
| virtual void SetUp() override { |
| rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); |
| } |
| |
| virtual void TearDown() override { libaom_test::ClearSystemState(); } |
| |
| // Check that two 8-bit output buffers are identical. |
| void AssertOutputEq(const int *ref, const int *test, int n) { |
| ASSERT_TRUE(ref != test) << "Buffers must be at different memory locations"; |
| for (int idx = 0; idx < n; ++idx) { |
| ASSERT_EQ(ref[idx], test[idx]) << "Mismatch at index " << idx; |
| } |
| } |
| |
| // Check that two 16-bit output buffers are identical. |
| void AssertOutputBufferEq(const int16_t *ref, const int16_t *test, int width, |
| int height, int stride) { |
| ASSERT_TRUE(ref != test) << "Buffers must be in different memory locations"; |
| for (int row = 0; row < height; ++row) { |
| for (int col = 0; col < width; ++col) { |
| ASSERT_EQ(ref[row * stride + col], test[row * stride + col]) |
| << width << "x" << height << " Pixel mismatch at (" << col << ", " |
| << row << ")"; |
| } |
| } |
| } |
| |
| uint8_t RandomFrameIdx(int max_bit_range) { |
| const int max_val = (1 << max_bit_range) - 1; |
| uint8_t rand_val = rnd_.Rand8() & max_val; |
| return rand_val; |
| } |
| |
| int8_t RelativeDistExtreme(int max_bit_range) { |
| return Rand8SingedExtremes(max_bit_range); |
| } |
| |
| void RandomInput8(uint8_t *p, const TestParam<T> ¶m) { |
| EXPECT_EQ(8, param.BitDepth()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); |
| const int bw = param.Block().Width(); |
| const int bh = param.Block().Height(); |
| Randomize(p, bw * bh); |
| } |
| |
| void Randomize9Signed(int16_t *p, int size) { |
| for (int i = 0; i < size; ++i) { |
| p[i] = rnd_.Rand9Signed(); |
| } |
| } |
| |
| void RandomInput9(int16_t *p, const TestParam<T> ¶m) { |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); |
| const int bw = param.Block().Width(); |
| const int bh = param.Block().Height(); |
| Randomize9Signed(p, bw * bh); |
| } |
| |
| void RandomInput16(uint16_t *p, const TestParam<T> ¶m, |
| int max_bit_range) { |
| EXPECT_GE(12, param.BitDepth()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); |
| const int bw = param.Block().Width(); |
| const int bh = param.Block().Height(); |
| Randomize(p, bw * bh, max_bit_range); |
| } |
| |
| void RandomInput16(int16_t *p, const TestParam<T> ¶m, int max_bit_range) { |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); |
| const int bw = param.Block().Width(); |
| const int bh = param.Block().Height(); |
| Randomize(p, bw * bh, max_bit_range); |
| } |
| |
| void RandomInput8Extreme(uint8_t *p, const TestParam<T> ¶m) { |
| EXPECT_EQ(8, param.BitDepth()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); |
| const int bw = param.Block().Width(); |
| const int bh = param.Block().Height(); |
| RandomizeExtreme(p, bw * bh); |
| } |
| |
| void RandomInput9Extreme(int16_t *p, const TestParam<T> ¶m, |
| int max_bit_range) { |
| EXPECT_GE(12, param.BitDepth()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); |
| const int bw = param.Block().Width(); |
| const int bh = param.Block().Height(); |
| Randomize9Extreme(p, bw * bh, max_bit_range); |
| } |
| |
| void RandomInput16Extreme(uint16_t *p, const TestParam<T> ¶m, |
| int max_bit_range) { |
| EXPECT_GE(12, param.BitDepth()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); |
| const int bw = param.Block().Width(); |
| const int bh = param.Block().Height(); |
| RandomizeExtreme(p, bw * bh, max_bit_range); |
| } |
| |
| void RandomInput16Extreme(int16_t *p, const TestParam<T> ¶m, |
| int max_bit_range) { |
| EXPECT_GE(12, param.BitDepth()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); |
| EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); |
| const int bw = param.Block().Width(); |
| const int bh = param.Block().Height(); |
| RandomizeExtreme(p, bw * bh, max_bit_range); |
| } |
| |
| private: |
| void Randomize(uint8_t *p, int size) { |
| for (int i = 0; i < size; ++i) { |
| p[i] = rnd_.Rand8(); |
| } |
| } |
| |
| void Randomize(uint16_t *p, int size, int max_bit_range) { |
| assert(max_bit_range <= 16); |
| for (int i = 0; i < size; ++i) { |
| p[i] = rnd_.Rand16() & ((1 << max_bit_range) - 1); |
| } |
| } |
| |
| void Randomize(int16_t *p, int size, int max_bit_range) { |
| assert(max_bit_range <= 16); |
| for (int i = 0; i < size; ++i) { |
| p[i] = (rnd_.Rand16() & ((1 << max_bit_range) - 1)) - |
| (1 << (max_bit_range - 1)); |
| } |
| } |
| |
| int RandBool() { |
| const uint32_t value = rnd_.Rand8(); |
| // There's a bit more entropy in the upper bits of this implementation. |
| return (value >> 7) & 0x1; |
| } |
| |
| uint8_t Rand8Extremes() { return static_cast<uint8_t>(RandBool() ? 255 : 0); } |
| |
| int8_t Rand8SingedExtremes(int max_bit_range) { |
| const int max_val = (1 << max_bit_range) - 1; |
| const int half_max_val = 1 << (max_bit_range - 1); |
| uint8_t r_u8 = Rand8Extremes() & max_val; |
| return static_cast<int8_t>(r_u8 - half_max_val); |
| } |
| |
| int16_t Rand9SingedExtremes(int max_bit_range) { |
| const int half_max_val = 1 << (max_bit_range - 1); |
| uint16_t r_u16 = Rand16Extremes(max_bit_range); |
| return static_cast<int16_t>(r_u16 - half_max_val); |
| } |
| |
| uint16_t Rand16Extremes(int max_bit_range) { |
| const int max_val = (1 << max_bit_range) - 1; |
| return static_cast<uint16_t>(RandBool() ? max_val : 0); |
| } |
| |
| int16_t Rand16SingedExtremes(int max_bit_range) { |
| const int half_max_val = 1 << (max_bit_range - 1); |
| uint16_t r_u16 = Rand16Extremes(max_bit_range); |
| return static_cast<int16_t>(r_u16 - half_max_val); |
| } |
| |
| void RandomizeExtreme(uint8_t *p, int size) { |
| for (int i = 0; i < size; ++i) { |
| p[i] = Rand8Extremes(); |
| } |
| } |
| |
| void RandomizeExtreme(uint16_t *p, int size, int max_bit_range) { |
| for (int i = 0; i < size; ++i) { |
| p[i] = Rand16Extremes(max_bit_range); |
| } |
| } |
| |
| void RandomizeExtreme(int16_t *p, int size, int max_bit_range) { |
| for (int i = 0; i < size; ++i) { |
| p[i] = Rand16SingedExtremes(max_bit_range); |
| } |
| } |
| |
| void Randomize9Extreme(int16_t *p, int size, int max_bit_range) { |
| for (int i = 0; i < size; ++i) { |
| p[i] = Rand9SingedExtremes(max_bit_range); |
| } |
| } |
| |
| libaom_test::ACMRandom rnd_; |
| }; |
| |
| // a function to generate test parameters for just luma block sizes. |
| template <typename T> |
| std::vector<TestParam<T>> GetOptFlowTestParams( |
| std::initializer_list<int> bit_depths, T test_func) { |
| std::set<BlockSize> sizes; |
| for (int bsize = BLOCK_8X8; bsize < BLOCK_SIZES_ALL; ++bsize) { |
| const int w = block_size_wide[bsize]; |
| const int h = block_size_high[bsize]; |
| if (w < 8 || h < 8) continue; |
| sizes.insert(BlockSize(w, h)); |
| } |
| std::vector<TestParam<T>> result; |
| for (int bit_depth : bit_depths) { |
| for (const auto &block : sizes) { |
| result.push_back(TestParam<T>(block, bit_depth, test_func)); |
| } |
| } |
| return result; |
| } |
| |
| template <typename T> |
| std::vector<TestParam<T>> GetOptFlowHighbdTestParams(T test_func) { |
| return GetOptFlowTestParams({ 8, 10, 12 }, test_func); |
| } |
| |
| template <typename T> |
| ::testing::internal::ParamGenerator<TestParam<T>> BuildOptFlowHighbdParams( |
| T test_func) { |
| return ::testing::ValuesIn(GetOptFlowHighbdTestParams(test_func)); |
| } |
| |
// Pointer to a high bit-depth optical flow MV refinement function. It takes
// two uint16_t sample buffers and their x/y gradient buffers, and writes
// through vx0/vy0/vx1/vy1. The int return value is used by the tests as the
// number of output entries to compare.
using opfl_mv_refinement_highbd = int (*)(
    const uint16_t *p0, int pstride0, const uint16_t *p1, int pstride1,
    const int16_t *gx0, const int16_t *gy0, const int16_t *gx1,
    const int16_t *gy1, int gstride, int bw, int bh, int n, int d0, int d1,
    int grad_prec_bits, int mv_prec_bits, int *vx0, int *vy0, int *vx1,
    int *vy1);
| |
| class AV1OptFlowRefineHighbdTest |
| : public AV1OptFlowTest<opfl_mv_refinement_highbd> { |
| public: |
| AV1OptFlowRefineHighbdTest() { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| |
| input0_ = (uint16_t *)aom_memalign(16, bw * bh * sizeof(uint16_t)); |
| input1_ = (uint16_t *)aom_memalign(16, bw * bh * sizeof(uint16_t)); |
| gx0_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| gy0_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| gx1_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| gy1_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| } |
| |
| ~AV1OptFlowRefineHighbdTest() { |
| aom_free(input0_); |
| aom_free(input1_); |
| aom_free(gx0_); |
| aom_free(gy0_); |
| aom_free(gx1_); |
| aom_free(gy1_); |
| } |
| |
| void RunTest(const int is_speed) { |
| OrderHintInfo oh_info; |
| const BlockSize &block = GetParam().Block(); |
| const int bd = GetParam().BitDepth(); |
| const int bw_log2 = block.Width() >> MI_SIZE_LOG2; |
| const int bh_log2 = block.Height() >> MI_SIZE_LOG2; |
| const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); |
| const int oh_start_bits = is_speed ? kMaxOrderHintBits : 1; |
| |
| oh_info.enable_order_hint = 1; |
| for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; oh_bits++) { |
| for (int count = 0; count < numIter;) { |
| const int cur_frm_idx = RandomFrameIdx(oh_bits); |
| const int ref0_frm_idx = RandomFrameIdx(oh_bits); |
| const int ref1_frm_idx = RandomFrameIdx(oh_bits); |
| |
| oh_info.order_hint_bits_minus_1 = oh_bits - 1; |
| const int d0 = get_relative_dist(&oh_info, cur_frm_idx, ref0_frm_idx); |
| const int d1 = get_relative_dist(&oh_info, cur_frm_idx, ref1_frm_idx); |
| if (!d0 || !d1) continue; |
| |
| RandomInput16(input0_, GetParam(), bd); |
| RandomInput16(input1_, GetParam(), bd); |
| RandomInput16(gx0_, GetParam(), bd + 1); |
| RandomInput16(gy0_, GetParam(), bd + 1); |
| RandomInput16(gx1_, GetParam(), bd + 1); |
| RandomInput16(gy1_, GetParam(), bd + 1); |
| |
| TestOptFlowRefine(input0_, input1_, gx0_, gy0_, gx1_, gy1_, is_speed, |
| d0, d1); |
| count++; |
| } |
| } |
| if (is_speed) return; |
| |
| // Extreme value test |
| for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; |
| oh_bits += kMaxOrderHintBits - 1) { |
| for (int count = 0; count < numIter;) { |
| const int d0 = RelativeDistExtreme(oh_bits); |
| const int d1 = RelativeDistExtreme(oh_bits); |
| if (!d0 || !d1) continue; |
| |
| RandomInput16Extreme(input0_, GetParam(), bd); |
| RandomInput16Extreme(input1_, GetParam(), bd); |
| RandomInput16Extreme(gx0_, GetParam(), bd + 1); |
| RandomInput16Extreme(gy0_, GetParam(), bd + 1); |
| RandomInput16Extreme(gx1_, GetParam(), bd + 1); |
| RandomInput16Extreme(gy1_, GetParam(), bd + 1); |
| |
| TestOptFlowRefine(input0_, input1_, gx0_, gy0_, gx1_, gy1_, 0, d0, d1); |
| count++; |
| } |
| } |
| } |
| |
| private: |
| void TestOptFlowRefine(uint16_t *input0, uint16_t *input1, int16_t *gx0, |
| int16_t *gy0, int16_t *gx1, int16_t *gy1, |
| const int is_speed, int d0, int d1) { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| const int n = block.OptFlowBlkSize(); |
| |
| opfl_mv_refinement_highbd ref_func = av1_opfl_mv_refinement_nxn_highbd_c; |
| opfl_mv_refinement_highbd test_func = GetParam().TestFunction(); |
| |
| if (is_speed) |
| OptFlowRefineSpeed(ref_func, test_func, input0, input1, gx0, gy0, gx1, |
| gy1, bw, bh, n, d0, d1); |
| else |
| OptFlowRefine(ref_func, test_func, input0, input1, gx0, gy0, gx1, gy1, bw, |
| bh, n, d0, d1); |
| } |
| |
| void OptFlowRefine(opfl_mv_refinement_highbd ref_func, |
| opfl_mv_refinement_highbd test_func, |
| const uint16_t *input0, const uint16_t *input1, |
| const int16_t *gx0, const int16_t *gy0, const int16_t *gx1, |
| const int16_t *gy1, int bw, int bh, int n, int d0, |
| int d1) { |
| int ref_out[4 * N_OF_OFFSETS] = { 0 }; |
| int test_out[4 * N_OF_OFFSETS] = { 0 }; |
| const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2; |
| const int mv_prec_bits = MV_REFINE_PREC_BITS; |
| int stride0 = bw; |
| int stride1 = bw; |
| int gstride = bw; |
| int n_blocks = 0; |
| |
| n_blocks = ref_func( |
| input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride, bw, bh, |
| n, d0, d1, grad_prec_bits, mv_prec_bits, &ref_out[kVX_0 * N_OF_OFFSETS], |
| &ref_out[kVY_0 * N_OF_OFFSETS], &ref_out[kVX_1 * N_OF_OFFSETS], |
| &ref_out[kVY_1 * N_OF_OFFSETS]); |
| test_func(input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride, bw, |
| bh, n, d0, d1, grad_prec_bits, mv_prec_bits, |
| &test_out[kVX_0 * N_OF_OFFSETS], &test_out[kVY_0 * N_OF_OFFSETS], |
| &test_out[kVX_1 * N_OF_OFFSETS], &test_out[kVY_1 * N_OF_OFFSETS]); |
| |
| AssertOutputEq(&ref_out[kVX_0 * N_OF_OFFSETS], |
| &test_out[kVX_0 * N_OF_OFFSETS], n_blocks); |
| AssertOutputEq(&ref_out[kVY_0 * N_OF_OFFSETS], |
| &test_out[kVY_0 * N_OF_OFFSETS], n_blocks); |
| AssertOutputEq(&ref_out[kVX_1 * N_OF_OFFSETS], |
| &test_out[kVX_1 * N_OF_OFFSETS], n_blocks); |
| AssertOutputEq(&ref_out[kVY_1 * N_OF_OFFSETS], |
| &test_out[kVY_1 * N_OF_OFFSETS], n_blocks); |
| } |
| |
| void OptFlowRefineSpeed(opfl_mv_refinement_highbd ref_func, |
| opfl_mv_refinement_highbd test_func, |
| const uint16_t *input0, const uint16_t *input1, |
| const int16_t *gx0, const int16_t *gy0, |
| const int16_t *gx1, const int16_t *gy1, int bw, |
| int bh, int n, int d0, int d1) { |
| int ref_out[4 * N_OF_OFFSETS] = { 0 }; |
| int test_out[4 * N_OF_OFFSETS] = { 0 }; |
| const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2; |
| const int mv_prec_bits = MV_REFINE_PREC_BITS; |
| const int bw_log2 = bw >> MI_SIZE_LOG2; |
| const int bh_log2 = bh >> MI_SIZE_LOG2; |
| int stride0 = bw; |
| int stride1 = bw; |
| int gstride = bw; |
| |
| const int numIter = 2097152 / (bw_log2 * bh_log2); |
| aom_usec_timer timer_ref; |
| aom_usec_timer timer_test; |
| |
| aom_usec_timer_start(&timer_ref); |
| for (int count = 0; count < numIter; count++) { |
| ref_func(input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride, |
| bw, bh, n, d0, d1, grad_prec_bits, mv_prec_bits, |
| &ref_out[kVX_0 * N_OF_OFFSETS], &ref_out[kVY_0 * N_OF_OFFSETS], |
| &ref_out[kVX_1 * N_OF_OFFSETS], &ref_out[kVY_1 * N_OF_OFFSETS]); |
| } |
| aom_usec_timer_mark(&timer_ref); |
| |
| aom_usec_timer_start(&timer_test); |
| for (int count = 0; count < numIter; count++) { |
| test_func( |
| input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride, bw, bh, |
| n, d0, d1, grad_prec_bits, mv_prec_bits, |
| &test_out[kVX_0 * N_OF_OFFSETS], &test_out[kVY_0 * N_OF_OFFSETS], |
| &test_out[kVX_1 * N_OF_OFFSETS], &test_out[kVY_1 * N_OF_OFFSETS]); |
| } |
| aom_usec_timer_mark(&timer_test); |
| |
| const int total_time_ref = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); |
| const int total_time_test = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_test)); |
| |
| printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref, |
| total_time_test, |
| (static_cast<float>(total_time_ref) / |
| static_cast<float>(total_time_test))); |
| } |
| |
| static constexpr int kVX_0 = 0; |
| static constexpr int kVX_1 = 1; |
| static constexpr int kVY_0 = 2; |
| static constexpr int kVY_1 = 3; |
| static constexpr int kMaxOrderHintBits = 8; |
| static constexpr int kSubpelGradDeltaBits = 3; |
| uint16_t *input0_; |
| uint16_t *input1_; |
| int16_t *gx0_; |
| int16_t *gy0_; |
| int16_t *gx1_; |
| int16_t *gy1_; |
| }; |
| TEST_P(AV1OptFlowRefineHighbdTest, CheckOutput) { RunTest(0); } |
| TEST_P(AV1OptFlowRefineHighbdTest, DISABLED_Speed) { RunTest(1); } |
| |
| INSTANTIATE_TEST_SUITE_P( |
| C, AV1OptFlowRefineHighbdTest, |
| BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_highbd_c)); |
| |
| #if HAVE_SSE4_1 |
| INSTANTIATE_TEST_SUITE_P( |
| SSE4_1, AV1OptFlowRefineHighbdTest, |
| BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_highbd_sse4_1)); |
| #endif |
| |
| #if OPFL_BICUBIC_GRAD |
// Pointer to a bicubic gradient interpolation function: reads pred_src and
// writes the x and y gradients for a blk_width x blk_height block.
using bicubic_grad_interp_highbd = void (*)(const int16_t *pred_src,
                                            int16_t *x_grad, int16_t *y_grad,
                                            const int blk_width,
                                            const int blk_height);
| |
| class AV1OptFlowBiCubicGradHighbdTest |
| : public AV1OptFlowTest<bicubic_grad_interp_highbd> { |
| public: |
| AV1OptFlowBiCubicGradHighbdTest() { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| |
| pred_src_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| x_grad_ref_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| y_grad_ref_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| x_grad_test_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| y_grad_test_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| |
| memset(x_grad_ref_, 0, bw * bh * sizeof(int16_t)); |
| memset(y_grad_ref_, 0, bw * bh * sizeof(int16_t)); |
| memset(x_grad_test_, 0, bw * bh * sizeof(int16_t)); |
| memset(y_grad_test_, 0, bw * bh * sizeof(int16_t)); |
| } |
| |
| ~AV1OptFlowBiCubicGradHighbdTest() { |
| aom_free(pred_src_); |
| aom_free(x_grad_ref_); |
| aom_free(y_grad_ref_); |
| aom_free(x_grad_test_); |
| aom_free(y_grad_test_); |
| } |
| |
| void Run(const int is_speed) { |
| const BlockSize &block = GetParam().Block(); |
| const int bd = GetParam().BitDepth(); |
| const int bw_log2 = block.Width() >> MI_SIZE_LOG2; |
| const int bh_log2 = block.Height() >> MI_SIZE_LOG2; |
| const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); |
| |
| for (int count = 0; count < numIter; count++) { |
| RandomInput16(pred_src_, GetParam(), bd); |
| TestBicubicGradHighbd(pred_src_, x_grad_ref_, y_grad_ref_, x_grad_test_, |
| y_grad_test_, is_speed); |
| } |
| if (is_speed) return; |
| |
| for (int count = 0; count < numIter; count++) { |
| RandomInput16Extreme((uint16_t *)pred_src_, GetParam(), bd); |
| TestBicubicGradHighbd(pred_src_, x_grad_ref_, y_grad_ref_, x_grad_test_, |
| y_grad_test_, 0); |
| } |
| } |
| |
| private: |
| void TestBicubicGradHighbd(int16_t *pred_src, int16_t *x_grad_ref, |
| int16_t *y_grad_ref, int16_t *x_grad_test, |
| int16_t *y_grad_test, int is_speed) { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| |
| bicubic_grad_interp_highbd ref_func = |
| av1_bicubic_grad_interpolation_highbd_c; |
| bicubic_grad_interp_highbd test_func = GetParam().TestFunction(); |
| if (is_speed) |
| BicubicGradHighbdSpeed(ref_func, test_func, pred_src, x_grad_ref, |
| y_grad_ref, x_grad_test, y_grad_test, bw, bh); |
| else |
| BicubicGradHighbd(ref_func, test_func, pred_src, x_grad_ref, y_grad_ref, |
| x_grad_test, y_grad_test, bw, bh); |
| } |
| |
| void BicubicGradHighbd(bicubic_grad_interp_highbd ref_func, |
| bicubic_grad_interp_highbd test_func, |
| const int16_t *pred_src, int16_t *x_grad_ref, |
| int16_t *y_grad_ref, int16_t *x_grad_test, |
| int16_t *y_grad_test, const int bw, const int bh) { |
| ref_func(pred_src, x_grad_ref, y_grad_ref, bw, bh); |
| test_func(pred_src, x_grad_test, y_grad_test, bw, bh); |
| |
| AssertOutputBufferEq(x_grad_ref, x_grad_test, bw, bh, bw); |
| AssertOutputBufferEq(y_grad_ref, y_grad_test, bw, bh, bw); |
| } |
| |
| void BicubicGradHighbdSpeed(bicubic_grad_interp_highbd ref_func, |
| bicubic_grad_interp_highbd test_func, |
| int16_t *pred_src, int16_t *x_grad_ref, |
| int16_t *y_grad_ref, int16_t *x_grad_test, |
| int16_t *y_grad_test, const int bw, |
| const int bh) { |
| const int bw_log2 = bw >> MI_SIZE_LOG2; |
| const int bh_log2 = bh >> MI_SIZE_LOG2; |
| |
| const int numIter = 2097152 / (bw_log2 * bh_log2); |
| aom_usec_timer timer_ref; |
| aom_usec_timer timer_test; |
| |
| aom_usec_timer_start(&timer_ref); |
| for (int count = 0; count < numIter; count++) |
| ref_func(pred_src, x_grad_ref, y_grad_ref, bw, bh); |
| aom_usec_timer_mark(&timer_ref); |
| |
| aom_usec_timer_start(&timer_test); |
| for (int count = 0; count < numIter; count++) |
| test_func(pred_src, x_grad_test, y_grad_test, bw, bh); |
| aom_usec_timer_mark(&timer_test); |
| |
| const int total_time_ref = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); |
| const int total_time_test = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_test)); |
| |
| printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref, |
| total_time_test, |
| (static_cast<float>(total_time_ref) / |
| static_cast<float>(total_time_test))); |
| } |
| |
| int16_t *pred_src_; |
| int16_t *x_grad_ref_; |
| int16_t *y_grad_ref_; |
| int16_t *x_grad_test_; |
| int16_t *y_grad_test_; |
| }; |
| TEST_P(AV1OptFlowBiCubicGradHighbdTest, CheckOutput) { Run(0); } |
| TEST_P(AV1OptFlowBiCubicGradHighbdTest, DISABLED_Speed) { Run(1); } |
| |
| INSTANTIATE_TEST_SUITE_P( |
| C, AV1OptFlowBiCubicGradHighbdTest, |
| BuildOptFlowHighbdParams(av1_bicubic_grad_interpolation_highbd_c)); |
| |
| #if HAVE_SSE4_1 |
| INSTANTIATE_TEST_SUITE_P( |
| SSE4_1, AV1OptFlowBiCubicGradHighbdTest, |
| BuildOptFlowHighbdParams(av1_bicubic_grad_interpolation_highbd_sse4_1)); |
| #endif |
| #endif // OPFL_BICUBIC_GRAD |
| |
| #if OPFL_COMBINE_INTERP_GRAD_LS |
// Pointer to an optical flow MV refinement function that takes a single
// int16_t difference buffer and one pair of gradient buffers. It writes
// through vx0/vy0/vx1/vy1. The int return value is used by the tests as the
// number of output entries to compare.
using opfl_mv_refinement_interp_grad = int (*)(
    const int16_t *pdiff, int pstride, const int16_t *gx, const int16_t *gy,
    int gstride, int bw, int bh, int n, int d0, int d1, int grad_prec_bits,
    int mv_prec_bits, int *vx0, int *vy0, int *vx1, int *vy1);
| |
| class AV1OptFlowRefineInterpGradTest |
| : public AV1OptFlowTest<opfl_mv_refinement_interp_grad> { |
| public: |
| AV1OptFlowRefineInterpGradTest() { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| |
| input_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(*input_)); |
| gx_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(*gx_)); |
| gy_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(*gy_)); |
| } |
| |
| ~AV1OptFlowRefineInterpGradTest() { |
| aom_free(input_); |
| aom_free(gx_); |
| aom_free(gy_); |
| } |
| |
| void RunTest(const int is_speed) { |
| OrderHintInfo oh_info; |
| const BlockSize &block = GetParam().Block(); |
| const int bd = GetParam().BitDepth(); |
| const int bw_log2 = block.Width() >> MI_SIZE_LOG2; |
| const int bh_log2 = block.Height() >> MI_SIZE_LOG2; |
| const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); |
| const int oh_start_bits = is_speed ? kMaxOrderHintBits : 1; |
| |
| oh_info.enable_order_hint = 1; |
| for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; oh_bits++) { |
| for (int count = 0; count < numIter;) { |
| const int cur_frm_idx = RandomFrameIdx(oh_bits); |
| const int ref0_frm_idx = RandomFrameIdx(oh_bits); |
| const int ref1_frm_idx = RandomFrameIdx(oh_bits); |
| |
| oh_info.order_hint_bits_minus_1 = oh_bits - 1; |
| const int d0 = get_relative_dist(&oh_info, cur_frm_idx, ref0_frm_idx); |
| const int d1 = get_relative_dist(&oh_info, cur_frm_idx, ref1_frm_idx); |
| if (!d0 || !d1) continue; |
| |
| // Here, the input corresponds to 'd0*p0 - d1*p1' (where P0 and P1 can |
| // be 12 bits, d0 and d1 can be >=5 bits) and gx, gy are gradients of |
| // input. Due to the clamping of these value to [INT16_MIN, INT16_MAX], |
| // testing of the same is required. Hence, populating the input_, gx_ |
| // and gy_ buffers as per the requirement. |
| RandomInput16(input_, GetParam(), AOMMIN(16, bd + 1)); |
| RandomInput16(gx_, GetParam(), AOMMIN(16, bd + 6)); |
| RandomInput16(gy_, GetParam(), AOMMIN(16, bd + 6)); |
| |
| TestOptFlowRefine(input_, gx_, gy_, is_speed, d0, d1); |
| count++; |
| } |
| } |
| if (is_speed) return; |
| |
| // Extreme value test |
| for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; |
| oh_bits += kMaxOrderHintBits - 1) { |
| for (int count = 0; count < numIter;) { |
| const int d0 = RelativeDistExtreme(oh_bits); |
| const int d1 = RelativeDistExtreme(oh_bits); |
| if (!d0 || !d1) continue; |
| |
| RandomInput16Extreme(input_, GetParam(), AOMMIN(16, bd + 1)); |
| RandomInput16Extreme(gx_, GetParam(), AOMMIN(16, bd + 6)); |
| RandomInput16Extreme(gy_, GetParam(), AOMMIN(16, bd + 6)); |
| |
| TestOptFlowRefine(input_, gx_, gy_, 0, d0, d1); |
| count++; |
| } |
| } |
| } |
| |
| private: |
| void TestOptFlowRefine(int16_t *input, int16_t *gx, int16_t *gy, |
| const int is_speed, int d0, int d1) { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| const int n = block.OptFlowBlkSize(); |
| |
| opfl_mv_refinement_interp_grad ref_func = |
| av1_opfl_mv_refinement_nxn_interp_grad_c; |
| opfl_mv_refinement_interp_grad test_func = GetParam().TestFunction(); |
| |
| if (is_speed) |
| OptFlowRefineSpeed(ref_func, test_func, input, gx, gy, bw, bh, n, d0, d1); |
| else |
| OptFlowRefine(ref_func, test_func, input, gx, gy, bw, bh, n, d0, d1); |
| } |
| |
| void OptFlowRefine(opfl_mv_refinement_interp_grad ref_func, |
| opfl_mv_refinement_interp_grad test_func, |
| const int16_t *input, const int16_t *gx, const int16_t *gy, |
| int bw, int bh, int n, int d0, int d1) { |
| int ref_out[4 * N_OF_OFFSETS] = { 0 }; |
| int test_out[4 * N_OF_OFFSETS] = { 0 }; |
| const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2; |
| const int mv_prec_bits = MV_REFINE_PREC_BITS; |
| int stride = bw; |
| int gstride = bw; |
| |
| int n_blocks_ref = |
| ref_func(input, stride, gx, gy, gstride, bw, bh, n, d0, d1, |
| grad_prec_bits, mv_prec_bits, &ref_out[kVX_0 * N_OF_OFFSETS], |
| &ref_out[kVY_0 * N_OF_OFFSETS], &ref_out[kVX_1 * N_OF_OFFSETS], |
| &ref_out[kVY_1 * N_OF_OFFSETS]); |
| int n_blocks = test_func( |
| input, stride, gx, gy, gstride, bw, bh, n, d0, d1, grad_prec_bits, |
| mv_prec_bits, &test_out[kVX_0 * N_OF_OFFSETS], |
| &test_out[kVY_0 * N_OF_OFFSETS], &test_out[kVX_1 * N_OF_OFFSETS], |
| &test_out[kVY_1 * N_OF_OFFSETS]); |
| |
| ASSERT_EQ(n_blocks_ref, n_blocks) << "Mismatch of subblock numbers"; |
| AssertOutputEq(&ref_out[kVX_0 * N_OF_OFFSETS], |
| &test_out[kVX_0 * N_OF_OFFSETS], n_blocks); |
| AssertOutputEq(&ref_out[kVY_0 * N_OF_OFFSETS], |
| &test_out[kVY_0 * N_OF_OFFSETS], n_blocks); |
| AssertOutputEq(&ref_out[kVX_1 * N_OF_OFFSETS], |
| &test_out[kVX_1 * N_OF_OFFSETS], n_blocks); |
| AssertOutputEq(&ref_out[kVY_1 * N_OF_OFFSETS], |
| &test_out[kVY_1 * N_OF_OFFSETS], n_blocks); |
| } |
| |
| void OptFlowRefineSpeed(opfl_mv_refinement_interp_grad ref_func, |
| opfl_mv_refinement_interp_grad test_func, |
| const int16_t *input, const int16_t *gx, |
| const int16_t *gy, int bw, int bh, int n, int d0, |
| int d1) { |
| int ref_out[4 * N_OF_OFFSETS] = { 0 }; |
| int test_out[4 * N_OF_OFFSETS] = { 0 }; |
| const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2; |
| const int mv_prec_bits = MV_REFINE_PREC_BITS; |
| const int bw_log2 = bw >> MI_SIZE_LOG2; |
| const int bh_log2 = bh >> MI_SIZE_LOG2; |
| int stride = bw; |
| int gstride = bw; |
| |
| const int numIter = 2097152 / (bw_log2 * bh_log2); |
| aom_usec_timer timer_ref; |
| aom_usec_timer timer_test; |
| |
| aom_usec_timer_start(&timer_ref); |
| for (int count = 0; count < numIter; count++) { |
| ref_func(input, stride, gx, gy, gstride, bw, bh, n, d0, d1, |
| grad_prec_bits, mv_prec_bits, &ref_out[kVX_0 * N_OF_OFFSETS], |
| &ref_out[kVY_0 * N_OF_OFFSETS], &ref_out[kVX_1 * N_OF_OFFSETS], |
| &ref_out[kVY_1 * N_OF_OFFSETS]); |
| } |
| aom_usec_timer_mark(&timer_ref); |
| |
| aom_usec_timer_start(&timer_test); |
| for (int count = 0; count < numIter; count++) { |
| test_func(input, stride, gx, gy, gstride, bw, bh, n, d0, d1, |
| grad_prec_bits, mv_prec_bits, &test_out[kVX_0 * N_OF_OFFSETS], |
| &test_out[kVY_0 * N_OF_OFFSETS], |
| &test_out[kVX_1 * N_OF_OFFSETS], |
| &test_out[kVY_1 * N_OF_OFFSETS]); |
| } |
| aom_usec_timer_mark(&timer_test); |
| |
| const int total_time_ref = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); |
| const int total_time_test = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_test)); |
| |
| printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref, |
| total_time_test, |
| (static_cast<float>(total_time_ref) / |
| static_cast<float>(total_time_test))); |
| } |
| |
| static constexpr int kVX_0 = 0; |
| static constexpr int kVX_1 = 1; |
| static constexpr int kVY_0 = 2; |
| static constexpr int kVY_1 = 3; |
| static constexpr int kMaxOrderHintBits = 8; |
| static constexpr int kSubpelGradDeltaBits = 3; |
| int16_t *input_; |
| int16_t *gx_; |
| int16_t *gy_; |
| }; |
| TEST_P(AV1OptFlowRefineInterpGradTest, CheckOutput) { RunTest(0); } |
| TEST_P(AV1OptFlowRefineInterpGradTest, DISABLED_Speed) { RunTest(1); } |
| |
| INSTANTIATE_TEST_SUITE_P( |
| C, AV1OptFlowRefineInterpGradTest, |
| BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_interp_grad_c)); |
| |
| #if HAVE_SSE4_1 |
| INSTANTIATE_TEST_SUITE_P( |
| SSE4_1, AV1OptFlowRefineInterpGradTest, |
| BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_interp_grad_sse4_1)); |
| #endif |
| #endif // OPFL_COMBINE_INTERP_GRAD_LS |
| |
| #if OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD |
// Pointer to a function with the signature shared by
// av1_copy_pred_array_highbd_c and its SIMD variants: two uint16_t sources,
// two int16_t destinations, block width/height, d0/d1 and a `centered` flag.
using pred_buffer_copy_highbd = void (*)(const uint16_t *src1,
                                         const uint16_t *src2, int16_t *dst1,
                                         int16_t *dst2, int bw, int bh,
                                         int d0, int d1, int centered);
| |
| class AV1OptFlowCopyPredHighbdTest |
| : public AV1OptFlowTest<pred_buffer_copy_highbd> { |
| public: |
| AV1OptFlowCopyPredHighbdTest() { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| |
| src_buf1_ = (uint16_t *)aom_memalign(16, bw * bh * sizeof(*src_buf1_)); |
| src_buf2_ = (uint16_t *)aom_memalign(16, bw * bh * sizeof(*src_buf2_)); |
| dst_buf1_ref_ = |
| (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf1_ref_)); |
| dst_buf2_ref_ = |
| (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf2_ref_)); |
| dst_buf1_test_ = |
| (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf1_test_)); |
| dst_buf2_test_ = |
| (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf2_test_)); |
| |
| memset(dst_buf2_ref_, 0, bw * bh * sizeof(*dst_buf2_ref_)); |
| memset(dst_buf2_test_, 0, bw * bh * sizeof(*dst_buf2_test_)); |
| } |
| |
| ~AV1OptFlowCopyPredHighbdTest() { |
| aom_free(src_buf1_); |
| aom_free(src_buf2_); |
| aom_free(dst_buf1_ref_); |
| aom_free(dst_buf2_ref_); |
| aom_free(dst_buf1_test_); |
| aom_free(dst_buf2_test_); |
| } |
| |
| void Run(const int is_speed) { |
| OrderHintInfo oh_info; |
| const BlockSize &block = GetParam().Block(); |
| const int bw_log2 = block.Width() >> MI_SIZE_LOG2; |
| const int bh_log2 = block.Height() >> MI_SIZE_LOG2; |
| const int bd = GetParam().BitDepth(); |
| const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); |
| const int oh_start_bits = is_speed ? kMaxOrderHintBits : 1; |
| |
| oh_info.enable_order_hint = 1; |
| for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; oh_bits++) { |
| for (int count = 0; count < numIter;) { |
| const int cur_frm_idx = RandomFrameIdx(oh_bits); |
| const int ref0_frm_idx = RandomFrameIdx(oh_bits); |
| const int ref1_frm_idx = RandomFrameIdx(oh_bits); |
| |
| oh_info.order_hint_bits_minus_1 = oh_bits - 1; |
| const int d0 = get_relative_dist(&oh_info, cur_frm_idx, ref0_frm_idx); |
| const int d1 = get_relative_dist(&oh_info, cur_frm_idx, ref1_frm_idx); |
| if (!d0 || !d1) continue; |
| |
| RandomInput16(src_buf1_, GetParam(), bd); |
| RandomInput16(src_buf2_, GetParam(), bd); |
| TestCopyPredArray(src_buf1_, src_buf2_, dst_buf1_ref_, dst_buf2_ref_, |
| dst_buf1_test_, dst_buf2_test_, d0, d1, is_speed); |
| count++; |
| } |
| } |
| if (is_speed) return; |
| |
| // Extreme value test |
| for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; |
| oh_bits += kMaxOrderHintBits - 1) { |
| for (int count = 0; count < numIter;) { |
| const int d0 = RelativeDistExtreme(oh_bits); |
| const int d1 = RelativeDistExtreme(oh_bits); |
| if (!d0 || !d1) continue; |
| |
| RandomInput16Extreme(src_buf1_, GetParam(), bd); |
| RandomInput16Extreme(src_buf2_, GetParam(), bd); |
| TestCopyPredArray(src_buf1_, src_buf2_, dst_buf1_ref_, dst_buf2_ref_, |
| dst_buf1_test_, dst_buf2_test_, d0, d1, 0); |
| count++; |
| } |
| } |
| } |
| |
| private: |
| void TestCopyPredArray(uint16_t *src_buf1, uint16_t *src_buf2, |
| int16_t *dst_buf1_ref, int16_t *dst_buf2_ref, |
| int16_t *dst_buf1_test, int16_t *dst_buf2_test, int d0, |
| int d1, int is_speed) { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| |
| pred_buffer_copy_highbd ref_func = av1_copy_pred_array_highbd_c; |
| pred_buffer_copy_highbd test_func = GetParam().TestFunction(); |
| if (is_speed) |
| CopyPredArraySpeed(ref_func, test_func, src_buf1, src_buf2, dst_buf1_ref, |
| dst_buf2_ref, dst_buf1_test, dst_buf2_test, d0, d1, bw, |
| bh); |
| else |
| CopyPredArray(ref_func, test_func, src_buf1, src_buf2, dst_buf1_ref, |
| dst_buf2_ref, dst_buf1_test, dst_buf2_test, d0, d1, bw, bh); |
| } |
| |
| void CopyPredArray(pred_buffer_copy_highbd ref_func, |
| pred_buffer_copy_highbd test_func, |
| const uint16_t *src_buf1, uint16_t *src_buf2, |
| int16_t *dst_buf1_ref, int16_t *dst_buf2_ref, |
| int16_t *dst_buf1_test, int16_t *dst_buf2_test, |
| const int d0, const int d1, const int bw, const int bh) { |
| ref_func(src_buf1, src_buf2, dst_buf1_ref, dst_buf2_ref, bw, bh, d0, d1, 0); |
| test_func(src_buf1, src_buf2, dst_buf1_test, dst_buf2_test, bw, bh, d0, d1, |
| 0); |
| |
| AssertOutputBufferEq(dst_buf1_ref, dst_buf1_test, bw, bh, bw); |
| AssertOutputBufferEq(dst_buf2_ref, dst_buf2_test, bw, bh, bw); |
| } |
| |
| void CopyPredArraySpeed(pred_buffer_copy_highbd ref_func, |
| pred_buffer_copy_highbd test_func, |
| const uint16_t *src_buf1, uint16_t *src_buf2, |
| int16_t *dst_buf1_ref, int16_t *dst_buf2_ref, |
| int16_t *dst_buf1_test, int16_t *dst_buf2_test, |
| const int d0, const int d1, const int bw, |
| const int bh) { |
| const int bw_log2 = bw >> MI_SIZE_LOG2; |
| const int bh_log2 = bh >> MI_SIZE_LOG2; |
| printf("bw=%d, bh=%d\n", bw, bh); |
| const int numIter = 2097152 / (bw_log2 * bh_log2); |
| aom_usec_timer timer_ref; |
| aom_usec_timer timer_test; |
| |
| aom_usec_timer_start(&timer_ref); |
| for (int count = 0; count < numIter; count++) |
| ref_func(src_buf1, src_buf2, dst_buf1_ref, dst_buf2_ref, bw, bh, d0, d1, |
| 0); |
| aom_usec_timer_mark(&timer_ref); |
| |
| aom_usec_timer_start(&timer_test); |
| for (int count = 0; count < numIter; count++) |
| test_func(src_buf1, src_buf2, dst_buf1_test, dst_buf2_test, bw, bh, d0, |
| d1, 0); |
| aom_usec_timer_mark(&timer_test); |
| |
| const int total_time_ref = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); |
| const int total_time_test = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_test)); |
| |
| printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref, |
| total_time_test, |
| (static_cast<float>(total_time_ref) / |
| static_cast<float>(total_time_test))); |
| } |
| |
| uint16_t *src_buf1_; |
| uint16_t *src_buf2_; |
| int16_t *dst_buf1_ref_; |
| int16_t *dst_buf2_ref_; |
| int16_t *dst_buf1_test_; |
| int16_t *dst_buf2_test_; |
| static constexpr int kMaxOrderHintBits = 8; |
| }; |
| |
| TEST_P(AV1OptFlowCopyPredHighbdTest, CheckOutput) { Run(0); } |
| TEST_P(AV1OptFlowCopyPredHighbdTest, DISABLED_Speed) { Run(1); } |
| |
| INSTANTIATE_TEST_SUITE_P( |
| C, AV1OptFlowCopyPredHighbdTest, |
| BuildOptFlowHighbdParams(av1_copy_pred_array_highbd_c)); |
| |
| #if HAVE_SSE4_1 |
| INSTANTIATE_TEST_SUITE_P( |
| SSE4_1, AV1OptFlowCopyPredHighbdTest, |
| BuildOptFlowHighbdParams(av1_copy_pred_array_highbd_sse4_1)); |
| #endif |
| #endif // OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD |
| |
| #if CONFIG_AFFINE_REFINEMENT |
| #if OPFL_COMBINE_INTERP_GRAD_LS |
// Pointer to a function with the signature shared by
// av1_calc_affine_autocorrelation_matrix_c and its SIMD variants: a pdiff
// buffer with its stride, gx/gy buffers with their shared stride, block
// width/height, and int64_t outputs mat_a and vec_b.
using calc_affine_autocorrelation_matrix = void (*)(
    const int16_t *pdiff, int pstride, const int16_t *gx, const int16_t *gy,
    int gstride, int bw, int bh, int64_t *mat_a, int64_t *vec_b);
| |
| class AV1CalcAffineAutocorrelationMatrixTest |
| : public AV1OptFlowTest<calc_affine_autocorrelation_matrix> { |
| public: |
| AV1CalcAffineAutocorrelationMatrixTest() { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| |
| gx_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| gy_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| pdiff_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| } |
| |
| ~AV1CalcAffineAutocorrelationMatrixTest() { |
| aom_free(gx_); |
| aom_free(gy_); |
| aom_free(pdiff_); |
| } |
| |
| void RunTest(const int is_speed) { |
| const BlockSize &block = GetParam().Block(); |
| const int bd = GetParam().BitDepth(); |
| const int bw_log2 = block.Width() >> MI_SIZE_LOG2; |
| const int bh_log2 = block.Height() >> MI_SIZE_LOG2; |
| const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); |
| |
| for (int count = 0; count < numIter; count++) { |
| RandomInput16(gx_, GetParam(), 16); |
| RandomInput16(gy_, GetParam(), 16); |
| RandomInput16(pdiff_, GetParam(), bd + 1); |
| |
| TestCalcAffineAutoCorrelationMatrix(pdiff_, gx_, gy_, is_speed); |
| } |
| if (is_speed) return; |
| |
| // Extreme value test |
| for (int count = 0; count < numIter; count++) { |
| RandomInput16Extreme(gx_, GetParam(), 16); |
| RandomInput16Extreme(gy_, GetParam(), 16); |
| RandomInput16Extreme(pdiff_, GetParam(), bd + 1); |
| |
| TestCalcAffineAutoCorrelationMatrix(pdiff_, gx_, gy_, is_speed); |
| } |
| } |
| |
| private: |
| void TestCalcAffineAutoCorrelationMatrix(const int16_t *pdiff, int16_t *gx, |
| int16_t *gy, const int is_speed) { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| int pstride = bw; |
| int gstride = bw; |
| |
| calc_affine_autocorrelation_matrix ref_func = |
| av1_calc_affine_autocorrelation_matrix_c; |
| calc_affine_autocorrelation_matrix test_func = GetParam().TestFunction(); |
| |
| if (is_speed) |
| CalcAffineAutoCorrelationMatrixSpeed(ref_func, test_func, pdiff, pstride, |
| gx, gy, gstride, bw, bh); |
| else |
| AffineAutoCorrelationMatrix(ref_func, test_func, pdiff, pstride, gx, gy, |
| gstride, bw, bh); |
| } |
| |
| void AffineAutoCorrelationMatrix(calc_affine_autocorrelation_matrix ref_func, |
| calc_affine_autocorrelation_matrix test_func, |
| const int16_t *pdiff, int pstride, |
| const int16_t *gx, const int16_t *gy, |
| int gstride, int bw, int bh) { |
| DECLARE_ALIGNED(32, int64_t, mat_ref[16]); |
| DECLARE_ALIGNED(32, int64_t, mat_test[16]); |
| DECLARE_ALIGNED(32, int64_t, vec_ref[4]); |
| DECLARE_ALIGNED(32, int64_t, vec_test[4]); |
| memset(mat_ref, 0, sizeof(mat_ref)); |
| memset(mat_test, 0, sizeof(mat_test)); |
| memset(vec_ref, 0, sizeof(vec_ref)); |
| memset(vec_test, 0, sizeof(vec_test)); |
| ref_func(pdiff, pstride, gx, gy, gstride, bw, bh, mat_ref, vec_ref); |
| test_func(pdiff, pstride, gx, gy, gstride, bw, bh, mat_test, vec_test); |
| |
| int failed = 0; |
| for (int i = 0; i < 16; ++i) { |
| if (mat_ref[i] != mat_test[i]) { |
| failed = 1; |
| printf("Mat [%4d] ref %6" PRId64 " test %6" PRId64 " \n", i, mat_ref[i], |
| mat_test[i]); |
| break; |
| } |
| } |
| |
| for (int i = 0; i < 4; ++i) { |
| if (vec_ref[i] != vec_test[i]) { |
| failed = 1; |
| printf("Vec [%4d] ref %6" PRId64 " test %6" PRId64 " \n", i, vec_ref[i], |
| vec_test[i]); |
| break; |
| } |
| } |
| ASSERT_EQ(failed, 0); |
| } |
| |
| void CalcAffineAutoCorrelationMatrixSpeed( |
| calc_affine_autocorrelation_matrix ref_func, |
| calc_affine_autocorrelation_matrix test_func, const int16_t *pdiff, |
| int pstride, const int16_t *gx, const int16_t *gy, int gstride, int bw, |
| int bh) { |
| DECLARE_ALIGNED(32, int64_t, mat_ref[16]); |
| DECLARE_ALIGNED(32, int64_t, mat_test[16]); |
| DECLARE_ALIGNED(32, int64_t, vec_ref[4]); |
| DECLARE_ALIGNED(32, int64_t, vec_test[4]); |
| memset(mat_ref, 0, sizeof(mat_ref)); |
| memset(mat_test, 0, sizeof(mat_test)); |
| memset(vec_ref, 0, sizeof(vec_ref)); |
| memset(vec_test, 0, sizeof(vec_test)); |
| const int knumIter = 1000000; |
| aom_usec_timer timer_ref; |
| aom_usec_timer timer_test; |
| |
| aom_usec_timer_start(&timer_ref); |
| for (int count = 0; count < knumIter; count++) { |
| ref_func(pdiff, pstride, gx, gy, gstride, bw, bh, mat_ref, vec_ref); |
| } |
| aom_usec_timer_mark(&timer_ref); |
| |
| aom_usec_timer_start(&timer_test); |
| for (int count = 0; count < knumIter; count++) { |
| test_func(pdiff, pstride, gx, gy, gstride, bw, bh, mat_test, vec_test); |
| } |
| aom_usec_timer_mark(&timer_test); |
| |
| const int total_time_ref = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); |
| const int total_time_test = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_test)); |
| |
| printf( |
| "Block size: %dx%d, ref_time = %d \t simd_time = %d \t Scaling = %4.2f " |
| "\n", |
| bw, bh, total_time_ref, total_time_test, |
| (static_cast<float>(total_time_ref) / |
| static_cast<float>(total_time_test))); |
| } |
| |
| int16_t *gx_; |
| int16_t *gy_; |
| int16_t *pdiff_; |
| }; |
| |
| TEST_P(AV1CalcAffineAutocorrelationMatrixTest, CheckOutput) { RunTest(0); } |
| TEST_P(AV1CalcAffineAutocorrelationMatrixTest, DISABLED_Speed) { RunTest(1); } |
| |
| INSTANTIATE_TEST_SUITE_P( |
| C, AV1CalcAffineAutocorrelationMatrixTest, |
| BuildOptFlowHighbdParams(av1_calc_affine_autocorrelation_matrix_c)); |
| |
| #if HAVE_AVX2 |
| INSTANTIATE_TEST_SUITE_P( |
| AVX2, AV1CalcAffineAutocorrelationMatrixTest, |
| BuildOptFlowHighbdParams(av1_calc_affine_autocorrelation_matrix_avx2)); |
| #endif // HAVE_AVX2 |
| |
| #if AFFINE_AVERAGING_BITS > 0 |
// Pointer to a function with the signature shared by
// av1_avg_pooling_pdiff_gradients_c and its SIMD variants: mutable pdiff,
// gx and gy buffers with their strides, block width/height, and a size n.
using av1_avg_pooling_pdiff_gradients_fun = void (*)(
    int16_t *pdiff, const int pstride, int16_t *gx, int16_t *gy,
    const int gstride, const int bw, const int bh, const int n);
| |
| class AV1AvgPoolingPdiffGradientTest |
| : public AV1OptFlowTest<av1_avg_pooling_pdiff_gradients_fun> { |
| public: |
| AV1AvgPoolingPdiffGradientTest() { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| |
| gx_avg1_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| gy_avg1_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| pdiff_avg1_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| gx_avg2_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| gy_avg2_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| pdiff_avg2_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t)); |
| } |
| |
| ~AV1AvgPoolingPdiffGradientTest() { |
| aom_free(gx_avg1_); |
| aom_free(gy_avg1_); |
| aom_free(pdiff_avg1_); |
| aom_free(gx_avg2_); |
| aom_free(gy_avg2_); |
| aom_free(pdiff_avg2_); |
| } |
| |
| void RunTest(const int is_speed) { |
| const BlockSize &block = GetParam().Block(); |
| const int bd = GetParam().BitDepth(); |
| const int bw_log2 = block.Width() >> MI_SIZE_LOG2; |
| const int bh_log2 = block.Height() >> MI_SIZE_LOG2; |
| const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2); |
| |
| // AVX2 version only supports avg pooling from larger size to 16x16 |
| if (block.Width() <= 8 || block.Height() <= 8) return; |
| |
| for (int count = 0; count < numIter;) { |
| RandomInput16(gx_avg1_, GetParam(), 16); |
| RandomInput16(gy_avg1_, GetParam(), 16); |
| RandomInput16(pdiff_avg1_, GetParam(), bd + 1); |
| memcpy(gx_avg2_, gx_avg1_, |
| sizeof(int16_t) * block.Width() * block.Height()); |
| memcpy(gy_avg2_, gy_avg1_, |
| sizeof(int16_t) * block.Width() * block.Height()); |
| memcpy(pdiff_avg2_, pdiff_avg1_, |
| sizeof(int16_t) * block.Width() * block.Height()); |
| |
| TestAvgPoolingPdiffGrad(pdiff_avg1_, gx_avg1_, gy_avg1_, pdiff_avg2_, |
| gx_avg2_, gy_avg2_, is_speed); |
| count++; |
| } |
| if (is_speed) return; |
| |
| // Extreme value test |
| for (int count = 0; count < numIter; count++) { |
| RandomInput16Extreme(gx_avg1_, GetParam(), 16); |
| RandomInput16Extreme(gy_avg1_, GetParam(), 16); |
| RandomInput16Extreme(pdiff_avg1_, GetParam(), bd + 1); |
| memcpy(gx_avg2_, gx_avg1_, |
| sizeof(int16_t) * block.Width() * block.Height()); |
| memcpy(gy_avg2_, gy_avg1_, |
| sizeof(int16_t) * block.Width() * block.Height()); |
| memcpy(pdiff_avg2_, pdiff_avg1_, |
| sizeof(int16_t) * block.Width() * block.Height()); |
| |
| TestAvgPoolingPdiffGrad(pdiff_avg1_, gx_avg1_, gy_avg1_, pdiff_avg2_, |
| gx_avg2_, gy_avg2_, is_speed); |
| } |
| } |
| |
| private: |
| void TestAvgPoolingPdiffGrad(int16_t *pdiff_avg1, int16_t *gx_avg1, |
| int16_t *gy_avg1, int16_t *pdiff_avg2, |
| int16_t *gx_avg2, int16_t *gy_avg2, |
| const int is_speed) { |
| const BlockSize &block = GetParam().Block(); |
| const int bw = block.Width(); |
| const int bh = block.Height(); |
| int pstride = bw; |
| int gstride = bw; |
| |
| av1_avg_pooling_pdiff_gradients_fun ref_func = |
| av1_avg_pooling_pdiff_gradients_c; |
| av1_avg_pooling_pdiff_gradients_fun test_func = GetParam().TestFunction(); |
| |
| if (is_speed) |
| AvgPoolingPdiffGradSpeed(ref_func, test_func, pstride, gstride, bw, bh, |
| pdiff_avg1, gx_avg1, gy_avg1, pdiff_avg2, |
| gx_avg2, gy_avg2); |
| else |
| AvgPoolingPdiffGrad(ref_func, test_func, pstride, gstride, bw, bh, |
| pdiff_avg1, gx_avg1, gy_avg1, pdiff_avg2, gx_avg2, |
| gy_avg2); |
| } |
| |
| void AvgPoolingPdiffGrad(av1_avg_pooling_pdiff_gradients_fun ref_func, |
| av1_avg_pooling_pdiff_gradients_fun test_func, |
| int pstride, int gstride, int bw, int bh, |
| int16_t *pdiff_avg1, int16_t *gx_avg1, |
| int16_t *gy_avg1, int16_t *pdiff_avg2, |
| int16_t *gx_avg2, int16_t *gy_avg2) { |
| int n = AOMMIN(AOMMIN(bw, bh), 16); |
| ref_func(pdiff_avg1, pstride, gx_avg1, gy_avg1, gstride, bw, bh, n); |
| test_func(pdiff_avg2, pstride, gx_avg2, gy_avg2, gstride, bw, bh, n); |
| AssertOutputBufferEq(pdiff_avg1, pdiff_avg2, n, n, bw); |
| AssertOutputBufferEq(gx_avg1, gx_avg2, n, n, bw); |
| AssertOutputBufferEq(gy_avg1, gy_avg2, n, n, bw); |
| } |
| |
| void AvgPoolingPdiffGradSpeed(av1_avg_pooling_pdiff_gradients_fun ref_func, |
| av1_avg_pooling_pdiff_gradients_fun test_func, |
| int pstride, int gstride, int bw, int bh, |
| int16_t *pdiff_avg1, int16_t *gx_avg1, |
| int16_t *gy_avg1, int16_t *pdiff_avg2, |
| int16_t *gx_avg2, int16_t *gy_avg2) { |
| int n = AOMMIN(AOMMIN(bw, bh), 16); |
| |
| const int numIter = 1000000; |
| aom_usec_timer timer_ref; |
| aom_usec_timer timer_test; |
| |
| aom_usec_timer_start(&timer_ref); |
| for (int count = 0; count < numIter; count++) { |
| ref_func(pdiff_avg1, pstride, gx_avg1, gy_avg1, gstride, bw, bh, n); |
| } |
| aom_usec_timer_mark(&timer_ref); |
| |
| aom_usec_timer_start(&timer_test); |
| for (int count = 0; count < numIter; count++) { |
| test_func(pdiff_avg2, pstride, gx_avg2, gy_avg2, gstride, bw, bh, n); |
| } |
| aom_usec_timer_mark(&timer_test); |
| |
| const int total_time_ref = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_ref)); |
| const int total_time_test = |
| static_cast<int>(aom_usec_timer_elapsed(&timer_test)); |
| |
| printf( |
| "Block size: %dx%d, C time = %d \t SIMD time = %d \t Scaling = %4.2f " |
| "\n", |
| bw, bh, total_time_ref, total_time_test, |
| (static_cast<float>(total_time_ref) / |
| static_cast<float>(total_time_test))); |
| } |
| |
| int16_t *gx_avg1_; |
| int16_t *gy_avg1_; |
| int16_t *pdiff_avg1_; |
| int16_t *gx_avg2_; |
| int16_t *gy_avg2_; |
| int16_t *pdiff_avg2_; |
| }; |
| |
| TEST_P(AV1AvgPoolingPdiffGradientTest, CheckOutput) { RunTest(0); } |
| TEST_P(AV1AvgPoolingPdiffGradientTest, DISABLED_Speed) { RunTest(1); } |
| |
| INSTANTIATE_TEST_SUITE_P( |
| C, AV1AvgPoolingPdiffGradientTest, |
| BuildOptFlowHighbdParams(av1_avg_pooling_pdiff_gradients_c)); |
| |
| #if HAVE_AVX2 |
| INSTANTIATE_TEST_SUITE_P( |
| AVX2, AV1AvgPoolingPdiffGradientTest, |
| BuildOptFlowHighbdParams(av1_avg_pooling_pdiff_gradients_avx2)); |
| #endif // HAVE_AVX2 |
| #endif // AFFINE_AVERAGING_BITS > 0 |
| #endif // OPFL_COMBINE_INTERP_GRAD_LS |
| #endif // CONFIG_AFFINE_REFINEMENT |
| } // namespace |
| |
| #endif // CONFIG_OPTFLOW_REFINEMENT |