// test/intra_matrix_test.cc - avm - Git at Google

 /*
  * Copyright (c) 2025, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */

 #include <math.h>
 #include <stdlib.h>
 #include <string.h>

 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/register_state_check.h"
 #include "test/function_equivalence_test.h"

 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"

 #include "aom/aom_integer.h"
 #include "aom_ports/aom_timer.h"
 #include "av1/common/enums.h"
 #include "av1/common/intra_dip.h"
 #include "av1/common/intra_matrix.h"

 using libaom_test::FunctionEquivalenceTest;

 namespace {

 // Shared fixture for the DIP matrix-multiplication equivalence tests.
 // F is the signature of the functions being compared and T is the sample
 // type. Subclasses implement Execute() so that it fills dip_ref_ and the
 // buffer it is handed; Common() then compares the two buffers element-wise.
 template <typename F, typename T>
 class IntraMatrixTest : public FunctionEquivalenceTest<F> {
  protected:
   static const int kIterations = 1000000;
   static const int kBufSize = 8 * 8;

   virtual ~IntraMatrixTest() {}

   // Runs the implementations under comparison; the test implementation must
   // write its result to dip_tst.
   virtual void Execute(T *dip_tst) = 0;

   // Points the output pointers at their backing storage, runs Execute() and
   // asserts that all kBufSize outputs agree.
   void Common() {
     dip_ref_ = dip_ref_data_;
     dip_tst_ = dip_tst_data_;

     Execute(dip_tst_);

     int idx = 0;
     while (idx < kBufSize) {
       ASSERT_EQ(dip_ref_[idx], dip_tst_[idx]);
       ++idx;
     }
   }

   // Weight matrix (DIP_ROWS rows of DIP_COLS entries) and feature vector.
   T dip_arr_[DIP_ROWS * DIP_COLS];
   T dip_feat_[DIP_COLS];

   // Backing storage for the reference and test outputs.
   T dip_ref_data_[kBufSize];
   T dip_tst_data_[kBufSize];

   T *dip_ref_;
   T *dip_tst_;
 };

 //////////////////////////////////////////////////////////////////////////////
 // High bit-depth version
 //////////////////////////////////////////////////////////////////////////////

 // Signature of the DIP matrix-multiplication kernels under test: A is the
 // weight matrix, B the feature vector, C the output and bd the bit depth.
 using IMHB = void (*)(const uint16_t *A, const uint16_t *B, uint16_t *C,
                       int bd);
 // (reference, test) function pair consumed by the IntraMatrixTestHB fixture.
 using IntraMatrixTestFuncsHBD = libaom_test::FuncParam<IMHB>;

 // High bit-depth fixture: hands the same weights and features to the
 // reference and test kernels at the currently selected bit depth.
 class IntraMatrixTestHB : public IntraMatrixTest<IMHB, uint16_t> {
  protected:
   // Bit depth forwarded to both kernels; each test assigns it before use.
   int bit_depth_;

   // The reference result goes to dip_ref_, the test result to dip_tst. The
   // test call is wrapped in the register-state check.
   void Execute(uint16_t *dip_tst) override {
     const uint16_t *const weights = dip_arr_;
     const uint16_t *const features = dip_feat_;
     params_.ref_func(weights, features, dip_ref_, bit_depth_);
     ASM_REGISTER_STATE_CHECK(
         params_.tst_func(weights, features, dip_tst, bit_depth_));
   }
 };

 // Checks that the test kernel matches the reference kernel on random feature
 // vectors, cycling through every DIP mode with a random bit depth of 8, 10
 // or 12 on each iteration.
 TEST_P(IntraMatrixTestHB, RandomValues) {
   // Only the first DIP_FEATURES columns of each row are loaded from the
   // weight table below. Clear the buffer once so the remaining padding
   // columns hold a defined value instead of uninitialized fixture memory.
   // NOTE(review): zero is assumed to be the intended padding value - confirm
   // against the kernels' contract.
   memset(dip_arr_, 0, sizeof(dip_arr_));

   for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
     switch (rng_(3)) {
       case 0: bit_depth_ = 8; break;
       case 1: bit_depth_ = 10; break;
       default: bit_depth_ = 12; break;
     }
     const int hi = 1 << bit_depth_;

     // dip_feat_ is declared with DIP_COLS entries, so bound the loop with the
     // same constant rather than a literal.
     for (int i = 0; i < DIP_COLS; ++i) {
       dip_feat_[i] = rng_(hi);
     }

     const int mode = iter % INTRA_DIP_MODE_CNT;
     for (int r = 0; r < DIP_ROWS; ++r) {
       for (int c = 0; c < DIP_FEATURES; ++c) {
         dip_arr_[r * DIP_COLS + c] = av1_intra_matrix_weights[mode][r][c];
       }
     }

     Common();
   }
 }

 // Pairs the C kernel (reference) with the AVX2 kernel (test) for every
 // IntraMatrixTestHB test. Only compiled when HAVE_AVX2 is set.
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(AVX2, IntraMatrixTestHB,
                          ::testing::Values(IntraMatrixTestFuncsHBD(
                              av1_dip_matrix_multiplication_c,
                              av1_dip_matrix_multiplication_avx2)));
 #endif  // HAVE_AVX2

 // Speed tests

 // Speed test (disabled by default): runs the test kernel test_count times on
 // one fixed 12-bit input built from the weights of DIP mode 0. The reference
 // kernel is not called and no outputs are compared.
 TEST_P(IntraMatrixTestHB, DISABLED_Speed) {
   const int test_count = 10000000;
   bit_depth_ = 12;
   const int hi = 1 << bit_depth_;
   // Loop bounds use the DIP_* constants that size the buffers, matching the
   // RandomValues test, instead of repeating their values as literals.
   for (int i = 0; i < DIP_COLS; ++i) {
     dip_feat_[i] = rng_(hi);
   }
   // Give the padding columns (DIP_FEATURES..DIP_COLS - 1) a defined value;
   // they are not covered by the copy below.
   // NOTE(review): zero is assumed to be the intended padding value - confirm.
   memset(dip_arr_, 0, sizeof(dip_arr_));
   for (int r = 0; r < DIP_ROWS; ++r) {
     for (int c = 0; c < DIP_FEATURES; ++c) {
       dip_arr_[r * DIP_COLS + c] = av1_intra_matrix_weights[0][r][c];
     }
   }
   dip_tst_ = &dip_tst_data_[0];
   for (int iter = 0; iter < test_count; ++iter) {
     ASM_REGISTER_STATE_CHECK(
         params_.tst_func(dip_arr_, dip_feat_, dip_tst_, bit_depth_));
   }
 }

 }  // namespace

 //////////////////////////////////////////////////////////////////////////////
 // ResampleOutputTest
 //////////////////////////////////////////////////////////////////////////////

 // Signature of the resample_output implementations under test.
 using ResampleOutputFunc = void (*)(uint16_t *dst, int dst_stride,
                                     const uint16_t *above_row,
                                     const uint16_t *left_col,
                                     uint16_t *ml_output, int bw_log2,
                                     int bh_log2, int transpose);

 // (reference, test) function pair consumed by the ResampleOutputTest fixture.
 using ResampleOutputTestFuncs = libaom_test::FuncParam<ResampleOutputFunc>;

 // Fixture comparing a reference and a test implementation of
 // ResampleOutputFunc on random 12-bit inputs, for block widths and heights of
 // 8, 16, 32 and 64 (skipping w * h < 128) and both transpose settings.
 class ResampleOutputTest : public FunctionEquivalenceTest<ResampleOutputFunc> {
  protected:
   static const int kMaxWidth = 64;
   static const int kMaxHeight = 64;
   static const int kBufSize = kMaxWidth * kMaxHeight;
   static const int kMlOutputSize = 8 * 8;
   // Layout of context_data_ as set up by the constructor:
   //   [0]                   above_row_[-1]  (top-left corner)
   //   [1 .. kMaxWidth]      above_row_[0 .. kMaxWidth - 1]
   //   [kMaxWidth + 1]       left_col_[-1]   (copy of the top-left corner)
   //   [kMaxWidth + 2 .. ]   left_col_[0 .. kMaxHeight - 1]
   // Both corner slots need storage, hence "+ 2". With "+ 1" the last entry
   // left_col_[kMaxHeight - 1] would lie one element past the array.
   static const int kContextSize = kMaxWidth + kMaxHeight + 2;

   ResampleOutputTest()
       : dst_ref_(dst_ref_data_), dst_tst_(dst_tst_data_),
         above_row_(&context_data_[1]),
         left_col_(&context_data_[kMaxWidth + 2]),
         ml_output_(ml_output_data_) {}

   virtual ~ResampleOutputTest() {}

   // Returns log2(val) for the supported block dimensions. Any other value
   // records a non-fatal test failure and yields 0.
   int get_log2(int val) {
     switch (val) {
       case 4: return 2;
       case 8: return 3;
       case 16: return 4;
       case 32: return 5;
       case 64: return 6;
       default: ADD_FAILURE() << "Invalid block size"; return 0;
     }
   }

   // Fills the neighbour context and ml_output with random 12-bit samples.
   // Shared by the correctness and speed tests so both use identical setup.
   void FillInputs() {
     const int bit_depth = 12;
     // Exclusive upper bound, matching how the IntraMatrixTestHB tests call
     // rng_, so the maximum sample value (1 << bit_depth) - 1 is reachable.
     // NOTE(review): assumes rng_(n) returns values in [0, n) - confirm.
     const int hi = 1 << bit_depth;

     for (int i = 0; i < kContextSize; ++i) {
       context_data_[i] = rng_(hi);
     }
     for (int i = 0; i < kMlOutputSize; ++i) {
       // The range of ml_output is clipped to the corresponding bitdepth,
       // i.e. v = clip_pixel_highbd(v, bit_depth);
       // See av1_dip_matrix_multiplication.
       ml_output_data_[i] = rng_(hi);
     }
     // The top-left corner is shared between above_row[-1] and left_col[-1].
     above_row_[-1] = context_data_[0];
     left_col_[-1] = context_data_[0];
   }

   // Runs the reference and test functions on fresh random inputs for every
   // block size / transpose combination and asserts that the bw x bh output
   // regions are identical.
   void RunCorrectnessTest() {
     const int block_sizes[] = { 8, 16, 32, 64 };
     const int dst_stride = kMaxWidth;
     for (int bw : block_sizes) {
       for (int bh : block_sizes) {
         // Data-driven intra prediction only applies to blocks with
         // w*h >= 128.
         if (bw * bh < 128) continue;
         const int bw_log2 = get_log2(bw);
         const int bh_log2 = get_log2(bh);
         for (int transpose = 0; transpose < 2; ++transpose) {
           FillInputs();

           params_.ref_func(dst_ref_, dst_stride, above_row_, left_col_,
                            ml_output_, bw_log2, bh_log2, transpose);
           ASM_REGISTER_STATE_CHECK(
               params_.tst_func(dst_tst_, dst_stride, above_row_, left_col_,
                                ml_output_, bw_log2, bh_log2, transpose));

           for (int r = 0; r < bh; ++r) {
             for (int c = 0; c < bw; ++c) {
               ASSERT_EQ(dst_ref_[r * dst_stride + c],
                         dst_tst_[r * dst_stride + c])
                   << "Mismatch at (" << c << ", " << r << ") for block size "
                   << bw << "x" << bh << " (transpose=" << transpose << ")";
             }
           }
         }
       }
     }
   }

   // Times kIterations calls of the reference and test functions for every
   // block size / transpose combination and prints the elapsed times and
   // their ratio. Outputs are not compared.
   void RunSpeedTest() {
     const int block_sizes[] = { 8, 16, 32, 64 };
     const int dst_stride = kMaxWidth;
     const int kIterations = 100000;
     for (int bw : block_sizes) {
       for (int bh : block_sizes) {
         // Data-driven intra prediction only applies to blocks with
         // w*h >= 128.
         if (bw * bh < 128) continue;
         const int bw_log2 = get_log2(bw);
         const int bh_log2 = get_log2(bh);
         for (int transpose = 0; transpose < 2; ++transpose) {
           FillInputs();

           aom_usec_timer ref_timer, tst_timer;

           aom_usec_timer_start(&ref_timer);
           for (int i = 0; i < kIterations; ++i) {
             params_.ref_func(dst_ref_, dst_stride, above_row_, left_col_,
                              ml_output_, bw_log2, bh_log2, transpose);
           }
           aom_usec_timer_mark(&ref_timer);
           const double ref_time =
               static_cast<double>(aom_usec_timer_elapsed(&ref_timer));

           aom_usec_timer_start(&tst_timer);
           for (int i = 0; i < kIterations; ++i) {
             params_.tst_func(dst_tst_, dst_stride, above_row_, left_col_,
                              ml_output_, bw_log2, bh_log2, transpose);
           }
           aom_usec_timer_mark(&tst_timer);
           const double tst_time =
               static_cast<double>(aom_usec_timer_elapsed(&tst_timer));

           printf(
               "Block %2dx%2d (T=%d): C time = %7.2f us, SIMD time = %7.2f us, "
               "Speedup = %4.2fx\n",
               bw, bh, transpose, ref_time, tst_time, ref_time / tst_time);
         }
       }
     }
   }

   // Backing storage for the outputs and inputs.
   uint16_t dst_ref_data_[kBufSize];
   uint16_t dst_tst_data_[kBufSize];
   uint16_t context_data_[kContextSize];
   uint16_t ml_output_data_[kMlOutputSize];

   // Pointers handed to the functions under test; set once in the constructor.
   uint16_t *dst_ref_;
   uint16_t *dst_tst_;
   uint16_t *above_row_;
   uint16_t *left_col_;
   uint16_t *ml_output_;
 };

 // Output-equality check across all block sizes and transpose settings.
 TEST_P(ResampleOutputTest, Correctness) {
   RunCorrectnessTest();
 }

 // Timing comparison of the two implementations; carries the DISABLED_ prefix.
 TEST_P(ResampleOutputTest, DISABLED_Speed) {
   RunSpeedTest();
 }

 // Pairs resample_output_c (reference) with resample_output_avx2 (test) for
 // every ResampleOutputTest test. Only compiled when HAVE_AVX2 is set.
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(AVX2, ResampleOutputTest,
                          ::testing::Values(ResampleOutputTestFuncs(
                              resample_output_c, resample_output_avx2)));
 #endif  // HAVE_AVX2
	/*
	* Copyright (c) 2025, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 3-Clause Clear License
	* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
	* License was not distributed with this source code in the LICENSE file, you
	* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
	* Alliance for Open Media Patent License 1.0 was not distributed with this
	* source code in the PATENTS file, you can obtain it at
	* aomedia.org/license/patent-license/.
	*/

	#include <math.h>
	#include <stdlib.h>
	#include <string.h>

	#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
	#include "test/register_state_check.h"
	#include "test/function_equivalence_test.h"

	#include "config/aom_config.h"
	#include "config/aom_dsp_rtcd.h"
	#include "config/av1_rtcd.h"

	#include "aom/aom_integer.h"
	#include "aom_ports/aom_timer.h"
	#include "av1/common/enums.h"
	#include "av1/common/intra_dip.h"
	#include "av1/common/intra_matrix.h"

	using libaom_test::FunctionEquivalenceTest;

	namespace {

	// Shared fixture for the DIP matrix-multiplication equivalence tests.
	// F is the signature of the functions being compared and T is the sample
	// type. Subclasses implement Execute() so that it fills dip_ref_ and the
	// buffer it is handed; Common() then compares the two buffers element-wise.
	template <typename F, typename T>
	class IntraMatrixTest : public FunctionEquivalenceTest<F> {
	 protected:
	  static const int kIterations = 1000000;
	  static const int kBufSize = 8 * 8;

	  virtual ~IntraMatrixTest() {}

	  // Runs the implementations under comparison; the test implementation must
	  // write its result to dip_tst.
	  virtual void Execute(T *dip_tst) = 0;

	  // Points the output pointers at their backing storage, runs Execute() and
	  // asserts that all kBufSize outputs agree.
	  void Common() {
	    dip_ref_ = dip_ref_data_;
	    dip_tst_ = dip_tst_data_;

	    Execute(dip_tst_);

	    int idx = 0;
	    while (idx < kBufSize) {
	      ASSERT_EQ(dip_ref_[idx], dip_tst_[idx]);
	      ++idx;
	    }
	  }

	  // Weight matrix (DIP_ROWS rows of DIP_COLS entries) and feature vector.
	  T dip_arr_[DIP_ROWS * DIP_COLS];
	  T dip_feat_[DIP_COLS];

	  // Backing storage for the reference and test outputs.
	  T dip_ref_data_[kBufSize];
	  T dip_tst_data_[kBufSize];

	  T *dip_ref_;
	  T *dip_tst_;
	};

	//////////////////////////////////////////////////////////////////////////////
	// High bit-depth version
	//////////////////////////////////////////////////////////////////////////////

	// Signature of the DIP matrix-multiplication kernels under test: A is the
	// weight matrix, B the feature vector, C the output and bd the bit depth.
	// IMHB must be a function *pointer* taking buffer pointers: the fixture
	// passes arrays for A, B and C and stores the functions as values.
	typedef void (*IMHB)(const uint16_t *A, const uint16_t *B, uint16_t *C,
	                     int bd);
	// (reference, test) function pair consumed by the IntraMatrixTestHB fixture.
	using IntraMatrixTestFuncsHBD = libaom_test::FuncParam<IMHB>;

	// High bit-depth fixture: hands the same weights and features to the
	// reference and test kernels at the currently selected bit depth.
	class IntraMatrixTestHB : public IntraMatrixTest<IMHB, uint16_t> {
	 protected:
	  // Bit depth forwarded to both kernels; each test assigns it before use.
	  int bit_depth_;

	  // The reference result goes to dip_ref_, the test result to dip_tst. The
	  // test call is wrapped in the register-state check.
	  void Execute(uint16_t *dip_tst) override {
	    const uint16_t *const weights = dip_arr_;
	    const uint16_t *const features = dip_feat_;
	    params_.ref_func(weights, features, dip_ref_, bit_depth_);
	    ASM_REGISTER_STATE_CHECK(
	        params_.tst_func(weights, features, dip_tst, bit_depth_));
	  }
	};

	// Checks that the test kernel matches the reference kernel on random feature
	// vectors, cycling through every DIP mode with a random bit depth of 8, 10
	// or 12 on each iteration.
	TEST_P(IntraMatrixTestHB, RandomValues) {
	  // Only the first DIP_FEATURES columns of each row are loaded from the
	  // weight table below. Clear the buffer once so the remaining padding
	  // columns hold a defined value instead of uninitialized fixture memory.
	  // NOTE(review): zero is assumed to be the intended padding value - confirm
	  // against the kernels' contract.
	  memset(dip_arr_, 0, sizeof(dip_arr_));

	  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
	    switch (rng_(3)) {
	      case 0: bit_depth_ = 8; break;
	      case 1: bit_depth_ = 10; break;
	      default: bit_depth_ = 12; break;
	    }
	    const int hi = 1 << bit_depth_;

	    // dip_feat_ is declared with DIP_COLS entries, so bound the loop with
	    // the same constant rather than a literal.
	    for (int i = 0; i < DIP_COLS; ++i) {
	      dip_feat_[i] = rng_(hi);
	    }

	    const int mode = iter % INTRA_DIP_MODE_CNT;
	    for (int r = 0; r < DIP_ROWS; ++r) {
	      for (int c = 0; c < DIP_FEATURES; ++c) {
	        dip_arr_[r * DIP_COLS + c] = av1_intra_matrix_weights[mode][r][c];
	      }
	    }

	    Common();
	  }
	}

	// Pairs the C kernel (reference) with the AVX2 kernel (test) for every
	// IntraMatrixTestHB test. Only compiled when HAVE_AVX2 is set.
	#if HAVE_AVX2
	INSTANTIATE_TEST_SUITE_P(AVX2, IntraMatrixTestHB,
	::testing::Values(IntraMatrixTestFuncsHBD(
	av1_dip_matrix_multiplication_c,
	av1_dip_matrix_multiplication_avx2)));
	#endif // HAVE_AVX2

	// Speed tests

	// Speed test (disabled by default): runs the test kernel test_count times on
	// one fixed 12-bit input built from the weights of DIP mode 0. The reference
	// kernel is not called and no outputs are compared.
	TEST_P(IntraMatrixTestHB, DISABLED_Speed) {
	  const int test_count = 10000000;
	  bit_depth_ = 12;
	  const int hi = 1 << bit_depth_;
	  // Loop bounds use the DIP_* constants that size the buffers, matching the
	  // RandomValues test, instead of repeating their values as literals.
	  for (int i = 0; i < DIP_COLS; ++i) {
	    dip_feat_[i] = rng_(hi);
	  }
	  // Give the padding columns (DIP_FEATURES..DIP_COLS - 1) a defined value;
	  // they are not covered by the copy below.
	  // NOTE(review): zero is assumed to be the intended padding value - confirm.
	  memset(dip_arr_, 0, sizeof(dip_arr_));
	  for (int r = 0; r < DIP_ROWS; ++r) {
	    for (int c = 0; c < DIP_FEATURES; ++c) {
	      dip_arr_[r * DIP_COLS + c] = av1_intra_matrix_weights[0][r][c];
	    }
	  }
	  dip_tst_ = &dip_tst_data_[0];
	  for (int iter = 0; iter < test_count; ++iter) {
	    ASM_REGISTER_STATE_CHECK(
	        params_.tst_func(dip_arr_, dip_feat_, dip_tst_, bit_depth_));
	  }
	}

	} // namespace

	//////////////////////////////////////////////////////////////////////////////
	// ResampleOutputTest
	//////////////////////////////////////////////////////////////////////////////

	// Signature of the resample_output implementations under test.
	// This must be a function *pointer* whose first parameter is the destination
	// buffer: the fixture passes a uint16_t array for dst and stores the
	// functions as values.
	typedef void (*ResampleOutputFunc)(uint16_t *dst, int dst_stride,
	                                   const uint16_t *above_row,
	                                   const uint16_t *left_col,
	                                   uint16_t *ml_output, int bw_log2,
	                                   int bh_log2, int transpose);

	// (reference, test) function pair consumed by the ResampleOutputTest fixture.
	using ResampleOutputTestFuncs = libaom_test::FuncParam<ResampleOutputFunc>;

	// Fixture comparing a reference and a test implementation of
	// ResampleOutputFunc on random 12-bit inputs, for block widths and heights of
	// 8, 16, 32 and 64 (skipping w * h < 128) and both transpose settings.
	class ResampleOutputTest : public FunctionEquivalenceTest<ResampleOutputFunc> {
	 protected:
	  static const int kMaxWidth = 64;
	  static const int kMaxHeight = 64;
	  static const int kBufSize = kMaxWidth * kMaxHeight;
	  static const int kMlOutputSize = 8 * 8;
	  // Layout of context_data_ as set up by the constructor:
	  //   [0]                   above_row_[-1]  (top-left corner)
	  //   [1 .. kMaxWidth]      above_row_[0 .. kMaxWidth - 1]
	  //   [kMaxWidth + 1]       left_col_[-1]   (copy of the top-left corner)
	  //   [kMaxWidth + 2 .. ]   left_col_[0 .. kMaxHeight - 1]
	  // Both corner slots need storage, hence "+ 2". With "+ 1" the last entry
	  // left_col_[kMaxHeight - 1] would lie one element past the array.
	  static const int kContextSize = kMaxWidth + kMaxHeight + 2;

	  ResampleOutputTest()
	      : dst_ref_(dst_ref_data_), dst_tst_(dst_tst_data_),
	        above_row_(&context_data_[1]),
	        left_col_(&context_data_[kMaxWidth + 2]),
	        ml_output_(ml_output_data_) {}

	  virtual ~ResampleOutputTest() {}

	  // Returns log2(val) for the supported block dimensions. Any other value
	  // records a non-fatal test failure and yields 0.
	  int get_log2(int val) {
	    switch (val) {
	      case 4: return 2;
	      case 8: return 3;
	      case 16: return 4;
	      case 32: return 5;
	      case 64: return 6;
	      default: ADD_FAILURE() << "Invalid block size"; return 0;
	    }
	  }

	  // Fills the neighbour context and ml_output with random 12-bit samples.
	  // Shared by the correctness and speed tests so both use identical setup.
	  void FillInputs() {
	    const int bit_depth = 12;
	    // Exclusive upper bound, matching how the IntraMatrixTestHB tests call
	    // rng_, so the maximum sample value (1 << bit_depth) - 1 is reachable.
	    // NOTE(review): assumes rng_(n) returns values in [0, n) - confirm.
	    const int hi = 1 << bit_depth;

	    for (int i = 0; i < kContextSize; ++i) {
	      context_data_[i] = rng_(hi);
	    }
	    for (int i = 0; i < kMlOutputSize; ++i) {
	      // The range of ml_output is clipped to the corresponding bitdepth,
	      // i.e. v = clip_pixel_highbd(v, bit_depth);
	      // See av1_dip_matrix_multiplication.
	      ml_output_data_[i] = rng_(hi);
	    }
	    // The top-left corner is shared between above_row[-1] and left_col[-1].
	    above_row_[-1] = context_data_[0];
	    left_col_[-1] = context_data_[0];
	  }

	  // Runs the reference and test functions on fresh random inputs for every
	  // block size / transpose combination and asserts that the bw x bh output
	  // regions are identical.
	  void RunCorrectnessTest() {
	    const int block_sizes[] = { 8, 16, 32, 64 };
	    const int dst_stride = kMaxWidth;
	    for (int bw : block_sizes) {
	      for (int bh : block_sizes) {
	        // Data-driven intra prediction only applies to blocks with
	        // w*h >= 128.
	        if (bw * bh < 128) continue;
	        const int bw_log2 = get_log2(bw);
	        const int bh_log2 = get_log2(bh);
	        for (int transpose = 0; transpose < 2; ++transpose) {
	          FillInputs();

	          params_.ref_func(dst_ref_, dst_stride, above_row_, left_col_,
	                           ml_output_, bw_log2, bh_log2, transpose);
	          ASM_REGISTER_STATE_CHECK(
	              params_.tst_func(dst_tst_, dst_stride, above_row_, left_col_,
	                               ml_output_, bw_log2, bh_log2, transpose));

	          for (int r = 0; r < bh; ++r) {
	            for (int c = 0; c < bw; ++c) {
	              ASSERT_EQ(dst_ref_[r * dst_stride + c],
	                        dst_tst_[r * dst_stride + c])
	                  << "Mismatch at (" << c << ", " << r << ") for block size "
	                  << bw << "x" << bh << " (transpose=" << transpose << ")";
	            }
	          }
	        }
	      }
	    }
	  }

	  // Times kIterations calls of the reference and test functions for every
	  // block size / transpose combination and prints the elapsed times and
	  // their ratio. Outputs are not compared.
	  void RunSpeedTest() {
	    const int block_sizes[] = { 8, 16, 32, 64 };
	    const int dst_stride = kMaxWidth;
	    const int kIterations = 100000;
	    for (int bw : block_sizes) {
	      for (int bh : block_sizes) {
	        // Data-driven intra prediction only applies to blocks with
	        // w*h >= 128.
	        if (bw * bh < 128) continue;
	        const int bw_log2 = get_log2(bw);
	        const int bh_log2 = get_log2(bh);
	        for (int transpose = 0; transpose < 2; ++transpose) {
	          FillInputs();

	          aom_usec_timer ref_timer, tst_timer;

	          aom_usec_timer_start(&ref_timer);
	          for (int i = 0; i < kIterations; ++i) {
	            params_.ref_func(dst_ref_, dst_stride, above_row_, left_col_,
	                             ml_output_, bw_log2, bh_log2, transpose);
	          }
	          aom_usec_timer_mark(&ref_timer);
	          const double ref_time =
	              static_cast<double>(aom_usec_timer_elapsed(&ref_timer));

	          aom_usec_timer_start(&tst_timer);
	          for (int i = 0; i < kIterations; ++i) {
	            params_.tst_func(dst_tst_, dst_stride, above_row_, left_col_,
	                             ml_output_, bw_log2, bh_log2, transpose);
	          }
	          aom_usec_timer_mark(&tst_timer);
	          const double tst_time =
	              static_cast<double>(aom_usec_timer_elapsed(&tst_timer));

	          printf(
	              "Block %2dx%2d (T=%d): C time = %7.2f us, SIMD time = %7.2f us, "
	              "Speedup = %4.2fx\n",
	              bw, bh, transpose, ref_time, tst_time, ref_time / tst_time);
	        }
	      }
	    }
	  }

	  // Backing storage for the outputs and inputs.
	  uint16_t dst_ref_data_[kBufSize];
	  uint16_t dst_tst_data_[kBufSize];
	  uint16_t context_data_[kContextSize];
	  uint16_t ml_output_data_[kMlOutputSize];

	  // Pointers handed to the functions under test; set once in the constructor.
	  uint16_t *dst_ref_;
	  uint16_t *dst_tst_;
	  uint16_t *above_row_;
	  uint16_t *left_col_;
	  uint16_t *ml_output_;
	};

	// Output-equality check across all block sizes and transpose settings.
	TEST_P(ResampleOutputTest, Correctness) {
	  RunCorrectnessTest();
	}

	// Timing comparison of the two implementations; carries the DISABLED_ prefix.
	TEST_P(ResampleOutputTest, DISABLED_Speed) {
	  RunSpeedTest();
	}

	// Pairs resample_output_c (reference) with resample_output_avx2 (test) for
	// every ResampleOutputTest test. Only compiled when HAVE_AVX2 is set.
	#if HAVE_AVX2
	INSTANTIATE_TEST_SUITE_P(AVX2, ResampleOutputTest,
	::testing::Values(ResampleOutputTestFuncs(
	resample_output_c, resample_output_avx2)));
	#endif // HAVE_AVX2