Add an SSE4.1 implementation of av1_highbd_convolve_2d_scale
For large blocks this is about 8x the speed of the C version. The code
needs SSE 4.1 for the PMULLD instruction that we use to do SIMD 32-bit
multiplies.
The patch uses av1_convolve_scale_test (written already to test the
low bit depth path) to make sure the optimised code matches the C
version.
Change-Id: I9304d6bb3d2cb31390de93ed08ff1a852e3ace86
diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc
index ac5281e..9d8be88 100644
--- a/test/av1_convolve_scale_test.cc
+++ b/test/av1_convolve_scale_test.cc
@@ -398,4 +398,72 @@
::testing::ValuesIn(kBlockDim),
::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
::testing::Bool()));
+
+#if CONFIG_HIGHBITDEPTH
+typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride,
+ int32_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd);
+
+// Test parameter list:
+// <tst_fun, dims, ntaps_x, ntaps_y, avg, bd>
+typedef tuple<HighbdConvolveFunc, BlockDimension, NTaps, NTaps, bool, int>
+ HighBDParams;
+
+class HighBDConvolveScaleTest
+ : public ConvolveScaleTestBase<uint16_t>,
+ public ::testing::WithParamInterface<HighBDParams> {
+ public:
+ virtual ~HighBDConvolveScaleTest() {}
+
+ void SetUp() {
+ tst_fun_ = GET_PARAM(0);
+
+ const BlockDimension &block = GET_PARAM(1);
+ const NTaps ntaps_x = GET_PARAM(2);
+ const NTaps ntaps_y = GET_PARAM(3);
+ const bool avg = GET_PARAM(4);
+ const int bd = GET_PARAM(5);
+
+ SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
+ }
+
+ void RunOne(bool ref) {
+ const uint16_t *src = image_->GetSrcData(ref, false);
+ CONV_BUF_TYPE *dst = image_->GetDstData(ref, false);
+ const int src_stride = image_->src_stride();
+ const int dst_stride = image_->dst_stride();
+
+ if (ref) {
+ av1_highbd_convolve_2d_scale_c(
+ src, src_stride, dst, dst_stride, width_, height_, &filter_x_.params_,
+ &filter_y_.params_, subpel_x_, kXStepQn, subpel_y_, kYStepQn,
+ &convolve_params_, bd_);
+ } else {
+ tst_fun_(src, src_stride, dst, dst_stride, width_, height_,
+ &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn,
+ subpel_y_, kYStepQn, &convolve_params_, bd_);
+ }
+ }
+
+ private:
+ HighbdConvolveFunc tst_fun_;
+};
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveScaleTest, Check) { Run(); }
+TEST_P(HighBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, HighBDConvolveScaleTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1),
+ ::testing::ValuesIn(kBlockDim),
+ ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+ ::testing::Bool(), ::testing::ValuesIn(kBDs)));
+
+#endif // CONFIG_HIGHBITDEPTH
} // namespace