Add an SSE4.1 implementation of av1_highbd_convolve_2d_scale

For large blocks this is about 8x the speed of the C version. The code
needs SSE 4.1 for the PMULLD instruction that we use to do SIMD 32-bit
multiplies.

The patch uses av1_convolve_scale_test (written already to test the
low bit depth path) to make sure the optimised code matches the C
version.

Change-Id: I9304d6bb3d2cb31390de93ed08ff1a852e3ace86
diff --git a/test/ b/test/
index ac5281e..9d8be88 100644
--- a/test/
+++ b/test/
@@ -398,4 +398,72 @@
                        ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride,  // pointer type of tst_fun_; RunOne passes it the same argument list as av1_highbd_convolve_2d_scale_c
+                                   int32_t *dst, int dst_stride, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_qn, const int x_step_qn,
+                                   const int subpel_y_qn, const int y_step_qn,
+                                   ConvolveParams *conv_params, int bd);  // bd: presumably the bit depth; the values swept are kBDs = {8, 10, 12}
+// Test parameter list, in the order SetUp() reads it via GET_PARAM(0)..(5):
+//  <tst_fun, dims, ntaps_x, ntaps_y, avg, bd>
+typedef tuple<HighbdConvolveFunc, BlockDimension, NTaps, NTaps, bool, int>
+    HighBDParams;
+class HighBDConvolveScaleTest
+    : public ConvolveScaleTestBase<uint16_t>,
+      public ::testing::WithParamInterface<HighBDParams> {
+ public:
+  virtual ~HighBDConvolveScaleTest() {}
+  void SetUp() {  // unpacks this instance's HighBDParams tuple and forwards it to SetParams
+    tst_fun_ = GET_PARAM(0);  // implementation under test; invoked by RunOne(false)
+    const BlockDimension &block = GET_PARAM(1);
+    const NTaps ntaps_x = GET_PARAM(2);
+    const NTaps ntaps_y = GET_PARAM(3);
+    const bool avg = GET_PARAM(4);
+    const int bd = GET_PARAM(5);  // drawn from kBDs by the test instantiation
+    SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);  // SetParams/BaseParams are not defined in this hunk; presumably provided by ConvolveScaleTestBase -- confirm
+  }
+  void RunOne(bool ref) {  // one convolve call: ref == true uses the C function, ref == false uses tst_fun_
+    const uint16_t *src = image_->GetSrcData(ref, false);  // ref is also passed to the buffer getters; NOTE(review): presumably selects separate reference/test buffers -- confirm in the image helper
+    CONV_BUF_TYPE *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+    if (ref) {  // reference path: av1_highbd_convolve_2d_scale_c
+      av1_highbd_convolve_2d_scale_c(
+          src, src_stride, dst, dst_stride, width_, height_, &filter_x_.params_,
+          &filter_y_.params_, subpel_x_, kXStepQn, subpel_y_, kYStepQn,
+          &convolve_params_, bd_);
+    } else {  // test path: the function pointer stored from GET_PARAM(0) in SetUp, called with identical arguments
+      tst_fun_(src, src_stride, dst, dst_stride, width_, height_,
+               &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn,
+               subpel_y_, kYStepQn, &convolve_params_, bd_);
+    }
+  }
+ private:
+  HighbdConvolveFunc tst_fun_;
+const int kBDs[] = { 8, 10, 12 };  // values fed to the last tuple slot (bd) through ::testing::ValuesIn(kBDs)
+TEST_P(HighBDConvolveScaleTest, Check) { Run(); }  // Run() is not defined in this hunk; presumably the base fixture's comparison driver -- confirm
+TEST_P(HighBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }  // DISABLED_ prefix: GoogleTest skips this unless run with --gtest_also_run_disabled_tests
+    SSE4_1, HighBDConvolveScaleTest,
+    ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1),
+                       ::testing::ValuesIn(kBlockDim),
+                       ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+                       ::testing::Bool(), ::testing::ValuesIn(kBDs)));
 }  // namespace