Add an SSE4.1 implementation of av1_highbd_convolve_2d_scale
For large blocks this is about 8x the speed of the C version. The code
needs SSE 4.1 for the PMULLD instruction that we use to do SIMD 32-bit
multiplies.
The patch uses av1_convolve_scale_test (written already to test the
low bit depth path) to make sure the optimised code matches the C
version.
Change-Id: I9304d6bb3d2cb31390de93ed08ff1a852e3ace86
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 4834200..f5bf46b 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -650,7 +650,11 @@
specialize qw/av1_highbd_convolve_2d ssse3/;
add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd";
specialize qw/av1_highbd_convolve_rounding avx2/;
+
add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+ if (aom_config("CONFIG_COMPOUND_ROUND") ne "yes") {
+ specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
+ }
}
}
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index 1f7643d..1f0fedb 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -244,8 +244,7 @@
static void vfilter(const int32_t *src, int src_stride, int32_t *dst,
int dst_stride, int w, int h, int subpel_y_qn,
int y_step_qn, const InterpFilterParams *filter_params,
- const ConvolveParams *conv_params) {
- const int bd = 8;
+ const ConvolveParams *conv_params, int bd) {
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int ntaps = filter_params->taps;
@@ -331,8 +330,7 @@
static void vfilter8(const int32_t *src, int src_stride, int32_t *dst,
int dst_stride, int w, int h, int subpel_y_qn,
int y_step_qn, const InterpFilterParams *filter_params,
- const ConvolveParams *conv_params) {
- const int bd = 8;
+ const ConvolveParams *conv_params, int bd) {
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int ntaps = 8;
@@ -433,8 +431,215 @@
// vertical filter (input is transposed)
if (ytaps == 8)
vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params);
+ filter_params_y, conv_params, 8);
else
vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params);
+ filter_params_y, conv_params, 8);
}
+
+#if CONFIG_HIGHBITDEPTH
+// An wrapper to generate the SHUFPD instruction with __m128i types (just
+// writing _mm_shuffle_pd at the callsites gets a bit ugly because of the
+// casts)
+static __m128i mm_shuffle0_si128(__m128i a, __m128i b) {
+ __m128d ad = _mm_castsi128_pd(a);
+ __m128d bd = _mm_castsi128_pd(b);
+ return _mm_castpd_si128(_mm_shuffle_pd(ad, bd, 0));
+}
+
+// The horizontal filter for av1_highbd_convolve_2d_scale_sse4_1. This
+// is the more general version, supporting 10 and 12 tap filters. For
+// 8-tap filters, use hfilter8.
+static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst,
+ int w, int h, int subpel_x_qn, int x_step_qn,
+ const InterpFilterParams *filter_params,
+ unsigned round, int bd) {
+ const int ntaps = filter_params->taps;
+ assert(ntaps == 10 || ntaps == 12);
+
+ src -= ntaps / 2 - 1;
+
+ // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero
+ // out the unneeded entries.
+ const __m128i hicoeff_mask = make_1012_mask(ntaps);
+
+ int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
+ const __m128i round_add = _mm_set1_epi32(round_add32);
+ const __m128i round_shift = extend_32_to_128(round);
+
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+
+ // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the
+ // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they
+ // are masked out with hicoeff_mask.
+ const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
+ const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask);
+
+ int y;
+ for (y = 0; y <= h - 4; y += 4) {
+ const uint16_t *const src0 = src_col + y * src_stride;
+ const uint16_t *const src1 = src0 + 1 * src_stride;
+ const uint16_t *const src2 = src0 + 2 * src_stride;
+ const uint16_t *const src3 = src0 + 3 * src_stride;
+
+ // Load up source data. This is 16-bit input data, so each load gets 8
+ // pixels (we need at most 12)
+ const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
+ const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
+ const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
+ const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
+ const __m128i data0hi = _mm_loadu_si128((__m128i *)(src0 + 8));
+ const __m128i data1hi = _mm_loadu_si128((__m128i *)(src1 + 8));
+ const __m128i data2hi = _mm_loadu_si128((__m128i *)(src2 + 8));
+ const __m128i data3hi = _mm_loadu_si128((__m128i *)(src3 + 8));
+
+ // The "hi" data has rubbish in the top half so interleave pairs together
+ // to minimise the calculation we need to do.
+ const __m128i data01hi = mm_shuffle0_si128(data0hi, data1hi);
+ const __m128i data23hi = mm_shuffle0_si128(data2hi, data3hi);
+
+ // Multiply by coefficients
+ const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
+ const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
+ const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
+ const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+ const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi);
+ const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi);
+
+ // Reduce horizontally and add
+ const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
+ const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
+ const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo);
+ const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi);
+ const __m128i conv = _mm_add_epi32(convlo, convhi);
+
+ // Divide down by (1 << round), rounding to nearest.
+ const __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+
+ // Write transposed to the output
+ _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
+ }
+ for (; y < h; ++y) {
+ const uint16_t *const src_row = src_col + y * src_stride;
+
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < ntaps; ++k) {
+ sum += filter[k] * src_row[k];
+ }
+
+ dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
+ }
+ }
+}
+
+// A specialised version of hfilter, the horizontal filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int32_t *dst,
+ int w, int h, int subpel_x_qn, int x_step_qn,
+ const InterpFilterParams *filter_params,
+ unsigned round, int bd) {
+ const int ntaps = 8;
+
+ src -= ntaps / 2 - 1;
+
+ int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
+ const __m128i round_add = _mm_set1_epi32(round_add32);
+ const __m128i round_shift = extend_32_to_128(round);
+
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
+
+ // Load the filter coefficients
+ const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
+
+ int y;
+ for (y = 0; y <= h - 4; y += 4) {
+ const uint16_t *const src0 = src_col + y * src_stride;
+ const uint16_t *const src1 = src0 + 1 * src_stride;
+ const uint16_t *const src2 = src0 + 2 * src_stride;
+ const uint16_t *const src3 = src0 + 3 * src_stride;
+
+ // Load up source data. This is 16-bit input data, so each load gets the 8
+ // pixels we need.
+ const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
+ const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
+ const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
+ const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
+
+ // Multiply by coefficients
+ const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
+ const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
+ const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
+ const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+
+ // Reduce horizontally and add
+ const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
+ const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
+ const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
+
+ // Divide down by (1 << round), rounding to nearest.
+ const __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+
+ // Write transposed to the output
+ _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
+ }
+ for (; y < h; ++y) {
+ const uint16_t *const src_row = src_col + y * src_stride;
+
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < ntaps; ++k) {
+ sum += filter[k] * src_row[k];
+ }
+
+ dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
+ }
+ }
+}
+
+void av1_highbd_convolve_2d_scale_sse4_1(
+ const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
+ int w, int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd) {
+ int32_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+ filter_params_y->taps;
+
+ const int xtaps = filter_params_x->taps;
+ const int ytaps = filter_params_y->taps;
+ const int fo_vert = ytaps / 2 - 1;
+
+ // horizontal filter
+ if (xtaps == 8)
+ highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
+ subpel_x_qn, x_step_qn, filter_params_x,
+ conv_params->round_0, bd);
+ else
+ highbd_hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
+ subpel_x_qn, x_step_qn, filter_params_x,
+ conv_params->round_0, bd);
+
+ // vertical filter (input is transposed)
+ if (ytaps == 8)
+ vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
+ filter_params_y, conv_params, bd);
+ else
+ vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
+ filter_params_y, conv_params, bd);
+}
+#endif // CONFIG_HIGHBITDEPTH
diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc
index ac5281e..9d8be88 100644
--- a/test/av1_convolve_scale_test.cc
+++ b/test/av1_convolve_scale_test.cc
@@ -398,4 +398,72 @@
::testing::ValuesIn(kBlockDim),
::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
::testing::Bool()));
+
+#if CONFIG_HIGHBITDEPTH
+typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride,
+ int32_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd);
+
+// Test parameter list:
+// <tst_fun, dims, ntaps_x, ntaps_y, avg, bd>
+typedef tuple<HighbdConvolveFunc, BlockDimension, NTaps, NTaps, bool, int>
+ HighBDParams;
+
+class HighBDConvolveScaleTest
+ : public ConvolveScaleTestBase<uint16_t>,
+ public ::testing::WithParamInterface<HighBDParams> {
+ public:
+ virtual ~HighBDConvolveScaleTest() {}
+
+ void SetUp() {
+ tst_fun_ = GET_PARAM(0);
+
+ const BlockDimension &block = GET_PARAM(1);
+ const NTaps ntaps_x = GET_PARAM(2);
+ const NTaps ntaps_y = GET_PARAM(3);
+ const bool avg = GET_PARAM(4);
+ const int bd = GET_PARAM(5);
+
+ SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
+ }
+
+ void RunOne(bool ref) {
+ const uint16_t *src = image_->GetSrcData(ref, false);
+ CONV_BUF_TYPE *dst = image_->GetDstData(ref, false);
+ const int src_stride = image_->src_stride();
+ const int dst_stride = image_->dst_stride();
+
+ if (ref) {
+ av1_highbd_convolve_2d_scale_c(
+ src, src_stride, dst, dst_stride, width_, height_, &filter_x_.params_,
+ &filter_y_.params_, subpel_x_, kXStepQn, subpel_y_, kYStepQn,
+ &convolve_params_, bd_);
+ } else {
+ tst_fun_(src, src_stride, dst, dst_stride, width_, height_,
+ &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn,
+ subpel_y_, kYStepQn, &convolve_params_, bd_);
+ }
+ }
+
+ private:
+ HighbdConvolveFunc tst_fun_;
+};
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveScaleTest, Check) { Run(); }
+TEST_P(HighBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, HighBDConvolveScaleTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1),
+ ::testing::ValuesIn(kBlockDim),
+ ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+ ::testing::Bool(), ::testing::ValuesIn(kBDs)));
+
+#endif // CONFIG_HIGHBITDEPTH
} // namespace