Add AVX2 variant for sub_pixel_variance width 16

Added AVX2 variants of the sub_pixel_variance function for the
16x64, 16x32, 16x16, 16x8 and 16x4 block sizes.

Test-bench measurements show an average improvement of 28%
over the SSSE3 module.

Change-Id: Iaf685ec819f0acd4a190e16c9f29319788aab899
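
For readers new to this kernel, the scalar model below shows what the
16-wide path computes: bilinearly interpolate the source at the given x/y
offsets (in 1/8-pel units), then accumulate the sum and the sum of squared
differences against dst and derive the variance. This is an illustrative
sketch only, not code from this patch; the helper name is hypothetical and
the rounding is assumed to match the two-tap reference filter. The AVX2
kernel added below specializes the offset-0 and offset-4 cases and
processes two 16-byte rows per 256-bit iteration.

#include <stdint.h>

/* Hypothetical scalar model of the new width-16 kernel (illustration only). */
static unsigned int subpel_variance16xh_model(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride, int height,
                                              unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < 16; ++j) {
      /* Two-tap bilinear interpolation, horizontal then vertical, at
       * 1/8-pel offsets (0..7); reads one extra row and column of src. */
      const int a = src[i * src_stride + j];
      const int b = src[i * src_stride + j + 1];
      const int c = src[(i + 1) * src_stride + j];
      const int d = src[(i + 1) * src_stride + j + 1];
      const int top = ((8 - x_offset) * a + x_offset * b + 4) >> 3;
      const int bot = ((8 - x_offset) * c + x_offset * d + 4) >> 3;
      const int pred = ((8 - y_offset) * top + y_offset * bot + 4) >> 3;
      const int diff = pred - dst[i * dst_stride + j];
      sum += diff;
      sse64 += (int64_t)diff * diff;
    }
  }
  *sse = (unsigned int)sse64;
  /* variance = SSE - sum^2 / (width * height) */
  return (unsigned int)(sse64 -
                        (uint64_t)(((int64_t)sum * sum) / (16 * height)));
}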
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 904044d..b7d5a41 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1161,9 +1161,9 @@
specialize qw/aom_sub_pixel_variance32x64 avx2 msa sse2 ssse3/;
specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
specialize qw/aom_sub_pixel_variance32x16 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x32 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x16 avx2 neon msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x8 avx2 msa sse2 ssse3/;
specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/;
specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/;
specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/;
@@ -1195,10 +1195,10 @@
specialize qw/aom_variance64x16 sse2 avx2/;
specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x4 avx2 sse2 ssse3/;
specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x64 avx2 sse2 ssse3/;
specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
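
The specialize lines above register the new AVX2 entry points with the
run-time CPU dispatch that aom_dsp_rtcd_defs.pl generates into
aom_dsp_rtcd.h. Roughly, the effect is the sketch below; the flag values
and the setup helper are illustrative assumptions (the generated header and
the real HAS_* flags in aom_ports/x86.h are authoritative):

#include <stdint.h>

/* Illustrative flag values; the real ones live in aom_ports/x86.h. */
#define HAS_SSSE3 0x10
#define HAS_AVX2 0x80

typedef unsigned int (*subpel_variance_fn_t)(const uint8_t *src,
                                             int src_stride, int x_offset,
                                             int y_offset, const uint8_t *dst,
                                             int dst_stride,
                                             unsigned int *sse);

/* Kernels provided elsewhere in aom_dsp (declarations shown for context). */
unsigned int aom_sub_pixel_variance16x16_c(const uint8_t *, int, int, int,
                                           const uint8_t *, int,
                                           unsigned int *);
unsigned int aom_sub_pixel_variance16x16_ssse3(const uint8_t *, int, int, int,
                                               const uint8_t *, int,
                                               unsigned int *);
unsigned int aom_sub_pixel_variance16x16_avx2(const uint8_t *, int, int, int,
                                              const uint8_t *, int,
                                              unsigned int *);

subpel_variance_fn_t aom_sub_pixel_variance16x16;

/* Sketch of the generated setup: the highest ISA level supported by the CPU
 * wins, so adding "avx2" to a specialize line routes AVX2 machines to the
 * new kernel. */
static void setup_subpel_variance16x16(int flags) {
  aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
  if (flags & HAS_SSSE3)
    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
  if (flags & HAS_AVX2)
    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
}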
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index 6371e2e..c4919ba 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -234,6 +234,10 @@
int x_offset, int y_offset,
const uint8_t *dst, int dst_stride,
int height, unsigned int *sse);
+unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse);
unsigned int aom_sub_pixel_avg_variance32xh_avx2(
const uint8_t *src, int src_stride, int x_offset, int y_offset,
@@ -276,6 +280,11 @@
AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
+AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6);
+AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5);
+AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4);
+AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3);
+AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2);
#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \
unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \
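
The five new AOM_SUB_PIXEL_VAR_AVX2 instantiations above reuse the existing
wrapper macro; because the kernel width wf equals the block width (16), each
wrapper makes a single call to the new 16xh kernel and folds its sum/SSE
output into the variance. A rough expansion for the 16x16 case is sketched
below; it is illustrative only, and the macro body in variance_avx2.c (which
also handles blocks wider than wf) is the authoritative version:

#include <stdint.h>

unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src,
                                             int src_stride, int x_offset,
                                             int y_offset, const uint8_t *dst,
                                             int dst_stride, int height,
                                             unsigned int *sse);

/* Approximate expansion of AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4). */
unsigned int aom_sub_pixel_variance16x16_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse = 0;
  /* wf == w == 16: one call to the width-16 kernel covers the whole block. */
  const int se = (int)aom_sub_pixel_variance16xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 16, &sse);
  *sse_ptr = sse;
  /* variance = SSE - sum^2 / (2^wlog2 * 2^hlog2), with wlog2 = hlog2 = 4. */
  return sse - (unsigned int)(((int64_t)se * se) >> (4 + 4));
}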
diff --git a/aom_dsp/x86/variance_impl_avx2.c b/aom_dsp/x86/variance_impl_avx2.c
index aef012b..f779270 100644
--- a/aom_dsp/x86/variance_impl_avx2.c
+++ b/aom_dsp/x86/variance_impl_avx2.c
@@ -104,6 +104,65 @@
sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+// Macros related to sub pixel variance for width 16
+#define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ /* load source and destination of 2 rows and insert*/ \
+ src_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \
+ dst_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
+ _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
+
+#define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
+ _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1); \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
+ _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define LOAD_SRC_NEXT_BYTE_INSERT \
+ /* load source and another source from next row */ \
+ src_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \
+ /* load source and next row source from 1 byte onwards */ \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);
+
+#define LOAD_DST_INSERT \
+ dst_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
+ _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
+
+#define LOAD_SRC_MERGE_128BIT(filter) \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \
+ __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \
+ __m128i filter_128bit = _mm256_castsi256_si128(filter); \
+ __m128i pw8_128bit = _mm256_castsi256_si128(pw8);
+
+#define FILTER_SRC_128BIT(filter) \
+ /* filter the source */ \
+ src_lo = _mm_maddubs_epi16(src_lo, filter); \
+ src_hi = _mm_maddubs_epi16(src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ src_lo = _mm_add_epi16(src_lo, pw8_128bit); \
+ src_hi = _mm_add_epi16(src_hi, pw8_128bit); \
+ \
+ /* divide source by 16 */ \
+ src_lo = _mm_srai_epi16(src_lo, 4); \
+ src_hi = _mm_srai_epi16(src_hi, 4);
+
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
int x_offset, int y_offset,
const uint8_t *dst, int dst_stride,
@@ -292,6 +351,244 @@
return sum;
}
+unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse) {
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ __m256i zero_reg;
+ int i, sum;
+ sum_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_set1_epi16(0);
+ zero_reg = _mm256_set1_epi16(0);
+
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ for (i = 0; i < height; i += 2) {
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride)
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += (src_stride << 1);
+ dst += (dst_stride << 1);
+ }
+ // x_offset = 0 and y_offset = 4
+ } else if (y_offset == 4) {
+ __m256i src_next_reg;
+ for (i = 0; i < height; i += 2) {
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride)
+ AVG_NEXT_SRC_INSERT(src_reg, src_stride)
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += (src_stride << 1);
+ dst += (dst_stride << 1);
+ }
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg;
+ y_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height; i += 2) {
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride)
+ MERGE_NEXT_SRC_INSERT(src_reg, src_stride)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += (src_stride << 1);
+ dst += (dst_stride << 1);
+ }
+ }
+ // x_offset = 4 and y_offset = 0
+ } else if (x_offset == 4) {
+ if (y_offset == 0) {
+ __m256i src_next_reg;
+ for (i = 0; i < height; i += 2) {
+ LOAD_SRC_NEXT_BYTE_INSERT
+ LOAD_DST_INSERT
+ /* average between current and next stride source */
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += (src_stride << 1);
+ dst += (dst_stride << 1);
+ }
+ // x_offset = 4 and y_offset = 4
+ } else if (y_offset == 4) {
+ __m256i src_next_reg, src_avg, src_temp;
+ // load and insert source and next row source
+ LOAD_SRC_NEXT_BYTE_INSERT
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+ src += src_stride << 1;
+ for (i = 0; i < height - 2; i += 2) {
+ LOAD_SRC_NEXT_BYTE_INSERT
+ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
+ src_temp = _mm256_avg_epu8(src_avg, src_temp);
+ LOAD_DST_INSERT
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_temp, zero_reg)
+ // save current source average
+ src_avg = src_next_reg;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride << 1;
+ src += src_stride << 1;
+ }
+ // last 2 rows processing happens here
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
+ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
+ src_next_reg = _mm256_permute2x128_si256(
+ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
+ LOAD_DST_INSERT
+ src_avg = _mm256_avg_epu8(src_avg, src_next_reg);
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ } else {
+ // x_offset = 4 and y_offset = bilin interpolation
+ __m256i filter, pw8, src_next_reg, src_avg, src_temp;
+ y_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ // load and insert source and next row source
+ LOAD_SRC_NEXT_BYTE_INSERT
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+ src += src_stride << 1;
+ for (i = 0; i < height - 2; i += 2) {
+ LOAD_SRC_NEXT_BYTE_INSERT
+ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
+ LOAD_DST_INSERT
+ MERGE_WITH_SRC(src_avg, src_temp)
+ // save current source average
+ src_avg = src_next_reg;
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride << 1;
+ src += src_stride << 1;
+ }
+ // last 2 rows processing happens here
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
+ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
+ src_next_reg = _mm256_permute2x128_si256(
+ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
+ LOAD_DST_INSERT
+ MERGE_WITH_SRC(src_avg, src_next_reg)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ __m256i filter, pw8, src_next_reg;
+ x_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height; i += 2) {
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride)
+ MERGE_NEXT_SRC_INSERT(src_reg, 1)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += (src_stride << 1);
+ dst += (dst_stride << 1);
+ }
+ // x_offset = bilin interpolation and y_offset = 4
+ } else if (y_offset == 4) {
+ __m256i filter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ // load and insert source and next row source
+ LOAD_SRC_NEXT_BYTE_INSERT
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+ FILTER_SRC(filter)
+      // pack the 16-bit results back to 8-bit source values in the low and high lanes
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ src += src_stride << 1;
+ for (i = 0; i < height - 2; i += 2) {
+ LOAD_SRC_NEXT_BYTE_INSERT
+ LOAD_DST_INSERT
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
+        // average between the previous pack and the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src_pack = src_reg;
+ src += src_stride << 1;
+ dst += dst_stride << 1;
+ }
+ // last 2 rows processing happens here
+ LOAD_SRC_MERGE_128BIT(filter)
+ LOAD_DST_INSERT
+ FILTER_SRC_128BIT(filter_128bit)
+ src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
+ src_next_reg = _mm256_permute2x128_si256(
+ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
+      // average between the previous pack and the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ } else {
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ xfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ y_offset <<= 5;
+ yfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ // load and insert source and next row source
+ LOAD_SRC_NEXT_BYTE_INSERT
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+ FILTER_SRC(xfilter)
+      // pack the 16-bit results back to 8-bit source values in the low and high lanes
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ src += src_stride << 1;
+ for (i = 0; i < height - 2; i += 2) {
+ LOAD_SRC_NEXT_BYTE_INSERT
+ LOAD_DST_INSERT
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+ FILTER_SRC(xfilter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
+        // average between the previous pack and the current one
+ MERGE_WITH_SRC(src_pack, src_next_reg)
+ // filter the source
+ FILTER_SRC(yfilter)
+ src_pack = src_reg;
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride << 1;
+ dst += dst_stride << 1;
+ }
+ // last 2 rows processing happens here
+ LOAD_SRC_MERGE_128BIT(xfilter)
+ LOAD_DST_INSERT
+ FILTER_SRC_128BIT(filter_128bit)
+ src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
+ src_next_reg = _mm256_permute2x128_si256(
+ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
+ MERGE_WITH_SRC(src_pack, src_next_reg)
+ FILTER_SRC(yfilter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ }
+ }
+ CALC_SUM_AND_SSE
+ _mm256_zeroupper();
+ return sum;
+}
+
unsigned int aom_sub_pixel_avg_variance32xh_avx2(
const uint8_t *src, int src_stride, int x_offset, int y_offset,
const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
diff --git a/test/variance_test.cc b/test/variance_test.cc
index d39cd5d..1458ece 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -805,7 +805,7 @@
}
}
- unsigned int sse1;
+ unsigned int sse1, sse2;
int run_time = 1000000000 / block_size();
aom_usec_timer timer;
@@ -818,8 +818,24 @@
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
- params_.bit_depth, elapsed_time);
+
+ aom_usec_timer timer_c;
+
+ aom_usec_timer_start(&timer_c);
+ for (int i = 0; i < run_time; ++i) {
+ int x = rnd_(8);
+ int y = rnd_(8);
+ subpel_variance_ref(ref_, src_, params_.log2width, params_.log2height, x, y,
+ &sse2, use_high_bit_depth(), params_.bit_depth);
+ }
+ aom_usec_timer_mark(&timer_c);
+
+ const int elapsed_time_c = static_cast<int>(aom_usec_timer_elapsed(&timer_c));
+
+ printf(
+ "sub_pixel_variance_%dx%d_%d: ref_time=%d us opt_time=%d us gain=%d \n",
+ width(), height(), params_.bit_depth, elapsed_time_c, elapsed_time,
+ elapsed_time_c / elapsed_time);
}
template <>
@@ -1076,6 +1092,7 @@
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
@@ -2272,7 +2289,12 @@
SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
- SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0)));
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0),
+ SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0),
+ SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0),
+ SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0),
+ SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0)));
INSTANTIATE_TEST_SUITE_P(
AVX2, AvxSubpelAvgVarianceTest,