Highbd D45E intrapred SSE2/AVX2 speedup
Function    SSE2 vs C    AVX2 vs C
4x4         ~4.5x
4x8         ~4.5x
8x4         ~11.7x
8x8         ~12.7x
8x16        ~14.0x
16x8                     ~21.7x
16x16                    ~24.0x
16x32                    ~28.7x
32x16                    ~20.5x
32x32                    ~24.4x
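
For reference, a minimal scalar sketch of what each kernel computes
(the d45e_model() name is ours; the AVG3 macro and the final-tap clamp
are inferred from the boundary handling in the intrinsics below, and
<stdint.h>/<stddef.h> types are assumed):

  // bw x bh block; above[] holds bw + bh reconstructed samples.
  #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
  static void d45e_model(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
                         const uint16_t *above) {
    for (int r = 0; r < bh; ++r, dst += stride) {
      for (int c = 0; c < bw; ++c) {
        const int last = (r + c + 2 < bw + bh) ? r + c + 2 : bw + bh - 1;
        dst[c] = AVG3(above[r + c], above[r + c + 1], above[last]);
      }
    }
  }
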
Change-Id: Iaca49727d8df17b7f793b774a8d51a401ef8a8d1
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index dc8d24a..11b55ca 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -246,6 +246,7 @@
set(AOM_DSP_COMMON_INTRIN_AVX2
${AOM_DSP_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
else ()
set(AOM_DSP_COMMON_INTRIN_DSPR2
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 7037c0e..950db02 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -85,6 +85,7 @@
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.c
DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_ssse3.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_intrapred_avx2.c
endif # CONFIG_HIGHBITDEPTH
DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index d07241a..f4f6c64 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -255,6 +255,17 @@
specialize qw/aom_highbd_d153_predictor_8x8 ssse3/;
specialize qw/aom_highbd_d153_predictor_16x16 ssse3/;
specialize qw/aom_highbd_d153_predictor_32x32 ssse3/;
+
+ specialize qw/aom_highbd_d45e_predictor_4x4 sse2/;
+ specialize qw/aom_highbd_d45e_predictor_4x8 sse2/;
+ specialize qw/aom_highbd_d45e_predictor_8x4 sse2/;
+ specialize qw/aom_highbd_d45e_predictor_8x8 sse2/;
+ specialize qw/aom_highbd_d45e_predictor_8x16 sse2/;
+ specialize qw/aom_highbd_d45e_predictor_16x8 avx2/;
+ specialize qw/aom_highbd_d45e_predictor_16x16 avx2/;
+ specialize qw/aom_highbd_d45e_predictor_16x32 avx2/;
+ specialize qw/aom_highbd_d45e_predictor_32x16 avx2/;
+ specialize qw/aom_highbd_d45e_predictor_32x32 avx2/;
} # CONFIG_HIGHBITDEPTH
#
diff --git a/aom_dsp/x86/highbd_intrapred_avx2.c b/aom_dsp/x86/highbd_intrapred_avx2.c
new file mode 100644
index 0000000..41b55c9
--- /dev/null
+++ b/aom_dsp/x86/highbd_intrapred_avx2.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+// D45E_PRED
+/*
+ * Trick from Pascal: (x + 2 * y + z + 2) >> 2 can be calculated as:
+ *   result = avg(x, z)
+ *   result -= xor(x, z) & 1
+ *   result = avg(result, y)
+ * where avg(a, b) = (a + b + 1) >> 1, i.e. the rounding of _mm256_avg_epu16.
+ */
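+//
+// Worked example with hypothetical lane values x = 1, y = 3, z = 2:
+//   exact:       (1 + 2 * 3 + 2 + 2) >> 2 = 2
+//   avg(x, z)  = (1 + 2 + 1) >> 1 = 2
+//   correction = (1 ^ 2) & 1 = 1, so 2 - 1 = 1
+//   avg(1, y)  = (1 + 3 + 1) >> 1 = 2, matching the exact value.
+// Without the correction, avg()'s upward rounding would give 3 instead.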
+static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y,
+ const __m256i *z) {
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a = _mm256_avg_epu16(*x, *z);
+ const __m256i b =
+ _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one));
+ return _mm256_avg_epu16(b, *y);
+}
+
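+// Write one 16-wide output row: the 3-tap rounded average of *a0, *a1, *a2.
+// Advances *dst by one row so consecutive calls emit consecutive rows.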
+static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1,
+ const __m256i *a2, uint16_t **dst,
+ ptrdiff_t stride) {
+ const __m256i y = avg3_epu16(a0, a1, a2);
+ _mm256_storeu_si256((__m256i *)*dst, y);
+ *dst += stride;
+}
+
+void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+ int i = 3;
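+  // Rotate x0/x1/x2 through the three tap positions so that each output
+  // row needs only one fresh load from above[]; the last row is emitted
+  // outside the loop so its final tap can be clamped to above[23].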
+ do {
+ x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+ } while (i < 9);
+
+ x0 = _mm256_loadu_si256((const __m256i *)(above + 9));
+ x0 = _mm256_insert_epi16(x0, above[23], 15);
+ const __m256i y = avg3_epu16(&x1, &x2, &x0);
+ _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+ int i = 3;
+ do {
+ x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+ } while (i < 15);
+
+ x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
+ d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
+ d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
+ x2 = _mm256_insert_epi16(x2, above[31], 15);
+ const __m256i y = avg3_epu16(&x0, &x1, &x2);
+ _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+ int i = 3;
+ do {
+ x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+ d45e_w16(&x0, &x1, &x2, &dst, stride);
+ } while (i < 33);
+
+ x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
+ x0 = _mm256_insert_epi16(x0, above[47], 15);
+ const __m256i y = avg3_epu16(&x1, &x2, &x0);
+ _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+ __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
+ __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
+ __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
+
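+  // Treat the 32-wide block as two independent 16-lane pipelines: the x*
+  // vectors cover columns 0-15 and the y* vectors cover columns 16-31,
+  // reading 16 samples further into above[].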
+ uint16_t *dst1 = dst;
+ uint16_t *dst2 = dst + 16;
+
+ d45e_w16(&x0, &x1, &x2, &dst1, stride);
+ d45e_w16(&y0, &y1, &y2, &dst2, stride);
+
+ int i = 3;
+ do {
+ x0 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x1, &x2, &x0, &dst1, stride);
+ y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x2, &x0, &x1, &dst1, stride);
+ y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x0, &x1, &x2, &dst1, stride);
+ y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y0, &y1, &y2, &dst2, stride);
+ } while (i < 15);
+
+ x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
+ d45e_w16(&x1, &x2, &x0, &dst1, stride);
+ y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15));
+ d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
+ d45e_w16(&x2, &x0, &x1, &dst1, stride);
+ y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16));
+ d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
+ __m256i u = avg3_epu16(&x0, &x1, &x2);
+ _mm256_storeu_si256((__m256i *)dst1, u);
+
+ y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17));
+ y2 = _mm256_insert_epi16(y2, above[47], 15);
+ u = avg3_epu16(&y0, &y1, &y2);
+ _mm256_storeu_si256((__m256i *)dst2, u);
+}
+
+void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+ __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
+ __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
+ __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
+
+ uint16_t *dst1 = dst;
+ uint16_t *dst2 = dst + 16;
+
+ d45e_w16(&x0, &x1, &x2, &dst1, stride);
+ d45e_w16(&y0, &y1, &y2, &dst2, stride);
+
+ int i = 3;
+ do {
+ x0 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x1, &x2, &x0, &dst1, stride);
+ y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+ x1 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x2, &x0, &x1, &dst1, stride);
+ y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+ x2 = _mm256_loadu_si256((const __m256i *)(above + i));
+ d45e_w16(&x0, &x1, &x2, &dst1, stride);
+ y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+ d45e_w16(&y0, &y1, &y2, &dst2, stride);
+ } while (i < 33);
+
+ x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
+ __m256i u = avg3_epu16(&x1, &x2, &x0);
+ _mm256_storeu_si256((__m256i *)dst1, u);
+
+ y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33));
+ y0 = _mm256_insert_epi16(y0, above[63], 15);
+ u = avg3_epu16(&y1, &y2, &y0);
+ _mm256_storeu_si256((__m256i *)dst2, u);
+}
diff --git a/aom_dsp/x86/highbd_intrapred_sse2.c b/aom_dsp/x86/highbd_intrapred_sse2.c
index 7200044..691e166 100644
--- a/aom_dsp/x86/highbd_intrapred_sse2.c
+++ b/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -1094,3 +1094,168 @@
dst += stride;
_mm_storel_epi64((__m128i *)dst, row3);
}
+
+void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+ __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
+ CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6);
+ const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+ (void)left;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, avg3);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
+}
+
+void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m128i h76543210 = _mm_load_si128((const __m128i *)above);
+ __m128i hx7654321 = _mm_srli_si128(h76543210, 2);
+ __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7);
+ __m128i hx8765432 = _mm_srli_si128(h87654321, 2);
+ __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7);
+ __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432);
+ _mm_storel_epi64((__m128i *)dst, avg3);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8));
+ dst += stride;
+
+ // hcba98765
+  h76543210 = _mm_loadu_si128((const __m128i *)(above + 5));
+ h76543210 = _mm_insert_epi16(h76543210, above[11], 7);
+ // hxcba9876
+ hx7654321 = _mm_srli_si128(h76543210, 2);
+ // hxxcba987
+ hx8765432 = _mm_srli_si128(h76543210, 4);
+ avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432);
+ _mm_storel_epi64((__m128i *)dst, avg3);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+}
+
+void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m128i x0 = _mm_load_si128((const __m128i *)above);
+ __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
+ __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
+ __m128i y = avg3_epu16(&x0, &x1, &x2);
+ _mm_store_si128((__m128i *)dst, y);
+ dst += stride;
+
+ x0 = _mm_loadu_si128((const __m128i *)(above + 3));
+ y = avg3_epu16(&x1, &x2, &x0);
+ _mm_store_si128((__m128i *)dst, y);
+ dst += stride;
+
+ x1 = _mm_loadu_si128((const __m128i *)(above + 4));
+ y = avg3_epu16(&x2, &x0, &x1);
+ _mm_store_si128((__m128i *)dst, y);
+ dst += stride;
+
+ x2 = _mm_loadu_si128((const __m128i *)(above + 5));
+ x2 = _mm_insert_epi16(x2, above[11], 7);
+ y = avg3_epu16(&x0, &x1, &x2);
+ _mm_store_si128((__m128i *)dst, y);
+}
+
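+// Write one 8-wide output row: the 3-tap rounded average of *a0, *a1, *a2.
+// Advances *dst by one row so consecutive calls emit consecutive rows.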
+static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1,
+ const __m128i *a2, uint16_t **dst,
+ ptrdiff_t stride) {
+ const __m128i y = avg3_epu16(a0, a1, a2);
+ _mm_storeu_si128((__m128i *)*dst, y);
+ *dst += stride;
+}
+
+void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m128i x0 = _mm_load_si128((const __m128i *)above);
+ __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
+ __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
+
+ d45e_w8(&x0, &x1, &x2, &dst, stride);
+
+ int i = 3;
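+  // Rotate x0/x1/x2 through the three tap positions so that each output
+  // row needs only one fresh load from above[]; the last row is emitted
+  // outside the loop so its final tap can be clamped to above[15].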
+ do {
+ x0 = _mm_loadu_si128((const __m128i *)(above + i++));
+ d45e_w8(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm_loadu_si128((const __m128i *)(above + i++));
+ d45e_w8(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm_loadu_si128((const __m128i *)(above + i++));
+ d45e_w8(&x0, &x1, &x2, &dst, stride);
+ } while (i < 9);
+
+ x0 = _mm_loadu_si128((const __m128i *)(above + 9));
+ x0 = _mm_insert_epi16(x0, above[15], 7);
+ const __m128i y = avg3_epu16(&x1, &x2, &x0);
+ _mm_store_si128((__m128i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ __m128i x0 = _mm_load_si128((const __m128i *)above);
+ __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
+ __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
+
+ d45e_w8(&x0, &x1, &x2, &dst, stride);
+
+ int i = 3;
+ do {
+ x0 = _mm_loadu_si128((const __m128i *)(above + i++));
+ d45e_w8(&x1, &x2, &x0, &dst, stride);
+
+ x1 = _mm_loadu_si128((const __m128i *)(above + i++));
+ d45e_w8(&x2, &x0, &x1, &dst, stride);
+
+ x2 = _mm_loadu_si128((const __m128i *)(above + i++));
+ d45e_w8(&x0, &x1, &x2, &dst, stride);
+ } while (i < 15);
+
+ x0 = _mm_loadu_si128((const __m128i *)(above + 15));
+ __m128i y = avg3_epu16(&x1, &x2, &x0);
+ _mm_store_si128((__m128i *)dst, y);
+ dst += stride;
+
+ x1 = _mm_loadu_si128((const __m128i *)(above + 16));
+ y = avg3_epu16(&x2, &x0, &x1);
+ _mm_store_si128((__m128i *)dst, y);
+ dst += stride;
+
+ x2 = _mm_loadu_si128((const __m128i *)(above + 17));
+ x2 = _mm_insert_epi16(x2, above[23], 7);
+ y = avg3_epu16(&x0, &x1, &x2);
+ _mm_store_si128((__m128i *)dst, y);
+}