Highbd D45E intrapred SSE2/AVX2 speedup

Function  SSE2 vs C  AVX2 vs C
4x4       ~4.5x
4x8       ~4.5x
8x4       ~11.7x
8x8       ~12.7x
8x16      ~14.0x
16x8                 ~21.7x
16x16                ~24.0x
16x32                ~28.7x
32x16                ~20.5x
32x32                ~24.4x

Change-Id: Iaca49727d8df17b7f793b774a8d51a401ef8a8d1
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index dc8d24a..11b55ca 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -246,6 +246,7 @@
   set(AOM_DSP_COMMON_INTRIN_AVX2
       ${AOM_DSP_COMMON_INTRIN_AVX2}
       "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
+      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_avx2.c"
       "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
 else ()
   set(AOM_DSP_COMMON_INTRIN_DSPR2
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 7037c0e..950db02 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -85,6 +85,7 @@
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.c
 DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_ssse3.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_intrapred_avx2.c
 endif  # CONFIG_HIGHBITDEPTH
 
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index d07241a..f4f6c64 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -255,6 +255,17 @@
   specialize qw/aom_highbd_d153_predictor_8x8 ssse3/;
   specialize qw/aom_highbd_d153_predictor_16x16 ssse3/;
   specialize qw/aom_highbd_d153_predictor_32x32 ssse3/;
+
+  specialize qw/aom_highbd_d45e_predictor_4x4 sse2/;
+  specialize qw/aom_highbd_d45e_predictor_4x8 sse2/;
+  specialize qw/aom_highbd_d45e_predictor_8x4 sse2/;
+  specialize qw/aom_highbd_d45e_predictor_8x8 sse2/;
+  specialize qw/aom_highbd_d45e_predictor_8x16 sse2/;
+  specialize qw/aom_highbd_d45e_predictor_16x8 avx2/;
+  specialize qw/aom_highbd_d45e_predictor_16x16 avx2/;
+  specialize qw/aom_highbd_d45e_predictor_16x32 avx2/;
+  specialize qw/aom_highbd_d45e_predictor_32x16 avx2/;
+  specialize qw/aom_highbd_d45e_predictor_32x32 avx2/;
 }  # CONFIG_HIGHBITDEPTH
 
 #
diff --git a/aom_dsp/x86/highbd_intrapred_avx2.c b/aom_dsp/x86/highbd_intrapred_avx2.c
new file mode 100644
index 0000000..41b55c9
--- /dev/null
+++ b/aom_dsp/x86/highbd_intrapred_avx2.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+// D45E_PRED
+/*
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+*/
+static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y,  // rounded (x + 2y + z + 2) >> 2 per 16-bit lane (see trick above)
+                                 const __m256i *z) {
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a = _mm256_avg_epu16(*x, *z);  // (x + z + 1) >> 1
+  const __m256i b =
+      _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one));  // cancel the rounding bias when x^z is odd
+  return _mm256_avg_epu16(b, *y);  // final averaging folds in the 2y term
+}
+
+static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1,  // store one 16-pixel row of avg3(a0, a1, a2), then advance *dst by one row
+                            const __m256i *a2, uint16_t **dst,
+                            ptrdiff_t stride) {
+  const __m256i y = avg3_epu16(a0, a1, a2);
+  _mm256_storeu_si256((__m256i *)*dst, y);
+  *dst += stride;
+}
+
+void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,  // D45E 16x8: row r, col c = avg3(above[r+c], above[r+c+1], above[r+c+2])
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+  d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));  // rotate x0/x1/x2 so each row needs only one new load
+    d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x0, &x1, &x2, &dst, stride);
+  } while (i < 9);
+
+  x0 = _mm256_loadu_si256((const __m256i *)(above + 9));  // NOTE(review): reads above[9..24]; verify above[24] is readable -- the insert below only fixes the value, not the load
+  x0 = _mm256_insert_epi16(x0, above[23], 15);  // last row clamps past the edge by replicating the final above sample
+  const __m256i y = avg3_epu16(&x1, &x2, &x0);
+  _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,  // D45E 16x16: row r, col c = avg3(above[r+c], above[r+c+1], above[r+c+2])
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+  d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));  // rotate x0/x1/x2 so each row needs only one new load
+    d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x0, &x1, &x2, &dst, stride);
+  } while (i < 15);
+
+  x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
+  d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+  x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
+  d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+  x2 = _mm256_loadu_si256((const __m256i *)(above + 17));  // NOTE(review): reads above[17..32]; verify above[32] is readable before the insert overwrites it
+  x2 = _mm256_insert_epi16(x2, above[31], 15);  // last row clamps past the edge by replicating the final above sample
+  const __m256i y = avg3_epu16(&x0, &x1, &x2);
+  _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,  // D45E 16x32: row r, col c = avg3(above[r+c], above[r+c+1], above[r+c+2])
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+
+  d45e_w16(&x0, &x1, &x2, &dst, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));  // rotate x0/x1/x2 so each row needs only one new load
+    d45e_w16(&x1, &x2, &x0, &dst, stride);
+
+    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x2, &x0, &x1, &dst, stride);
+
+    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
+    d45e_w16(&x0, &x1, &x2, &dst, stride);
+  } while (i < 33);
+
+  x0 = _mm256_loadu_si256((const __m256i *)(above + 33));  // NOTE(review): reads above[33..48]; verify above[48] is readable before the insert overwrites it
+  x0 = _mm256_insert_epi16(x0, above[47], 15);  // last row clamps past the edge by replicating the final above sample
+  const __m256i y = avg3_epu16(&x1, &x2, &x0);
+  _mm256_storeu_si256((__m256i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,  // D45E 32x16: processed as two interleaved 16-wide halves (x* = cols 0-15, y* = cols 16-31)
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+  __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
+  __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
+  __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
+
+  uint16_t *dst1 = dst;
+  uint16_t *dst2 = dst + 16;  // right half of each output row
+
+  d45e_w16(&x0, &x1, &x2, &dst1, stride);
+  d45e_w16(&y0, &y1, &y2, &dst2, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm256_loadu_si256((const __m256i *)(above + i));  // rotate registers so each half needs one new load per row
+    d45e_w16(&x1, &x2, &x0, &dst1, stride);
+    y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+    x1 = _mm256_loadu_si256((const __m256i *)(above + i));
+    d45e_w16(&x2, &x0, &x1, &dst1, stride);
+    y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+    x2 = _mm256_loadu_si256((const __m256i *)(above + i));
+    d45e_w16(&x0, &x1, &x2, &dst1, stride);
+    y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y0, &y1, &y2, &dst2, stride);
+  } while (i < 15);
+
+  x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
+  d45e_w16(&x1, &x2, &x0, &dst1, stride);
+  y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15));
+  d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+  x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
+  d45e_w16(&x2, &x0, &x1, &dst1, stride);
+  y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16));
+  d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+  x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
+  __m256i u = avg3_epu16(&x0, &x1, &x2);
+  _mm256_storeu_si256((__m256i *)dst1, u);
+
+  y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17));  // NOTE(review): reads above[33..48]; verify above[48] is readable before the insert overwrites it
+  y2 = _mm256_insert_epi16(y2, above[47], 15);  // last row clamps past the edge by replicating the final above sample
+  u = avg3_epu16(&y0, &y1, &y2);
+  _mm256_storeu_si256((__m256i *)dst2, u);
+}
+
+void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,  // D45E 32x32: processed as two interleaved 16-wide halves (x* = cols 0-15, y* = cols 16-31)
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
+  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
+  __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
+  __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
+  __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
+
+  uint16_t *dst1 = dst;
+  uint16_t *dst2 = dst + 16;  // right half of each output row
+
+  d45e_w16(&x0, &x1, &x2, &dst1, stride);
+  d45e_w16(&y0, &y1, &y2, &dst2, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm256_loadu_si256((const __m256i *)(above + i));  // rotate registers so each half needs one new load per row
+    d45e_w16(&x1, &x2, &x0, &dst1, stride);
+    y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y1, &y2, &y0, &dst2, stride);
+
+    x1 = _mm256_loadu_si256((const __m256i *)(above + i));
+    d45e_w16(&x2, &x0, &x1, &dst1, stride);
+    y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y2, &y0, &y1, &dst2, stride);
+
+    x2 = _mm256_loadu_si256((const __m256i *)(above + i));
+    d45e_w16(&x0, &x1, &x2, &dst1, stride);
+    y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
+    d45e_w16(&y0, &y1, &y2, &dst2, stride);
+  } while (i < 33);
+
+  x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
+  __m256i u = avg3_epu16(&x1, &x2, &x0);
+  _mm256_storeu_si256((__m256i *)dst1, u);
+
+  y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33));  // NOTE(review): reads above[49..64]; verify above[64] is readable before the insert overwrites it
+  y0 = _mm256_insert_epi16(y0, above[63], 15);  // last row clamps past the edge by replicating the final above sample
+  u = avg3_epu16(&y1, &y2, &y0);
+  _mm256_storeu_si256((__m256i *)dst2, u);
+}
diff --git a/aom_dsp/x86/highbd_intrapred_sse2.c b/aom_dsp/x86/highbd_intrapred_sse2.c
index 7200044..691e166 100644
--- a/aom_dsp/x86/highbd_intrapred_sse2.c
+++ b/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -1094,3 +1094,163 @@
   dst += stride;
   _mm_storel_epi64((__m128i *)dst, row3);
 }
+
+void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,  // D45E 4x4: all 4 rows come from one avg3 of above[0..7], shifted per row
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);  // above[1..7], 0
+  __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);  // above[2..7], 0, 0
+  CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6);  // replicate the final above sample where the shift ran out
+  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+  (void)left;
+  (void)bd;
+  _mm_storel_epi64((__m128i *)dst, avg3);  // row r stores avg3 shifted right by r lanes
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
+}
+
+void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,  // D45E 4x8: two avg3 vectors cover rows 0-4 and 5-7, shifted per row
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m128i h76543210 = _mm_load_si128((const __m128i *)above);
+  __m128i hx7654321 = _mm_srli_si128(h76543210, 2);  // above[1..7], hole in lane 7
+  __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7);  // fill lane 7 with above[8]
+  __m128i hx8765432 = _mm_srli_si128(h87654321, 2);
+  __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7);  // above[2..9]
+  __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432);
+  _mm_storel_epi64((__m128i *)dst, avg3);  // rows 0-4: avg3 shifted right by the row index
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8));
+  dst += stride;
+
+  // hcba98765
+  h76543210 = _mm_loadu_si128((const __m128i *)((above + 5)));  // NOTE(review): reads above[5..12]; verify above[12] is readable before the insert overwrites it
+  h76543210 = _mm_insert_epi16(h76543210, above[11], 7);  // clamp past the edge by replicating the final above sample
+  // hxcba9876
+  hx7654321 = _mm_srli_si128(h76543210, 2);
+  // hxxcba987
+  hx8765432 = _mm_srli_si128(h76543210, 4);
+  avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432);
+  _mm_storel_epi64((__m128i *)dst, avg3);  // rows 5-7
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+}
+
+void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,  // D45E 8x4: row r, col c = avg3(above[r+c], above[r+c+1], above[r+c+2])
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m128i x0 = _mm_load_si128((const __m128i *)above);
+  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
+  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
+  __m128i y = avg3_epu16(&x0, &x1, &x2);
+  _mm_store_si128((__m128i *)dst, y);
+  dst += stride;
+
+  x0 = _mm_loadu_si128((const __m128i *)(above + 3));  // rotate x0/x1/x2 so each row needs only one new load
+  y = avg3_epu16(&x1, &x2, &x0);
+  _mm_store_si128((__m128i *)dst, y);
+  dst += stride;
+
+  x1 = _mm_loadu_si128((const __m128i *)(above + 4));
+  y = avg3_epu16(&x2, &x0, &x1);
+  _mm_store_si128((__m128i *)dst, y);
+  dst += stride;
+
+  x2 = _mm_loadu_si128((const __m128i *)(above + 5));  // NOTE(review): reads above[5..12]; verify above[12] is readable before the insert overwrites it
+  x2 = _mm_insert_epi16(x2, above[11], 7);  // last row clamps past the edge by replicating the final above sample
+  y = avg3_epu16(&x0, &x1, &x2);
+  _mm_store_si128((__m128i *)dst, y);
+}
+
+static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1,  // store one 8-pixel row of avg3(a0, a1, a2), then advance *dst by one row
+                           const __m128i *a2, uint16_t **dst,
+                           ptrdiff_t stride) {
+  const __m128i y = avg3_epu16(a0, a1, a2);
+  _mm_storeu_si128((__m128i *)*dst, y);
+  *dst += stride;
+}
+
+void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,  // D45E 8x8: row r, col c = avg3(above[r+c], above[r+c+1], above[r+c+2])
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m128i x0 = _mm_load_si128((const __m128i *)above);
+  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
+  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
+
+  d45e_w8(&x0, &x1, &x2, &dst, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm_loadu_si128((const __m128i *)(above + i++));  // rotate x0/x1/x2 so each row needs only one new load
+    d45e_w8(&x1, &x2, &x0, &dst, stride);
+
+    x1 = _mm_loadu_si128((const __m128i *)(above + i++));
+    d45e_w8(&x2, &x0, &x1, &dst, stride);
+
+    x2 = _mm_loadu_si128((const __m128i *)(above + i++));
+    d45e_w8(&x0, &x1, &x2, &dst, stride);
+  } while (i < 9);
+
+  x0 = _mm_loadu_si128((const __m128i *)(above + 9));  // NOTE(review): reads above[9..16]; verify above[16] is readable before the insert overwrites it
+  x0 = _mm_insert_epi16(x0, above[15], 7);  // last row clamps past the edge by replicating the final above sample
+  const __m128i y = avg3_epu16(&x1, &x2, &x0);
+  _mm_store_si128((__m128i *)dst, y);
+}
+
+void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,  // D45E 8x16: row r, col c = avg3(above[r+c], above[r+c+1], above[r+c+2])
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  (void)left;
+  (void)bd;
+  __m128i x0 = _mm_load_si128((const __m128i *)above);
+  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
+  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
+
+  d45e_w8(&x0, &x1, &x2, &dst, stride);
+
+  int i = 3;
+  do {
+    x0 = _mm_loadu_si128((const __m128i *)(above + i++));  // rotate x0/x1/x2 so each row needs only one new load
+    d45e_w8(&x1, &x2, &x0, &dst, stride);
+
+    x1 = _mm_loadu_si128((const __m128i *)(above + i++));
+    d45e_w8(&x2, &x0, &x1, &dst, stride);
+
+    x2 = _mm_loadu_si128((const __m128i *)(above + i++));
+    d45e_w8(&x0, &x1, &x2, &dst, stride);
+  } while (i < 15);
+
+  x0 = _mm_loadu_si128((const __m128i *)(above + 15));
+  __m128i y = avg3_epu16(&x1, &x2, &x0);
+  _mm_store_si128((__m128i *)dst, y);
+  dst += stride;
+
+  x1 = _mm_loadu_si128((const __m128i *)(above + 16));
+  y = avg3_epu16(&x2, &x0, &x1);
+  _mm_store_si128((__m128i *)dst, y);
+  dst += stride;
+
+  x2 = _mm_loadu_si128((const __m128i *)(above + 17));  // NOTE(review): reads above[17..24]; verify above[24] is readable before the insert overwrites it
+  x2 = _mm_insert_epi16(x2, above[23], 7);  // last row clamps past the edge by replicating the final above sample
+  y = avg3_epu16(&x0, &x1, &x2);
+  _mm_store_si128((__m128i *)dst, y);
+}