Add v1_txb_init_levels_avx2

1.optimize v1_txb_init_levels_sse4_1
2.Add v1_txb_init_levels_avx2

speed up about 0.3% without rd change

test sequence: BasketballDrill_832x480_50.y4m

test command line:./aomenc --cpu-used=1 --psnr -D \
 -q --end-usage=vbr --target-bitrate=800 --limit=20 \
 BasketballDrill_832x480_50.y4m -otest.webm

Change-Id: Icb6db0c199c953bca6c8575446053e1565541c0a
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 5da5f0f..7220a40 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -280,6 +280,7 @@
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
+            "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 675a7cd..988a172 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -252,7 +252,7 @@
   add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
   specialize qw/av1_get_nz_map_contexts sse2/;
   add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
-  specialize qw/av1_txb_init_levels sse4_1/;
+  specialize qw/av1_txb_init_levels sse4_1 avx2/;
 
   add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
   specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
diff --git a/av1/encoder/x86/encodetxb_avx2.c b/av1/encoder/x86/encodetxb_avx2.c
new file mode 100644
index 0000000..7642f57
--- /dev/null
+++ b/av1/encoder/x86/encodetxb_avx2.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+#include <immintrin.h>  /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+                              const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  const __m256i y_zeros = _mm256_setzero_si256();
+
+  const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+  uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
+  uint8_t *pre_buf_end = pre_buf + pre_len;
+  do {
+    yy_storeu_256(pre_buf, y_zeros);
+    pre_buf += 32;
+  } while (pre_buf < pre_buf_end);
+
+  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+  uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
+  uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
+
+  do {
+    yy_storeu_256(bottom_buf, y_zeros);
+    bottom_buf += 32;
+  } while (bottom_buf < bottom_buf_end);
+
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (width == 4) {
+    do {
+      const __m256i c0 = yy_loadu_256(cf);
+      const __m256i c1 = yy_loadu_256(cf + 8);
+      const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+      const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
+      const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+      const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
+      yy_storeu_256(ls, res);
+      ls += 32;
+      cf += 16;
+      i += 4;
+    } while (i < height);
+  } else if (width == 8) {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      const __m128i res0 = _mm256_castsi256_si128(res);
+      const __m128i res1 = _mm256_extracti128_si256(res, 1);
+      xx_storel_64(ls, res0);
+      *(int32_t *)(ls + width) = 0;
+      xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+      *(int32_t *)(ls + width + stride) = 0;
+      xx_storel_64(ls + stride * 2, res1);
+      *(int32_t *)(ls + width + stride * 2) = 0;
+      xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+      *(int32_t *)(ls + width + stride * 3) = 0;
+      cf += 32;
+      ls += stride << 2;
+      i += 4;
+    } while (i < height);
+  } else if (width == 16) {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      xx_storeu_128(ls, _mm256_castsi256_si128(res));
+      xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+      cf += 32;
+      *(int32_t *)(ls + width) = 0;
+      *(int32_t *)(ls + stride + width) = 0;
+      ls += stride << 1;
+      i += 2;
+    } while (i < height);
+  } else {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      yy_storeu_256(ls, res);
+      cf += 32;
+      *(int32_t *)(ls + width) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/av1/encoder/x86/encodetxb_sse4.c b/av1/encoder/x86/encodetxb_sse4.c
index b3a879b..5e0687c 100644
--- a/av1/encoder/x86/encodetxb_sse4.c
+++ b/av1/encoder/x86/encodetxb_sse4.c
@@ -14,43 +14,55 @@
 #include <smmintrin.h>  /* SSE4.1 */
 
 #include "aom/aom_integer.h"
-#include "aom_dsp/x86/mem_sse2.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
 
 void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
                                 const int height, uint8_t *const levels) {
   const int stride = width + TX_PAD_HOR;
-  memset(levels - TX_PAD_TOP * stride, 0,
-         sizeof(*levels) * TX_PAD_TOP * stride);
-  memset(levels + stride * height, 0,
-         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
-
   const __m128i zeros = _mm_setzero_si128();
+
+  const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+  uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
+  uint8_t *pre_buf_end = pre_buf + pre_len;
+  do {
+    _mm_storeu_si128((__m128i *)(pre_buf), zeros);
+    pre_buf += 16;
+  } while (pre_buf < pre_buf_end);
+
+  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+  uint8_t *bottom_buf = levels + stride * height;
+  uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+  do {
+    _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+    bottom_buf += 16;
+  } while (bottom_buf < bottom_buf_end);
+
   int i = 0;
   uint8_t *ls = levels;
   const tran_low_t *cf = coeff;
   if (width == 4) {
     do {
-      const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
-      const __m128i coeffB = _mm_load_si128((__m128i *)(cf + width));
+      const __m128i coeffA = xx_loadu_128(cf);
+      const __m128i coeffB = xx_loadu_128(cf + 4);
       const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
       const __m128i absAB = _mm_abs_epi16(coeffAB);
       const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
       const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
-      _mm_storeu_si128((__m128i *)ls, lsAB);
+      xx_storeu_128(ls, lsAB);
       ls += (stride << 1);
       cf += (width << 1);
       i += 2;
     } while (i < height);
   } else if (width == 8) {
     do {
-      const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
-      const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
+      const __m128i coeffA = xx_loadu_128(cf);
+      const __m128i coeffB = xx_loadu_128(cf + 4);
       const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
       const __m128i absAB = _mm_abs_epi16(coeffAB);
       const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
-      _mm_storeu_si128((__m128i *)ls, absAB8);
+      xx_storeu_128(ls, absAB8);
       ls += stride;
       cf += width;
       i += 1;
@@ -59,16 +71,16 @@
     do {
       int j = 0;
       do {
-        const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
-        const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
-        const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8));
-        const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12));
+        const __m128i coeffA = xx_loadu_128(cf);
+        const __m128i coeffB = xx_loadu_128(cf + 4);
+        const __m128i coeffC = xx_loadu_128(cf + 8);
+        const __m128i coeffD = xx_loadu_128(cf + 12);
         const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
         const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
         const __m128i absAB = _mm_abs_epi16(coeffAB);
         const __m128i absCD = _mm_abs_epi16(coeffCD);
         const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
-        _mm_storeu_si128((__m128i *)(ls + j), absABCD);
+        xx_storeu_128(ls + j, absABCD);
         j += 16;
         cf += 16;
       } while (j < width);
diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc
index ab6ec72..11cc073 100644
--- a/test/encodetxb_test.cc
+++ b/test/encodetxb_test.cc
@@ -181,15 +181,22 @@
                         ::testing::Values(av1_get_nz_map_contexts_sse2));
 #endif
 
-#if HAVE_SSE4_1
-class EncodeTxbInitLevelTest : public ::testing::TestWithParam<int> {
+typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff,
+                                         const int width, const int height,
+                                         uint8_t *const levels);
+
+typedef ::testing::tuple<av1_txb_init_levels_func, int> TxbInitLevelParam;
+
+class EncodeTxbInitLevelTest
+    : public ::testing::TestWithParam<TxbInitLevelParam> {
  public:
   virtual ~EncodeTxbInitLevelTest() {}
   virtual void TearDown() { libaom_test::ClearSystemState(); }
-  void RunTest(int tx_size, int is_speed);
+  void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed);
 };
 
-void EncodeTxbInitLevelTest::RunTest(int tx_size, int is_speed) {
+void EncodeTxbInitLevelTest::RunTest(av1_txb_init_levels_func test_func,
+                                     int tx_size, int is_speed) {
   const int width = get_txb_wide((TX_SIZE)tx_size);
   const int height = get_txb_high((TX_SIZE)tx_size);
   tran_low_t coeff[MAX_TX_SQUARE];
@@ -215,7 +222,7 @@
   const double t1 = get_time_mark(&timer);
   aom_usec_timer_start(&timer);
   for (int i = 0; i < run_times; ++i) {
-    av1_txb_init_levels_sse4_1(coeff, width, height, levels1);
+    test_func(coeff, width, height, levels1);
   }
   const double t2 = get_time_mark(&timer);
   if (is_speed) {
@@ -232,11 +239,24 @@
   }
 }
 
-TEST_P(EncodeTxbInitLevelTest, match) { RunTest(GetParam(), 0); }
-TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) { RunTest(GetParam(), 1); }
+TEST_P(EncodeTxbInitLevelTest, match) {
+  RunTest(GET_PARAM(0), GET_PARAM(1), 0);
+}
 
-INSTANTIATE_TEST_CASE_P(SSE4_1, EncodeTxbInitLevelTest,
-                        ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1));
+TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) {
+  RunTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, EncodeTxbInitLevelTest,
+    ::testing::Combine(::testing::Values(&av1_txb_init_levels_sse4_1),
+                       ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
 #endif
-
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, EncodeTxbInitLevelTest,
+    ::testing::Combine(::testing::Values(&av1_txb_init_levels_avx2),
+                       ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
+#endif
 }  // namespace