Speed up convolve_round post-rounding with AVX2

- On an i7-6700, convolve rounding drops from 2.75% to 0.91% of decoder
  cycles with the AVX2 functions.
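
- The rounding step maps the 32-bit intermediate convolution buffer back
  to pixels: dst = clip_pixel((src + (1 << (bits - 1))) >> bits). The
  AVX2 path handles 32, 16, or 8 pixels at a time, with an SSE2 kernel
  for width 4 and scalar code for width 2.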

Change-Id: I34ae48f45c0b4073f8962647d2181365ffe3325b
diff --git a/av1/av1.cmake b/av1/av1.cmake
index c2811ca..f08c700 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -164,6 +164,7 @@
     "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c")
 
 set(AOM_AV1_COMMON_INTRIN_AVX2
+    "${AOM_ROOT}/av1/common/x86/convolve_avx2.c"
     "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
     "${AOM_ROOT}/av1/common/x86/hybrid_inv_txfm_avx2.c")
 
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 95263ab..d727652 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -73,6 +73,7 @@
 AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d_cfg.h
 AV1_COMMON_SRCS-yes += common/av1_inv_txfm2d.c
 AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d_cfg.h
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/convolve_avx2.c
 AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_ssse3.c
 ifeq ($(CONFIG_HIGHBITDEPTH),yes)
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index a7083cc..00ab3ec 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -656,10 +656,14 @@
 if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") {
     add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
     specialize qw/av1_convolve_2d sse2/;
+    add_proto qw/void av1_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits";
+    specialize qw/av1_convolve_rounding avx2/;
 
   if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
     specialize qw/av1_highbd_convolve_2d ssse3/;
+    add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd";
+    specialize qw/av1_highbd_convolve_rounding avx2/;
   }
 }
 
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index b8cc7db..dc375da 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -309,8 +309,8 @@
 }
 
 #if CONFIG_CONVOLVE_ROUND
-void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
-                           int dst_stride, int w, int h, int bits) {
+void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h, int bits) {
   int r, c;
   for (r = 0; r < h; ++r) {
     for (c = 0; c < w; ++c) {
@@ -508,9 +508,9 @@
     for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
 }
 
-void av1_highbd_convolve_rounding(const int32_t *src, int src_stride,
-                                  uint8_t *dst8, int dst_stride, int w, int h,
-                                  int bits, int bd) {
+void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride,
+                                    uint8_t *dst8, int dst_stride, int w, int h,
+                                    int bits, int bd) {
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   int r, c;
   for (r = 0; r < h; ++r) {
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index 2df310d..e4197b4 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -77,14 +77,7 @@
   return conv_params;
 }
 
-void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
-                           int dst_stride, int w, int h, int bits);
-
 #if CONFIG_HIGHBITDEPTH
-void av1_highbd_convolve_rounding(const int32_t *src, int src_stride,
-                                  uint8_t *dst8, int dst_stride, int w, int h,
-                                  int bits, int bd);
-
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    const InterpFilter *interp_filter,
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
new file mode 100644
index 0000000..a0e5871
--- /dev/null
+++ b/av1/common/x86/convolve_avx2.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "./av1_rtcd.h"
+
+#if CONFIG_CONVOLVE_ROUND
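+// Index pattern for _mm256_permutevar8x32_epi32. The 256-bit pack
+// instructions interleave their two 128-bit lanes, and this permute
+// restores the natural pixel order after packing.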
+static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+// Clamp 16 epi16 pixels to the range [0, (1 << bd) - 1].
+static INLINE void pixel_clamp_avx2(__m256i *u, int bd) {
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+  __m256i clamped, mask;
+
+  mask = _mm256_cmpgt_epi16(*u, max);
+  clamped = _mm256_andnot_si256(mask, *u);
+  mask = _mm256_and_si256(mask, max);
+  clamped = _mm256_or_si256(mask, clamped);
+
+  const __m256i zero = _mm256_setzero_si256();
+  mask = _mm256_cmpgt_epi16(clamped, zero);
+  *u = _mm256_and_si256(clamped, mask);
+}
+
+// Clamp 8 epi16 pixels to the range [0, (1 << bd) - 1].
+static INLINE void pixel_clamp_sse2(__m128i *u, int bd) {
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+  __m128i clamped, mask;
+
+  mask = _mm_cmpgt_epi16(*u, max);
+  clamped = _mm_andnot_si128(mask, *u);
+  mask = _mm_and_si128(mask, max);
+  clamped = _mm_or_si128(mask, clamped);
+
+  const __m128i zero = _mm_setzero_si128();
+  mask = _mm_cmpgt_epi16(clamped, zero);
+  *u = _mm_and_si128(clamped, mask);
+}
+
+// Work on multiple of 32 pixels
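+// Per iteration: load 32 int32 values, add the rounding constant, shift
+// right arithmetically, pack with signed saturation to int16, clamp to the
+// 8-bit range, pack to uint8, and permute to undo the lane interleaving.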
+static INLINE void cal_rounding_32xn_avx2(const int32_t *src, uint8_t *dst,
+                                          const __m256i *rnd, int shift,
+                                          int num) {
+  do {
+    __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
+    __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
+    __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2);
+    __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3);
+
+    x0 = _mm256_add_epi32(x0, *rnd);
+    x1 = _mm256_add_epi32(x1, *rnd);
+    x2 = _mm256_add_epi32(x2, *rnd);
+    x3 = _mm256_add_epi32(x3, *rnd);
+
+    x0 = _mm256_srai_epi32(x0, shift);
+    x1 = _mm256_srai_epi32(x1, shift);
+    x2 = _mm256_srai_epi32(x2, shift);
+    x3 = _mm256_srai_epi32(x3, shift);
+
+    x0 = _mm256_packs_epi32(x0, x1);
+    x2 = _mm256_packs_epi32(x2, x3);
+
+    pixel_clamp_avx2(&x0, 8);
+    pixel_clamp_avx2(&x2, 8);
+
+    x0 = _mm256_packus_epi16(x0, x2);
+    x1 = _mm256_loadu_si256((const __m256i *)sindex);
+    x2 = _mm256_permutevar8x32_epi32(x0, x1);
+
+    _mm256_storeu_si256((__m256i *)dst, x2);
+    src += 32;
+    dst += 32;
+    num--;
+  } while (num > 0);
+}
+
+static INLINE void cal_rounding_16_avx2(const int32_t *src, uint8_t *dst,
+                                        const __m256i *rnd, int shift) {
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
+
+  x0 = _mm256_add_epi32(x0, *rnd);
+  x1 = _mm256_add_epi32(x1, *rnd);
+
+  x0 = _mm256_srai_epi32(x0, shift);
+  x1 = _mm256_srai_epi32(x1, shift);
+
+  x0 = _mm256_packs_epi32(x0, x1);
+  pixel_clamp_avx2(&x0, 8);
+
+  const __m256i x2 = _mm256_packus_epi16(x0, x0);
+  x1 = _mm256_loadu_si256((const __m256i *)sindex);
+  x0 = _mm256_permutevar8x32_epi32(x2, x1);
+
+  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x0));
+}
+
+static INLINE void cal_rounding_8_avx2(const int32_t *src, uint8_t *dst,
+                                       const __m256i *rnd, int shift) {
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
+  x0 = _mm256_add_epi32(x0, *rnd);
+  x0 = _mm256_srai_epi32(x0, shift);
+
+  x0 = _mm256_packs_epi32(x0, x0);
+  pixel_clamp_avx2(&x0, 8);
+
+  x0 = _mm256_packus_epi16(x0, x0);
+  const __m256i x1 = _mm256_loadu_si256((const __m256i *)sindex);
+  x0 = _mm256_permutevar8x32_epi32(x0, x1);
+
+  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(x0));
+}
+
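+// Width 4: only four int32 values per row, so a 128-bit SSE2 kernel suffices.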
+static INLINE void cal_rounding_4_sse2(const int32_t *src, uint8_t *dst,
+                                       const __m128i *rnd, int shift) {
+  __m128i x = _mm_loadu_si128((const __m128i *)src);
+  x = _mm_add_epi32(x, *rnd);
+  x = _mm_srai_epi32(x, shift);
+
+  x = _mm_packs_epi32(x, x);
+  pixel_clamp_sse2(&x, 8);
+
+  x = _mm_packus_epi16(x, x);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(x);
+}
+
+void av1_convolve_rounding_avx2(const int32_t *src, int src_stride,
+                                uint8_t *dst, int dst_stride, int w, int h,
+                                int bits) {
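+  // Rounding offset is 1 << (bits - 1), matching ROUND_POWER_OF_TWO in the
+  // C reference.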
+  const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
+  const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
+
+  if (w > 64) {  // width = 128
+    do {
+      cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 32) {  // width = 64
+    do {
+      cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 16) {  // width = 32
+    do {
+      cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 8) {  // width = 16
+    do {
+      cal_rounding_16_avx2(src, dst, &rnd_num, bits);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 4) {  // width = 8
+    do {
+      cal_rounding_8_avx2(src, dst, &rnd_num, bits);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 2) {  // width = 4
+    do {
+      cal_rounding_4_sse2(src, dst, &rnd_num_sse2, bits);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else {  // width = 2
+    do {
+      dst[0] = clip_pixel(ROUND_POWER_OF_TWO(src[0], bits));
+      dst[1] = clip_pixel(ROUND_POWER_OF_TWO(src[1], bits));
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  }
+}
+
+#if CONFIG_HIGHBITDEPTH
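+// The high-bitdepth kernels follow the same add/shift/pack/clamp pipeline
+// but keep 16-bit output, so _mm256_permute4x64_epi64(x, 0xD8) suffices to
+// undo the lane interleaving left by _mm256_packs_epi32.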
+static INLINE void cal_highbd_rounding_32xn_avx2(const int32_t *src,
+                                                 uint16_t *dst,
+                                                 const __m256i *rnd, int shift,
+                                                 int num, int bd) {
+  do {
+    __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
+    __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
+    __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2);
+    __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3);
+
+    x0 = _mm256_add_epi32(x0, *rnd);
+    x1 = _mm256_add_epi32(x1, *rnd);
+    x2 = _mm256_add_epi32(x2, *rnd);
+    x3 = _mm256_add_epi32(x3, *rnd);
+
+    x0 = _mm256_srai_epi32(x0, shift);
+    x1 = _mm256_srai_epi32(x1, shift);
+    x2 = _mm256_srai_epi32(x2, shift);
+    x3 = _mm256_srai_epi32(x3, shift);
+
+    x0 = _mm256_packs_epi32(x0, x1);
+    x2 = _mm256_packs_epi32(x2, x3);
+
+    pixel_clamp_avx2(&x0, bd);
+    pixel_clamp_avx2(&x2, bd);
+
+    x0 = _mm256_permute4x64_epi64(x0, 0xD8);
+    x2 = _mm256_permute4x64_epi64(x2, 0xD8);
+
+    _mm256_storeu_si256((__m256i *)dst, x0);
+    _mm256_storeu_si256((__m256i *)(dst + 16), x2);
+    src += 32;
+    dst += 32;
+    num--;
+  } while (num > 0);
+}
+
+static INLINE void cal_highbd_rounding_16_avx2(const int32_t *src,
+                                               uint16_t *dst,
+                                               const __m256i *rnd, int shift,
+                                               int bd) {
+  __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
+  __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
+
+  x0 = _mm256_add_epi32(x0, *rnd);
+  x1 = _mm256_add_epi32(x1, *rnd);
+
+  x0 = _mm256_srai_epi32(x0, shift);
+  x1 = _mm256_srai_epi32(x1, shift);
+
+  x0 = _mm256_packs_epi32(x0, x1);
+  pixel_clamp_avx2(&x0, bd);
+
+  x0 = _mm256_permute4x64_epi64(x0, 0xD8);
+  _mm256_storeu_si256((__m256i *)dst, x0);
+}
+
+static INLINE void cal_highbd_rounding_8_avx2(const int32_t *src, uint16_t *dst,
+                                              const __m256i *rnd, int shift,
+                                              int bd) {
+  __m256i x = _mm256_loadu_si256((const __m256i *)src);
+  x = _mm256_add_epi32(x, *rnd);
+  x = _mm256_srai_epi32(x, shift);
+
+  x = _mm256_packs_epi32(x, x);
+  pixel_clamp_avx2(&x, bd);
+
+  x = _mm256_permute4x64_epi64(x, 0xD8);
+  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x));
+}
+
+static INLINE void cal_highbd_rounding_4_sse2(const int32_t *src, uint16_t *dst,
+                                              const __m128i *rnd, int shift,
+                                              int bd) {
+  __m128i x = _mm_loadu_si128((const __m128i *)src);
+  x = _mm_add_epi32(x, *rnd);
+  x = _mm_srai_epi32(x, shift);
+
+  x = _mm_packs_epi32(x, x);
+  pixel_clamp_sse2(&x, bd);
+  _mm_storel_epi64((__m128i *)dst, x);
+}
+
+void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride,
+                                       uint8_t *dst8, int dst_stride, int w,
+                                       int h, int bits, int bd) {
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
+  const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
+
+  if (w > 64) {  // width = 128
+    do {
+      cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4, bd);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 32) {  // width = 64
+    do {
+      cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2, bd);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 16) {  // width = 32
+    do {
+      cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1, bd);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 8) {  // width = 16
+    do {
+      cal_highbd_rounding_16_avx2(src, dst, &rnd_num, bits, bd);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 4) {  // width = 8
+    do {
+      cal_highbd_rounding_8_avx2(src, dst, &rnd_num, bits, bd);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 2) {  // width = 4
+    do {
+      cal_highbd_rounding_4_sse2(src, dst, &rnd_num_sse2, bits, bd);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else {  // width = 2
+    do {
+      dst[0] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[0], bits), bd);
+      dst[1] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[1], bits), bd);
+      src += src_stride;
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  }
+}
+#endif  // CONFIG_HIGHBITDEPTH
+#endif  // CONFIG_CONVOLVE_ROUND
diff --git a/test/convolve_round_test.cc b/test/convolve_round_test.cc
new file mode 100644
index 0000000..8f78a53
--- /dev/null
+++ b/test/convolve_round_test.cc
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "./av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+#define CONVOLVE_ROUNDING_PARAM                                            \
+  const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, \
+      int h, int bits
+
+typedef void (*ConvolveRoundFunc)(CONVOLVE_ROUNDING_PARAM);
+
+typedef void (*ConvolveRoundFuncHbd)(CONVOLVE_ROUNDING_PARAM, int bd);
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_8(CONVOLVE_ROUNDING_PARAM) {
+  const int bd = 8;
+  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_10(CONVOLVE_ROUNDING_PARAM) {
+  const int bd = 10;
+  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_12(CONVOLVE_ROUNDING_PARAM) {
+  const int bd = 12;
+  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+typedef enum { LOWBITDEPTH_TEST, HIGHBITDEPTH_TEST } DataPathType;
+
+using std::tr1::tuple;
+
+typedef tuple<ConvolveRoundFunc, ConvolveRoundFunc, DataPathType>
+    ConvolveRoundParam;
+
+const int kTestNum = 5000;
+
+class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
+ protected:
+  ConvolveRoundTest()
+      : func_ref_(GET_PARAM(0)), func_(GET_PARAM(1)), data_path_(GET_PARAM(2)) {
+  }
+  virtual ~ConvolveRoundTest() {}
+
+  virtual void SetUp() {
+    const size_t block_size = 128 * 128;
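+    // Single allocation: block_size int32 elements for src_, followed by two
+    // block_size uint16_t regions for dst_ref_ and dst_.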
+    src_ = reinterpret_cast<int32_t *>(
+        aom_memalign(16, 3 * block_size * sizeof(int32_t)));
+    dst_ref_ = reinterpret_cast<uint16_t *>(src_ + block_size);
+    dst_ = dst_ref_ + block_size;
+  }
+
+  virtual void TearDown() { aom_free(src_); }
+
+  void ConvolveRoundingRun() {
+    int test_num = 0;
+    const int src_stride = 128;
+    const int dst_stride = 128;
+    int bits = 13;
+    uint8_t *dst = 0;
+    uint8_t *dst_ref = 0;
+    int diff_wide;
+
+    if (data_path_ == LOWBITDEPTH_TEST) {
+      dst = reinterpret_cast<uint8_t *>(dst_);
+      dst_ref = reinterpret_cast<uint8_t *>(dst_ref_);
+#if CONFIG_HIGHBITDEPTH
+    } else if (data_path_ == HIGHBITDEPTH_TEST) {
+      dst = CONVERT_TO_BYTEPTR(dst_);
+      dst_ref = CONVERT_TO_BYTEPTR(dst_ref_);
+#endif
+    } else {
+      assert(0);
+    }
+
+    while (test_num < kTestNum) {
+      int block_size = test_num % BLOCK_SIZES_ALL;
+      int w = block_size_wide[block_size];
+      int h = block_size_high[block_size];
+
+      if (test_num % 2 == 0)
+        bits -= 1;
+      else
+        bits += 1;
+
+      GenerateBufferWithRandom(src_, src_stride, bits, w, h);
+
+      func_ref_(src_, src_stride, dst_ref, dst_stride, w, h, bits);
+      func_(src_, src_stride, dst, dst_stride, w, h, bits);
+
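+      // dst_ and dst_ref_ are uint16_t buffers. In the low-bitdepth path
+      // they hold packed 8-bit pixels, so each uint16_t comparison covers
+      // two pixels and the compared width is halved.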
+      diff_wide = w;
+      if (data_path_ == LOWBITDEPTH_TEST) diff_wide >>= 1;
+      for (int r = 0; r < h; ++r) {
+        for (int c = 0; c < diff_wide; ++c) {
+          ASSERT_EQ(dst_ref_[r * dst_stride + c], dst_[r * dst_stride + c])
+              << "Mismatch at r: " << r << " c: " << c << " test: " << test_num;
+        }
+      }
+
+      test_num++;
+    }
+  }
+
+  void GenerateBufferWithRandom(int32_t *src, int src_stride, int bits, int w,
+                                int h) {
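+    // Keep values below 1 << (bits + 9): after rounding and shifting they
+    // reach roughly 1 << 9, which exercises the upper clamp in the 8-bit
+    // path.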
+    int32_t number;
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < w; ++c) {
+        number = static_cast<int32_t>(rand_.Rand31());
+        number %= 1 << (bits + 9);
+        src[r * src_stride + c] = number;
+      }
+    }
+  }
+
+  ACMRandom rand_;
+  int32_t *src_;
+  uint16_t *dst_ref_;
+  uint16_t *dst_;
+
+  ConvolveRoundFunc func_ref_;
+  ConvolveRoundFunc func_;
+  DataPathType data_path_;
+};
+
+TEST_P(ConvolveRoundTest, BitExactCheck) { ConvolveRoundingRun(); }
+
+using std::tr1::make_tuple;
+
+#if HAVE_AVX2
+const ConvolveRoundParam kConvRndParamArray[] = {
+  make_tuple(&av1_convolve_rounding_c, &av1_convolve_rounding_avx2,
+             LOWBITDEPTH_TEST),
+  make_tuple(&highbd_convolve_rounding_8<av1_highbd_convolve_rounding_c>,
+             &highbd_convolve_rounding_8<av1_highbd_convolve_rounding_avx2>,
+             HIGHBITDEPTH_TEST),
+  make_tuple(&highbd_convolve_rounding_10<av1_highbd_convolve_rounding_c>,
+             &highbd_convolve_rounding_10<av1_highbd_convolve_rounding_avx2>,
+             HIGHBITDEPTH_TEST),
+  make_tuple(&highbd_convolve_rounding_12<av1_highbd_convolve_rounding_c>,
+             &highbd_convolve_rounding_12<av1_highbd_convolve_rounding_avx2>,
+             HIGHBITDEPTH_TEST)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, ConvolveRoundTest,
+                        ::testing::ValuesIn(kConvRndParamArray));
+#endif  // HAVE_AVX2
+}  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 7ed9167..c0946c1 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -204,8 +204,9 @@
           ${AOM_UNIT_TEST_ENCODER_SOURCES}
           "${AOM_ROOT}/test/av1_convolve_2d_test.cc"
           "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc"
-          "${AOM_ROOT}/test/av1_convolve_2d_test_util.h")
-    endif ()
+          "${AOM_ROOT}/test/av1_convolve_2d_test_util.h"
+          "${AOM_ROOT}/test/convolve_round_test.cc")
+    endif ()
 
     if (CONFIG_EXT_INTER)
       set(AOM_UNIT_TEST_ENCODER_SOURCES
diff --git a/test/test.mk b/test/test.mk
index b936c63..05e36c3 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -248,6 +248,7 @@
 LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test_util.h
 LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test.cc
 LIBAOM_TEST_SRCS-$(HAVE_SSE2) += av1_convolve_2d_test_util.cc
+LIBAOM_TEST_SRCS-yes          += convolve_round_test.cc
 endif
 
 ifeq ($(CONFIG_GLOBAL_MOTION)$(CONFIG_AV1_ENCODER),yesyes)