rtc: Add AVX2 variant for functions related to motion search

This CL Adds AVX2 for aom_int_pro_row() and aom_int_pro_col()
functions. Also, refactored the existing code to make it AVX2
friendly.

The overall encode time reduction for RT preset is listed below
       Encode_time
cpu    Reduction(%)
 7       1.150
 8       1.331

Change-Id: Idebe6dd72933674148bcc41785f9b42d93dc2f11
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index d672d77..94f15c4 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1188,11 +1188,11 @@
     add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   }
 
-  add_proto qw/void aom_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height";
-  specialize qw/aom_int_pro_row sse2 neon/;
+  add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
+  specialize qw/aom_int_pro_row avx2 sse2 neon/;
 
-  add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
-  specialize qw/aom_int_pro_col sse2 neon/;
+  add_proto qw/void aom_int_pro_col/, "int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
+  specialize qw/aom_int_pro_col avx2 sse2 neon/;
 
   add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
   specialize qw/aom_vector_var sse4_1 neon/;
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index 593807b..2959c97 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -89,58 +89,55 @@
   return horizontal_add_s32x4(accum);
 }
 
-void aom_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref,
-                          const int ref_stride, const int height) {
-  int i;
+void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
   const uint8_t *idx = ref;
-  uint16x8_t vec0 = vdupq_n_u16(0);
-  uint16x8_t vec1 = vec0;
-  uint8x16_t tmp;
+  const uint16x8_t zero = vdupq_n_u16(0);
+  const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
 
-  for (i = 0; i < height; ++i) {
-    tmp = vld1q_u8(idx);
-    idx += ref_stride;
-    vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
-    vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+  for (int wd = 0; wd < width; wd += 16) {
+    uint16x8_t vec0 = zero;
+    uint16x8_t vec1 = zero;
+    idx = ref + wd;
+    for (int ht = 0; ht < height; ++ht) {
+      const uint8x16_t tmp = vld1q_u8(idx);
+      idx += ref_stride;
+      vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
+      vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+    }
+
+    const int16x8_t result0 =
+        vshlq_s16(vreinterpretq_s16_u16(vec0), neg_norm_factor);
+    const int16x8_t result1 =
+        vshlq_s16(vreinterpretq_s16_u16(vec1), neg_norm_factor);
+
+    vst1q_s16(hbuf + wd, result0);
+    vst1q_s16(hbuf + wd + 8, result1);
   }
-
-  if (128 == height) {
-    vec0 = vshrq_n_u16(vec0, 6);
-    vec1 = vshrq_n_u16(vec1, 6);
-  } else if (64 == height) {
-    vec0 = vshrq_n_u16(vec0, 5);
-    vec1 = vshrq_n_u16(vec1, 5);
-  } else if (32 == height) {
-    vec0 = vshrq_n_u16(vec0, 4);
-    vec1 = vshrq_n_u16(vec1, 4);
-  } else if (16 == height) {
-    vec0 = vshrq_n_u16(vec0, 3);
-    vec1 = vshrq_n_u16(vec1, 3);
-  }
-
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec0));
-  hbuf += 8;
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec1));
 }
 
-int16_t aom_int_pro_col_neon(const uint8_t *ref, const int width) {
-  const uint8_t *idx;
-  uint16x8_t sum = vdupq_n_u16(0);
-
-  for (idx = ref; idx < (ref + width); idx += 16) {
-    uint8x16_t vec = vld1q_u8(idx);
-    sum = vaddq_u16(sum, vpaddlq_u8(vec));
-  }
+void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
+  for (int ht = 0; ht < height; ++ht) {
+    uint16x8_t sum = vdupq_n_u16(0);
+    for (int wd = 0; wd < width; wd += 16) {
+      const uint8x16_t vec = vld1q_u8(ref + wd);
+      sum = vaddq_u16(sum, vpaddlq_u8(vec));
+    }
 
 #if defined(__aarch64__)
-  return (int16_t)vaddvq_u16(sum);
+    vbuf[ht] = ((int16_t)vaddvq_u16(sum)) >> norm_factor;
 #else
-  const uint32x4_t a = vpaddlq_u16(sum);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return (int16_t)vget_lane_u32(c, 0);
+    const uint32x4_t a = vpaddlq_u16(sum);
+    const uint64x2_t b = vpaddlq_u32(a);
+    const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                  vreinterpret_u32_u64(vget_high_u64(b)));
+    vbuf[ht] = ((int16_t)vget_lane_u32(c, 0)) >> norm_factor;
 #endif
+    ref += ref_stride;
+  }
 }
 
 // coeff: 16 bits, dynamic range [-32640, 32640].
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index a3821e6..920f08d 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -507,29 +507,29 @@
 
 // Integer projection onto row vectors.
 // height: value range {16, 32, 64, 128}.
-void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
-                       const int ref_stride, const int height) {
-  int idx;
-  const int norm_factor = height >> 1;
+void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
+                       const int width, const int height, int norm_factor) {
   assert(height >= 2);
-  for (idx = 0; idx < 16; ++idx) {
-    int i;
+  for (int idx = 0; idx < width; ++idx) {
     hbuf[idx] = 0;
     // hbuf[idx]: 14 bit, dynamic range [0, 32640].
-    for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
+    for (int i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
     // hbuf[idx]: 9 bit, dynamic range [0, 1020].
-    hbuf[idx] /= norm_factor;
+    hbuf[idx] >>= norm_factor;
     ++ref;
   }
 }
 
 // width: value range {16, 32, 64, 128}.
-int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) {
-  int idx;
-  int16_t sum = 0;
-  // sum: 14 bit, dynamic range [0, 32640]
-  for (idx = 0; idx < width; ++idx) sum += ref[idx];
-  return sum;
+void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride,
+                       const int width, const int height, int norm_factor) {
+  for (int ht = 0; ht < height; ++ht) {
+    int16_t sum = 0;
+    // sum: 14 bit, dynamic range [0, 32640]
+    for (int idx = 0; idx < width; ++idx) sum += ref[idx];
+    vbuf[ht] = sum >> norm_factor;
+    ref += ref_stride;
+  }
 }
 
 // ref: [0 - 510]
diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c
index 6c8db3a..9bfc06b 100644
--- a/aom_dsp/x86/avg_intrin_avx2.c
+++ b/aom_dsp/x86/avg_intrin_avx2.c
@@ -552,3 +552,120 @@
   avg[2] = _mm_extract_epi16(_mm256_castsi256_si128(result_0), 4);
   avg[3] = _mm_extract_epi16(_mm256_extracti128_si256(result_0, 1), 4);
 }
+
+void aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
+  // SIMD implementation assumes width and height to be multiple of 16 and 2
+  // respectively. For any odd width or height, SIMD support needs to be added.
+  assert(width % 16 == 0 && height % 2 == 0);
+
+  if (width % 32 == 0) {
+    const __m256i zero = _mm256_setzero_si256();
+    for (int wd = 0; wd < width; wd += 32) {
+      const uint8_t *ref_tmp = ref + wd;
+      int16_t *hbuf_tmp = hbuf + wd;
+      __m256i s0 = zero;
+      __m256i s1 = zero;
+      int idx = 0;
+      do {
+        __m256i src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
+        __m256i t0 = _mm256_unpacklo_epi8(src_line, zero);
+        __m256i t1 = _mm256_unpackhi_epi8(src_line, zero);
+        s0 = _mm256_adds_epu16(s0, t0);
+        s1 = _mm256_adds_epu16(s1, t1);
+        ref_tmp += ref_stride;
+
+        src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
+        t0 = _mm256_unpacklo_epi8(src_line, zero);
+        t1 = _mm256_unpackhi_epi8(src_line, zero);
+        s0 = _mm256_adds_epu16(s0, t0);
+        s1 = _mm256_adds_epu16(s1, t1);
+        ref_tmp += ref_stride;
+        idx += 2;
+      } while (idx < height);
+      s0 = _mm256_srai_epi16(s0, norm_factor);
+      s1 = _mm256_srai_epi16(s1, norm_factor);
+      _mm_storeu_si128((__m128i *)(hbuf_tmp), _mm256_castsi256_si128(s0));
+      _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), _mm256_castsi256_si128(s1));
+      _mm_storeu_si128((__m128i *)(hbuf_tmp + 16),
+                       _mm256_extractf128_si256(s0, 1));
+      _mm_storeu_si128((__m128i *)(hbuf_tmp + 24),
+                       _mm256_extractf128_si256(s1, 1));
+    }
+  } else if (width % 16 == 0) {
+    aom_int_pro_row_sse2(hbuf, ref, ref_stride, width, height, norm_factor);
+  }
+}
+
+void aom_int_pro_col_avx2(int16_t *vbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
+  // SIMD implementation assumes width to be multiple of 16. For any odd width,
+  // SIMD support needs to be added.
+  assert(width % 16 == 0);
+
+  if (width == 128) {
+    const __m256i zero = _mm256_setzero_si256();
+    for (int ht = 0; ht < height; ++ht) {
+      const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
+      const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+      const __m256i src_line2 = _mm256_loadu_si256((const __m256i *)(ref + 64));
+      const __m256i src_line3 = _mm256_loadu_si256((const __m256i *)(ref + 96));
+      const __m256i s0 = _mm256_sad_epu8(src_line0, zero);
+      const __m256i s1 = _mm256_sad_epu8(src_line1, zero);
+      const __m256i s2 = _mm256_sad_epu8(src_line2, zero);
+      const __m256i s3 = _mm256_sad_epu8(src_line3, zero);
+      const __m256i result0_256bit = _mm256_adds_epu16(s0, s1);
+      const __m256i result1_256bit = _mm256_adds_epu16(s2, s3);
+      const __m256i result_256bit =
+          _mm256_adds_epu16(result0_256bit, result1_256bit);
+
+      const __m128i result =
+          _mm_adds_epu16(_mm256_castsi256_si128(result_256bit),
+                         _mm256_extractf128_si256(result_256bit, 1));
+      __m128i result1 = _mm_adds_epu16(result, _mm_srli_si128(result, 8));
+      vbuf[ht] = _mm_extract_epi16(result1, 0) >> norm_factor;
+      ref += ref_stride;
+    }
+  } else if (width == 64) {
+    const __m256i zero = _mm256_setzero_si256();
+    for (int ht = 0; ht < height; ++ht) {
+      const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
+      const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+      const __m256i s1 = _mm256_sad_epu8(src_line0, zero);
+      const __m256i s2 = _mm256_sad_epu8(src_line1, zero);
+      const __m256i result_256bit = _mm256_adds_epu16(s1, s2);
+
+      const __m128i result =
+          _mm_adds_epu16(_mm256_castsi256_si128(result_256bit),
+                         _mm256_extractf128_si256(result_256bit, 1));
+      __m128i result1 = _mm_adds_epu16(result, _mm_srli_si128(result, 8));
+      vbuf[ht] = _mm_extract_epi16(result1, 0) >> norm_factor;
+      ref += ref_stride;
+    }
+  } else if (width == 32) {
+    assert(height % 2 == 0);
+    const __m256i zero = _mm256_setzero_si256();
+    for (int ht = 0; ht < height; ht += 2) {
+      const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
+      const __m256i src_line1 =
+          _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+      const __m256i s0 = _mm256_sad_epu8(src_line0, zero);
+      const __m256i s1 = _mm256_sad_epu8(src_line1, zero);
+
+      __m128i result0 = _mm_adds_epu16(_mm256_castsi256_si128(s0),
+                                       _mm256_extractf128_si256(s0, 1));
+      __m128i result1 = _mm_adds_epu16(_mm256_castsi256_si128(s1),
+                                       _mm256_extractf128_si256(s1, 1));
+      __m128i result2 = _mm_adds_epu16(result0, _mm_srli_si128(result0, 8));
+      __m128i result3 = _mm_adds_epu16(result1, _mm_srli_si128(result1, 8));
+
+      vbuf[ht] = _mm_extract_epi16(result2, 0) >> norm_factor;
+      vbuf[ht + 1] = _mm_extract_epi16(result3, 0) >> norm_factor;
+      ref += (2 * ref_stride);
+    }
+  } else if (width == 16) {
+    aom_int_pro_col_sse2(vbuf, ref, ref_stride, width, height, norm_factor);
+  }
+}
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index bdbd1f6..8e89555 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -625,74 +625,64 @@
   return _mm_cvtsi128_si32(accum);
 }
 
-void aom_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
-                          const int ref_stride, const int height) {
-  int idx = 1;
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
+  // SIMD implementation assumes width and height to be multiple of 16 and 2
+  // respectively. For any odd width or height, SIMD support needs to be added.
+  assert(width % 16 == 0 && height % 2 == 0);
   __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
-  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
-  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
-  __m128i t0, t1;
-  int height_1 = height - 1;
-  ref += ref_stride;
-  do {
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
 
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-    idx += 2;
-  } while (idx < height_1);
+  for (int wd = 0; wd < width; wd += 16) {
+    const uint8_t *ref_tmp = ref + wd;
+    int16_t *hbuf_tmp = hbuf + wd;
+    __m128i s0 = zero;
+    __m128i s1 = zero;
+    int idx = 0;
+    do {
+      __m128i src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+      __m128i t0 = _mm_unpacklo_epi8(src_line, zero);
+      __m128i t1 = _mm_unpackhi_epi8(src_line, zero);
+      s0 = _mm_adds_epu16(s0, t0);
+      s1 = _mm_adds_epu16(s1, t1);
+      ref_tmp += ref_stride;
 
-  src_line = _mm_loadu_si128((const __m128i *)ref);
-  t0 = _mm_unpacklo_epi8(src_line, zero);
-  t1 = _mm_unpackhi_epi8(src_line, zero);
-  s0 = _mm_adds_epu16(s0, t0);
-  s1 = _mm_adds_epu16(s1, t1);
-  if (height == 128) {
-    s0 = _mm_srai_epi16(s0, 6);
-    s1 = _mm_srai_epi16(s1, 6);
-  } else if (height == 64) {
-    s0 = _mm_srai_epi16(s0, 5);
-    s1 = _mm_srai_epi16(s1, 5);
-  } else if (height == 32) {
-    s0 = _mm_srai_epi16(s0, 4);
-    s1 = _mm_srai_epi16(s1, 4);
-  } else {
-    assert(height == 16);
-    s0 = _mm_srai_epi16(s0, 3);
-    s1 = _mm_srai_epi16(s1, 3);
+      src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+      t0 = _mm_unpacklo_epi8(src_line, zero);
+      t1 = _mm_unpackhi_epi8(src_line, zero);
+      s0 = _mm_adds_epu16(s0, t0);
+      s1 = _mm_adds_epu16(s1, t1);
+      ref_tmp += ref_stride;
+      idx += 2;
+    } while (idx < height);
+
+    s0 = _mm_srai_epi16(s0, norm_factor);
+    s1 = _mm_srai_epi16(s1, norm_factor);
+    _mm_storeu_si128((__m128i *)(hbuf_tmp), s0);
+    _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), s1);
   }
-
-  _mm_storeu_si128((__m128i *)hbuf, s0);
-  hbuf += 8;
-  _mm_storeu_si128((__m128i *)hbuf, s1);
 }
 
-int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
-  __m128i s0 = _mm_sad_epu8(src_line, zero);
-  __m128i s1;
-  int i;
+void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
+  // SIMD implementation assumes width to be multiple of 16.
+  assert(width % 16 == 0);
 
-  for (i = 16; i < width; i += 16) {
-    ref += 16;
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    s1 = _mm_sad_epu8(src_line, zero);
+  for (int ht = 0; ht < height; ht++) {
+    const uint8_t *ref_tmp = ref + (ht * ref_stride);
+    __m128i zero = _mm_setzero_si128();
+    __m128i s0 = zero;
+    __m128i s1, src_line;
+    for (int i = 0; i < width; i += 16) {
+      src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+      s1 = _mm_sad_epu8(src_line, zero);
+      s0 = _mm_adds_epu16(s0, s1);
+      ref_tmp += 16;
+    }
+
+    s1 = _mm_srli_si128(s0, 8);
     s0 = _mm_adds_epu16(s0, s1);
+    vbuf[ht] = _mm_extract_epi16(s0, 0) >> norm_factor;
   }
-
-  s1 = _mm_srli_si128(s0, 8);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  return _mm_extract_epi16(s0, 0);
 }
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 395e350..ff48fc3 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -1946,7 +1946,8 @@
   uint8_t const *ref_buf, *src_buf;
   int_mv *best_int_mv = &xd->mi[0]->mv[0];
   unsigned int best_sad, tmp_sad, this_sad[4];
-  const int norm_factor = 3 + (bw >> 5);
+  const int row_norm_factor = mi_size_high_log2[bsize] + 1;
+  const int col_norm_factor = 3 + (bw >> 5);
   const YV12_BUFFER_CONFIG *scaled_ref_frame =
       av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
   static const MV search_pos[4] = {
@@ -1981,28 +1982,16 @@
 
   // Set up prediction 1-D reference set
   ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
-  for (idx = 0; idx < search_width; idx += 16) {
-    aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
-    ref_buf += 16;
-  }
+  aom_int_pro_row(hbuf, ref_buf, ref_stride, search_width, bh, row_norm_factor);
 
   ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
-  for (idx = 0; idx < search_height; ++idx) {
-    vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
-    ref_buf += ref_stride;
-  }
+  aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, search_height,
+                  col_norm_factor);
 
   // Set up src 1-D reference set
-  for (idx = 0; idx < bw; idx += 16) {
-    src_buf = x->plane[0].src.buf + idx;
-    aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
-  }
-
   src_buf = x->plane[0].src.buf;
-  for (idx = 0; idx < bh; ++idx) {
-    src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
-    src_buf += src_stride;
-  }
+  aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor);
+  aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor);
 
   // Find the best match per 1-D search
   best_int_mv->as_fullmv.col =
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 93f4c34..bcbf1b9 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -43,7 +43,7 @@
  protected:
   // Handle blocks up to 4 blocks 64x64 with stride up to 128
   static const int kDataAlignment = 16;
-  static const int kDataBlockSize = 64 * 128;
+  static const int kDataBlockSize = 128 * 128;
 
   virtual void SetUp() {
     const testing::TestInfo *const test_info =
@@ -343,20 +343,32 @@
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
-typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
-                              const int ref_stride, const int height);
+typedef void (*IntProRowFunc)(int16_t *hbuf, uint8_t const *ref,
+                              const int ref_stride, const int width,
+                              const int height, int norm_factor);
 
-// Params: height, asm function, c function.
-typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProRowFunc, IntProRowFunc> IntProRowParam;
 
 class IntProRowTest : public AverageTestBase<uint8_t>,
                       public ::testing::WithParamInterface<IntProRowParam> {
  public:
   IntProRowTest()
-      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(nullptr),
+      : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), hbuf_asm_(nullptr),
         hbuf_c_(nullptr) {
-    asm_func_ = GET_PARAM(1);
-    c_func_ = GET_PARAM(2);
+    asm_func_ = GET_PARAM(2);
+    c_func_ = GET_PARAM(3);
+  }
+
+  void set_norm_factor() {
+    if (height_ == 128)
+      norm_factor_ = 6;
+    else if (height_ == 64)
+      norm_factor_ = 5;
+    else if (height_ == 32)
+      norm_factor_ = 4;
+    else if (height_ == 16)
+      norm_factor_ = 3;
   }
 
  protected:
@@ -366,10 +378,10 @@
     ASSERT_NE(source_data_, nullptr);
 
     hbuf_asm_ = static_cast<int16_t *>(
-        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * width_));
     ASSERT_NE(hbuf_asm_, nullptr);
     hbuf_c_ = static_cast<int16_t *>(
-        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * width_));
     ASSERT_NE(hbuf_c_, nullptr);
   }
 
@@ -383,19 +395,24 @@
   }
 
   void RunComparison() {
-    API_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
-    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+    set_norm_factor();
+    API_REGISTER_STATE_CHECK(
+        c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+    API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, width_, width_,
+                                       height_, norm_factor_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
         << "Output mismatch\n";
   }
 
   void RunSpeedTest() {
     const int numIter = 5000000;
-    printf("Height = %d number of iteration is %d \n", height_, numIter);
+    set_norm_factor();
+    printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+           numIter);
     aom_usec_timer c_timer_;
     aom_usec_timer_start(&c_timer_);
     for (int i = 0; i < numIter; i++) {
-      c_func_(hbuf_c_, source_data_, 0, height_);
+      c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_);
     }
     aom_usec_timer_mark(&c_timer_);
 
@@ -403,7 +420,7 @@
     aom_usec_timer_start(&asm_timer_);
 
     for (int i = 0; i < numIter; i++) {
-      asm_func_(hbuf_asm_, source_data_, 0, height_);
+      asm_func_(hbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
     }
     aom_usec_timer_mark(&asm_timer_);
 
@@ -415,7 +432,7 @@
            asm_sum_time,
            (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
 
-    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
         << "Output mismatch\n";
   }
 
@@ -424,35 +441,68 @@
   IntProRowFunc c_func_;
   int16_t *hbuf_asm_;
   int16_t *hbuf_c_;
+  int norm_factor_;
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
 
-typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+typedef void (*IntProColFunc)(int16_t *vbuf, uint8_t const *ref,
+                              const int ref_stride, const int width,
+                              const int height, int norm_factor);
 
-// Params: width, asm function, c function.
-typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProColFunc, IntProColFunc> IntProColParam;
 
 class IntProColTest : public AverageTestBase<uint8_t>,
                       public ::testing::WithParamInterface<IntProColParam> {
  public:
-  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
-    asm_func_ = GET_PARAM(1);
-    c_func_ = GET_PARAM(2);
+  IntProColTest()
+      : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), vbuf_asm_(nullptr),
+        vbuf_c_(nullptr) {
+    asm_func_ = GET_PARAM(2);
+    c_func_ = GET_PARAM(3);
   }
 
  protected:
+  virtual void SetUp() {
+    source_data_ = static_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_NE(source_data_, nullptr);
+
+    vbuf_asm_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*vbuf_asm_) * width_));
+    ASSERT_NE(vbuf_asm_, nullptr);
+    vbuf_c_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*vbuf_c_) * width_));
+    ASSERT_NE(vbuf_c_, nullptr);
+  }
+
+  virtual void TearDown() {
+    aom_free(source_data_);
+    source_data_ = nullptr;
+    aom_free(vbuf_c_);
+    vbuf_c_ = nullptr;
+    aom_free(vbuf_asm_);
+    vbuf_asm_ = nullptr;
+  }
+
   void RunComparison() {
-    API_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
-    API_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
-    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+    int norm_factor_ = 3 + (width_ >> 5);
+    API_REGISTER_STATE_CHECK(
+        c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+    API_REGISTER_STATE_CHECK(asm_func_(vbuf_asm_, source_data_, width_, width_,
+                                       height_, norm_factor_));
+    EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+        << "Output mismatch\n";
   }
   void RunSpeedTest() {
     const int numIter = 5000000;
-    printf("Width = %d number of iteration is %d \n", width_, numIter);
+    printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+           numIter);
+    int norm_factor_ = 3 + (width_ >> 5);
     aom_usec_timer c_timer_;
     aom_usec_timer_start(&c_timer_);
     for (int i = 0; i < numIter; i++) {
-      sum_c_ = c_func_(source_data_, width_);
+      c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor_);
     }
     aom_usec_timer_mark(&c_timer_);
 
@@ -460,7 +510,7 @@
     aom_usec_timer_start(&asm_timer_);
 
     for (int i = 0; i < numIter; i++) {
-      sum_asm_ = asm_func_(source_data_, width_);
+      asm_func_(vbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
     }
     aom_usec_timer_mark(&asm_timer_);
 
@@ -472,14 +522,15 @@
            asm_sum_time,
            (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
 
-    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch \n";
+    EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+        << "Output mismatch\n";
   }
 
  private:
   IntProColFunc asm_func_;
   IntProColFunc c_func_;
-  int16_t sum_asm_;
-  int16_t sum_c_;
+  int16_t *vbuf_asm_;
+  int16_t *vbuf_c_;
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
 
@@ -703,19 +754,19 @@
 
 INSTANTIATE_TEST_SUITE_P(
     SSE2, IntProRowTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(128, &aom_int_pro_row_sse2,
-                                 &aom_int_pro_row_c)));
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+        make_tuple(32, 32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+        make_tuple(64, 64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+        make_tuple(128, 128, &aom_int_pro_row_sse2, &aom_int_pro_row_c)));
 
 INSTANTIATE_TEST_SUITE_P(
     SSE2, IntProColTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(128, &aom_int_pro_col_sse2,
-                                 &aom_int_pro_col_c)));
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+        make_tuple(32, 32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+        make_tuple(64, 64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+        make_tuple(128, 128, &aom_int_pro_col_sse2, &aom_int_pro_col_c)));
 #endif
 
 #if HAVE_AVX2
@@ -724,6 +775,22 @@
     ::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_avx2),
                       make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_avx2),
                       make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, IntProRowTest,
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+        make_tuple(32, 32, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+        make_tuple(64, 64, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+        make_tuple(128, 128, &aom_int_pro_row_avx2, &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, IntProColTest,
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+        make_tuple(32, 32, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+        make_tuple(64, 64, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+        make_tuple(128, 128, &aom_int_pro_col_avx2, &aom_int_pro_col_c)));
 #endif
 
 #if HAVE_NEON
@@ -737,19 +804,19 @@
                       make_tuple(32, 32, 8, 15, 4, &aom_avg_4x4_neon)));
 INSTANTIATE_TEST_SUITE_P(
     NEON, IntProRowTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(128, &aom_int_pro_row_neon,
-                                 &aom_int_pro_row_c)));
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+        make_tuple(32, 32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+        make_tuple(64, 64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+        make_tuple(128, 128, &aom_int_pro_row_neon, &aom_int_pro_row_c)));
 
 INSTANTIATE_TEST_SUITE_P(
     NEON, IntProColTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(128, &aom_int_pro_col_neon,
-                                 &aom_int_pro_col_c)));
+    ::testing::Values(
+        make_tuple(16, 16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+        make_tuple(32, 32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+        make_tuple(64, 64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+        make_tuple(128, 128, &aom_int_pro_col_neon, &aom_int_pro_col_c)));
 
 INSTANTIATE_TEST_SUITE_P(
     NEON, AvgTest8bpp_avg_8x8_quad,