Faster AVX2 X SR convolve function for smaller block widths.

Added specialized code for block widths less than 16 samples
in the function av1_convolve_x_sr_avx2. About 1.5x faster
in speed tests on blocks narrower than 16 samples.

Change-Id: I54f795db707557cafee1c5ac40c03d90bcadf8f3
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 56046c7..83fd382 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -754,13 +754,14 @@
          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
   assert(conv_params->round_0 > 0);
 
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 16) {
-      // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18
-      // 19 20 21 22 23
-      const __m256i data = _mm256_inserti128_si256(
-          _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
-          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), 1);
+  if (w <= 8) {
+    for (i = 0; i < h; i += 2) {
+      const __m256i data = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+          _mm256_castsi128_si256(_mm_loadu_si128(
+              (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+          0x20);
 
       __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
 
@@ -774,22 +775,48 @@
       // 8 bit conversion and saturation to uint8
       __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
 
-      // Store values into the destination buffer
-      if (w - j > 8) {
+      const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+      const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+      if (w > 4) {
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+      } else if (w > 2) {
+        xx_storel_32(&dst[i * dst_stride], res_0);
+        xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+      } else {
+        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+        *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+        *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+      }
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 16) {
+        // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18
+        // 19 20 21 22 23
+        const __m256i data = _mm256_inserti128_si256(
+            _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+            1);
+
+        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                   round_0_shift);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+                                   round_shift);
+
+        /* rounding code */
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+        // Store values into the destination buffer
         // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
         res_8b = _mm256_permute4x64_epi64(res_8b, 216);
         __m128i res = _mm256_castsi256_si128(res_8b);
         _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
-      } else {
-        __m128i res = _mm256_castsi256_si128(res_8b);
-        if (w - j > 4) {
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
-        } else if (w - j > 2) {
-          xx_storel_32(&dst[i * dst_stride + j], res);
-        } else {
-          __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-          *(uint16_t *)p = _mm_cvtsi128_si32(res);
-        }
       }
     }
   }