Fix valgrind issue in avx2 dr prediction z2

BUG=aomedia:2316

Change-Id: Ic9b6798b637765f148b0120cc7b2661126e0b250
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index a406db9..17f35a0 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -3034,9 +3034,9 @@
   __m256i diff, c3f;
   __m128i a_mbase_x;
 
-  a16 = _mm256_set1_epi32(16);
+  a16 = _mm256_set1_epi16(16);
   a_mbase_x = _mm_set1_epi8(above[max_base_x]);
-  c3f = _mm256_set1_epi32(0x3f);
+  c3f = _mm256_set1_epi16(0x3f);
 
   x = dx;
   for (int r = 0; r < N; r++) {
@@ -3054,31 +3054,32 @@
     if (base_max_diff > 4) base_max_diff = 4;
     a0_128 = _mm_loadu_si128((__m128i *)(above + base));
     a1_128 = _mm_srli_si128(a0_128, 1);
-    a0 = _mm256_cvtepu8_epi32(a0_128);
-    a1 = _mm256_cvtepu8_epi32(a1_128);
 
     if (upsample_above) {
-      a0 = _mm256_permutevar8x32_epi32(
-          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
-      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
-      shift = _mm256_srli_epi32(
+      a0_128 = _mm_shuffle_epi8(
+          a0_128,
+          _mm_setr_epi8(0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15));
+      a1_128 = _mm_srli_si128(a0_128, 4);
+
+      shift = _mm256_srli_epi16(
           _mm256_and_si256(
-              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), c3f),
+              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
           1);
     } else {
-      shift = _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
     }
+    a0 = _mm256_cvtepu8_epi16(a0_128);
+    a1 = _mm256_cvtepu8_epi16(a1_128);
 
-    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
-    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
-    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
+    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
+    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
+    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
 
-    b = _mm256_mullo_epi32(diff, shift);
-    res = _mm256_add_epi32(a32, b);
-    res = _mm256_srli_epi32(res, 5);
+    b = _mm256_mullo_epi16(diff, shift);
+    res = _mm256_add_epi16(a32, b);
+    res = _mm256_srli_epi16(res, 5);
 
     res1 = _mm256_castsi256_si128(res);
-    res1 = _mm_packus_epi32(res1, res1);
     res1 = _mm_packus_epi16(res1, res1);
 
     dst[r] =