Speed up dr_prediction_z2_HxW_avx2

The z2 gather previously fetched all 16 left[] samples with scalar loads.
This patch detects when the 16 base_y indices span fewer than 16 bytes and
serves them with a single masked 16-byte load plus a byte shuffle, keeping
the scalar gather as a fallback for wider spans.

Profiled with the command below:
aomenc park_joy.yuv -w 1920 -h 1080 --i420 -b 8 --cpu-used=3 --cq-level=32 --limit=50 -p 1 -o parkjoy.iv

before the patch:
     1.66%     1.65%  aomenc   aomenc               [.] av1_dr_prediction_z2_avx2
            |
             --1.65%--build_intra_predictors
                       av1_dr_prediction_z2_avx2
with the patch:
     0.59%     0.59%  aomenc   aomenc            [.] av1_dr_prediction_z2_avx2
            |
             --0.58%--build_intra_predictors
                       av1_dr_prediction_z2_avx2

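For reference, a minimal standalone sketch of the new fast path (illustrative
names, not the patch itself): when max_y - min_y < 16, all 16 left[] samples
sit in one 16-byte window starting at left + min_y, so a dword-granular
masked load plus a pshufb replaces the 16 scalar loads of the
_mm256_setr_epi16 gather:

  #include <immintrin.h>
  #include <stdint.h>

  // Hypothetical helper mirroring the fast path: base_y[16] holds the
  // clamped indices, min_y their minimum, mask one row of LoadMaskz2.
  static __m256i gather_left_narrow(const uint8_t *left, const int16_t *base_y,
                                    int16_t min_y, __m128i mask) {
    // Per-lane byte offsets relative to min_y, packed down to 8 bits.
    const __m128i m = _mm_set1_epi16(min_y);
    const __m128i off8 = _mm_packs_epi16(
        _mm_sub_epi16(_mm_loadu_si128((const __m128i *)base_y), m),
        _mm_sub_epi16(_mm_loadu_si128((const __m128i *)(base_y + 8)), m));
    // One masked 16-byte load covers every sample; pshufb routes each lane's
    // byte into place, and the result is widened for the 16-bit arithmetic.
    const __m128i bytes = _mm_maskload_epi32((const int *)(left + min_y), mask);
    return _mm256_cvtepu8_epi16(_mm_shuffle_epi8(bytes, off8));
  }
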
Change-Id: I1c486c5f16b598e7cfc4fa67d1efd6577bdfb793
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 357f69e..1a07e24 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -3569,7 +3569,19 @@
   { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
   { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
 };
-
+/* clang-format off */
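+// Row i enables the first i + 1 dwords for _mm_maskload_epi32.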
+static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
+  { -1,  0,  0,  0,  0,  0,  0,  0},
+  { -1, -1,  0,  0,  0,  0,  0,  0},
+  { -1, -1, -1,  0,  0,  0,  0,  0},
+  { -1, -1, -1, -1,  0,  0,  0,  0},
+  { -1, -1, -1, -1, -1,  0,  0,  0},
+  { -1, -1, -1, -1, -1, -1,  0,  0},
+  { -1, -1, -1, -1, -1, -1, -1,  0},
+  { -1, -1, -1, -1, -1, -1, -1, -1},
+};
+/* clang-format on */
 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
     int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
     int dx) {
@@ -4197,27 +4208,55 @@
 
         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
-        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
-        _mm256_store_si256((__m256i *)base_y_c, base_y_c256); /**/
 
-        a0_y = _mm256_setr_epi16(
-            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
-            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
-            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
-            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
-            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
-            left[base_y_c[15]]);
-        base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
-        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+        base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
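+        // Indices decrease across lanes: lane 0 is the max, lane 15 the min.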
+        int16_t min_y = _mm256_extract_epi16(base_y_c256, 15);
+        int16_t max_y = _mm256_extract_epi16(base_y_c256, 0);
+        int16_t offset_diff = max_y - min_y;
 
-        a1_y = _mm256_setr_epi16(
-            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
-            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
-            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
-            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
-            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
-            left[base_y_c[15]]);
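+        // If all 16 samples fit in one 16-byte window, fetch them with a
+        // single masked load and route each byte into place with pshufb.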
+        if (offset_diff < 16) {
+          __m256i min_y256 = _mm256_set1_epi16(min_y);
 
+          __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
+          __m128i base_y_offset128 =
+              _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
+                              _mm256_extracti128_si256(base_y_offset, 1));
+
+          __m128i a0_y128 = _mm_maskload_epi32(
+              (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
+          __m128i a1_y128 =
+              _mm_maskload_epi32((int *)(left + min_y + 1),
+                                 *(__m128i *)LoadMaskz2[offset_diff / 4]);
+          a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
+          a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
+          a0_y = _mm256_cvtepu8_epi16(a0_y128);
+          a1_y = _mm256_cvtepu8_epi16(a1_y128);
+        } else {
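+          // Span of 16+ bytes: keep the scalar gather.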
+          base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+          a0_y = _mm256_setr_epi16(
+              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+              left[base_y_c[15]]);
+          base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
+          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+          a1_y = _mm256_setr_epi16(
+              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+              left[base_y_c[15]]);
+        }
         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
 
         diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 128baa2..16ace15 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -1377,7 +1377,10 @@
     int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
     if (use_filter_intra) need_bottom = 0;
     if (is_dr_mode) need_bottom = p_angle > 180;
-    const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
+    // The avx2 dr_prediction_z2 may read up to 3 bytes past the last needed
+    // sample because its masked load works at dword granularity, so
+    // initialize 3 extra bytes to silence the valgrind complaint.
+    const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3);
     i = 0;
     if (n_left_px > 0) {
       for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
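
Note on the 3-byte figure above: LoadMaskz2[offset_diff / 4] enables whole
dwords, so the masked load can touch up to 3 bytes past the last sample the
shuffle consumes. A worked worst case, with illustrative values:

  int offset_diff = 12;                        // indices span 13 bytes
  int bytes_read = (offset_diff / 4 + 1) * 4;  // mask row 3 enables 16 bytes
  int bytes_needed = offset_diff + 1;          // 13 bytes used by the shuffle
  int extra = bytes_read - bytes_needed;       // 3, hence the 3-byte padding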