Add AVX2 for mhccp_implicit_fetch_neighbor

This patch includes the following changes.

1. AVX2 implementations of mhccp_implicit_fetch_neighbor for luma (4:2:0) and chroma.
2. Unit tests that check bit-exact matching between the mhccp_implicit_fetch_neighbor C functions and the AVX2 intrinsic functions.

The scaling (speedup) of the AVX2 implementation w.r.t. the C implementation is shown below:
| BW     | Scaling Y | Scaling UV |
|--------|-----------|------------|
|   4    |    2.34x  |    3.28x   |
|   8    |    3.96x  |    5.01x   |
|   16   |    7.08x  |    5.99x   |
|   32   |    9.08x  |   11.43x   |
|   64   |   10.19x  |    NA      |


This is a bit-exact change.

This change reduces key-frame decode time by 0.5% on the A2 test set (measured on commit 249b80e8).
diff --git a/av2/common/av2_rtcd_defs.pl b/av2/common/av2_rtcd_defs.pl
index c1ead36..08f4115 100644
--- a/av2/common/av2_rtcd_defs.pl
+++ b/av2/common/av2_rtcd_defs.pl
@@ -452,6 +452,12 @@
 add_proto qw/av2_cfl_subtract_average_fn av2_cfl_get_subtract_average_fn/, "TX_SIZE tx_size";
 specialize qw/av2_cfl_get_subtract_average_fn sse2 avx2 neon vsx/;
 
+add_proto qw/void av2_mhccp_implicit_fetch_neighbor_chroma/, "const uint16_t *dst, int input_stride, TX_SIZE tx_size, int above_lines, int left_lines, int is_top_sb_boundary, int ref_width, int ref_height, uint16_t *output_q3";
+specialize qw/av2_mhccp_implicit_fetch_neighbor_chroma avx2/;
+
+add_proto qw/void av2_mhccp_implicit_fetch_neighbor_luma_420/, "const uint16_t *input, int input_stride, int above_lines, int left_lines, int is_top_sb_boundary, int ref_width, int ref_height, int sub_y, uint8_t cfl_ds_filter_index, int width, int height, uint16_t *output_q3, int output_stride";
+specialize qw/av2_mhccp_implicit_fetch_neighbor_luma_420 avx2/;
+
 add_proto qw/av2_cfl_subsample_hbd_fn av2_cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
 specialize qw/av2_cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
 
diff --git a/av2/common/reconintra.c b/av2/common/reconintra.c
index e729709..be6528d 100644
--- a/av2/common/reconintra.c
+++ b/av2/common/reconintra.c
@@ -1826,6 +1826,82 @@
   }
 }
 
+// Fetch neighboring luma samples for multi hypothesis cross component
+// prediction for 4:2:0 case.
+void av2_mhccp_implicit_fetch_neighbor_luma_420_c(
+    const uint16_t *input, int input_stride, int above_lines, int left_lines,
+    int is_top_sb_boundary, int ref_width, int ref_height, int sub_y,
+    uint8_t cfl_ds_filter_index, int width, int height, uint16_t *output_q3,
+    int output_stride) {
+  for (int h = 0; h < ref_height; h += 2) {
+    for (int w = 0; w < ref_width; w += 2) {
+      const int bot = w + input_stride;
+      if ((h >= above_lines && w >= left_lines + width) ||
+          (h >= above_lines + height && w >= left_lines))
+        continue;
+      // For blocks at the superblock top boundary, only one line above is
+      // available, so the above region needs offset values. Proposal E229
+      // (4:2:0 case) proposes using only 4 lines and 2 padding lines for
+      // the luma reference region, and 2 lines and 1 padding line for the
+      // chroma reference region. Therefore, for these 2 padding lines above
+      // and to the left, we need to offset the reference region for the top
+      // and left boundaries.
+      // ref_h_t_off is the position offset value of the top pixel in
+      // cfl_ds_filter_index == 2 for both padding and superblock top boundary.
+      int ref_h_t_off = 0;
+      // ref_h_c_off is the position offset value of the pixel in the same
+      // horizontal line of the center pixel for both padding and superblock
+      // top boundary.
+      int ref_h_c_off = 0;
+      // ref_h_b_off is the position offset value of the bottom pixel in
+      // downsample filtering for both padding and superblock top boundary.
+      int ref_h_b_off = 0;
+      // ref_w_off is the position offset value for padding only in the left
+      // and right directions.
+      int ref_w_off = 0;
+      // Preparing the vertical position offset values for superblock top
+      // boundary and padding.
+      if (above_lines == ((LINE_NUM + 1) << sub_y)) {
+        if (is_top_sb_boundary && (h < above_lines)) {
+          // For the top boundary of the superblock, we need to offset the
+          // reference region.
+          ref_h_t_off = h != 0 ? ((LINE_NUM + 1) << sub_y) - h
+                               : ((LINE_NUM + 1) << sub_y) - (h + 1);
+          ref_h_c_off = ((LINE_NUM + 1) << sub_y) - (h + 1);
+          ref_h_b_off = ((LINE_NUM + 1) << sub_y) - (h + 2);
+        }
+      }
+      if (cfl_ds_filter_index == 1) {
+        output_q3[w >> 1] =
+            input[AVMMAX(0, w - 1) + ref_w_off + ref_h_c_off * input_stride] +
+            2 * input[w + ref_w_off + ref_h_c_off * input_stride] +
+            input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
+            input[bot + AVMMAX(-1, -w) + ref_w_off +
+                  ref_h_b_off * input_stride] +
+            2 * input[bot + ref_w_off + ref_h_b_off * input_stride] +
+            input[bot + 1 + ref_w_off + ref_h_b_off * input_stride];
+      } else if (cfl_ds_filter_index == 2) {
+        const int top = h != 0 ? w - input_stride : w;
+        output_q3[w >> 1] =
+            input[AVMMAX(0, w - 1) + ref_w_off + ref_h_c_off * input_stride] +
+            4 * input[w + ref_w_off + ref_h_c_off * input_stride] +
+            input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
+            input[top + ref_w_off + ref_h_t_off * input_stride] +
+            input[bot + ref_w_off + ref_h_b_off * input_stride];
+      } else {
+        output_q3[w >> 1] =
+            (input[w + ref_w_off + ref_h_c_off * input_stride] +
+             input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
+             input[bot + ref_w_off + ref_h_b_off * input_stride] +
+             input[bot + 1 + ref_w_off + ref_h_b_off * input_stride])
+            << 1;
+      }
+    }
+    output_q3 += output_stride;
+    input += (input_stride << 1);
+  }
+}
+
 void mhccp_implicit_fetch_neighbor_luma(const AV2_COMMON *cm,
                                         MACROBLOCKD *const xd, int row, int col,
                                         TX_SIZE tx_size, int *above_lines,
@@ -1950,76 +2026,10 @@
   input = input - (*above_lines) * input_stride - *left_lines;
   if ((*above_lines) || (*left_lines)) {
     if (sub_x && sub_y) {
-      for (int h = 0; h < (*ref_height); h += 2) {
-        for (int w = 0; w < (*ref_width); w += 2) {
-          const int bot = w + input_stride;
-          if ((h >= *above_lines && w >= *left_lines + width) ||
-              (h >= *above_lines + height && w >= *left_lines))
-            continue;
-          // For blocks at the superblock top boundary, we only have one line
-          // above available, therefore we need to offset values for above
-          // region Proposal E229 (for 4:2:0 case) propose to use only 4 lines
-          // and 2 padding lines for the luma reference region, and 2 lines and
-          // 1 padding line for the chroma reference region. Therefore, for
-          // these 2 padding lines above and to the left, we need to offset the
-          // reference region for the top and left boundaries ref_h_t_off is the
-          // position offset value of the top pixel in cfl_ds_filter_index == 2
-          // for both padding and superblock top boundary
-          int ref_h_t_off = 0;
-          // ref_h_c_off is the position offset value of the pixel in the same
-          // horizontal line of the center pixel for both padding and superblock
-          // top boundary
-          int ref_h_c_off = 0;
-          // ref_h_b_off is the position offset value of the bottom pixel in
-          // downsample filtering for both padding and superblock top boundary
-          int ref_h_b_off = 0;
-          // ref_w_off is the position offset value for padding only in the left
-          // and right directions
-          int ref_w_off = 0;
-          // Preparing the vertical position offset values for superblock top
-          // boundary and padding
-          if (*above_lines == ((LINE_NUM + 1) << sub_y)) {
-            if (is_top_sb_boundary && (h < *above_lines)) {
-              // For the top boundary of the superblock, we need to offset the
-              // reference region
-              ref_h_t_off = h != 0 ? ((LINE_NUM + 1) << sub_y) - h
-                                   : ((LINE_NUM + 1) << sub_y) - (h + 1);
-              ref_h_c_off = ((LINE_NUM + 1) << sub_y) - (h + 1);
-              ref_h_b_off = ((LINE_NUM + 1) << sub_y) - (h + 2);
-            }
-          }
-          if (cm->seq_params.cfl_ds_filter_index == 1) {
-            output_q3[w >> 1] =
-                input[AVMMAX(0, w - 1) + ref_w_off +
-                      ref_h_c_off * input_stride] +
-                2 * input[w + ref_w_off + ref_h_c_off * input_stride] +
-                input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
-                input[bot + AVMMAX(-1, -w) + ref_w_off +
-                      ref_h_b_off * input_stride] +
-                2 * input[bot + ref_w_off + ref_h_b_off * input_stride] +
-                input[bot + 1 + ref_w_off + ref_h_b_off * input_stride];
-          } else if (cm->seq_params.cfl_ds_filter_index == 2) {
-            const int top = h != 0 ? w - input_stride : w;
-            output_q3[w >> 1] =
-                input[AVMMAX(0, w - 1) + ref_w_off +
-                      ref_h_c_off * input_stride] +
-                4 * input[w + ref_w_off + ref_h_c_off * input_stride] +
-                input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
-                input[top + ref_w_off + ref_h_t_off * input_stride] +
-                input[bot + ref_w_off + ref_h_b_off * input_stride];
-          } else {
-            output_q3[w >> 1] =
-                (input[w + ref_w_off + ref_h_c_off * input_stride] +
-                 input[w + 1 + ref_w_off + ref_h_c_off * input_stride] +
-                 input[bot + ref_w_off + ref_h_b_off * input_stride] +
-                 input[bot + 1 + ref_w_off + ref_h_b_off * input_stride])
-                << 1;
-          }
-        }
-        output_q3 += output_stride;
-        input += (input_stride << 1);
-      }
-
+      av2_mhccp_implicit_fetch_neighbor_luma_420(
+          input, input_stride, *above_lines, *left_lines, is_top_sb_boundary,
+          *ref_width, *ref_height, sub_y, cm->seq_params.cfl_ds_filter_index,
+          width, height, output_q3, output_stride);
     } else if (sub_x) {
       for (int h = 0; h < (*ref_height); h++) {
         for (int i = 0; i < (*ref_width); i += 2) {
@@ -2096,22 +2106,16 @@
   }
 }
 
-void mhccp_implicit_fetch_neighbor_chroma(MACROBLOCKD *const xd, int plane,
-                                          int row, int col, TX_SIZE tx_size,
-                                          int above_lines, int left_lines,
-                                          int is_top_sb_boundary, int ref_width,
-                                          int ref_height) {
-  CFL_CTX *const cfl = &xd->cfl;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  int input_stride = pd->dst.stride;
-  uint16_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
-
+// Fetch neighboring chroma samples for multi hypothesis cross component
+// prediction.
+void av2_mhccp_implicit_fetch_neighbor_chroma_c(
+    const uint16_t *input, int input_stride, TX_SIZE tx_size, int above_lines,
+    int left_lines, int is_top_sb_boundary, int ref_width, int ref_height,
+    uint16_t *output_q3) {
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
 
-  uint16_t *output_q3 = cfl->mhccp_ref_buf_q3[plane];
   int output_stride = CFL_BUF_LINE * 2;
-  uint16_t *input = dst - above_lines * input_stride - left_lines;
   if (above_lines || left_lines) {
     for (int h = 0; h < ref_height; ++h) {
       for (int w = 0; w < ref_width; ++w) {
@@ -2238,9 +2242,12 @@
         left_lines >>= sub_x;
         ref_width >>= sub_x;
         ref_height >>= sub_y;
-        mhccp_implicit_fetch_neighbor_chroma(
-            xd, plane, blk_row, blk_col, tx_size, above_lines, left_lines,
-            is_top_sb_boundary, ref_width, ref_height);
+        uint16_t *output_q3 = cfl->mhccp_ref_buf_q3[plane];
+        uint16_t *input = dst - above_lines * dst_stride - left_lines;
+
+        av2_mhccp_implicit_fetch_neighbor_chroma(
+            input, dst_stride, tx_size, above_lines, left_lines,
+            is_top_sb_boundary, ref_width, ref_height, output_q3);
         av2_mhccp_derive_multi_param_hv(xd, plane, above_lines, left_lines,
                                         ref_width, ref_height, mbmi->mh_dir,
                                         is_top_sb_boundary);
diff --git a/av2/common/reconintra.h b/av2/common/reconintra.h
index 1205efe..4c03af1 100644
--- a/av2/common/reconintra.h
+++ b/av2/common/reconintra.h
@@ -263,13 +263,6 @@
                                         TX_SIZE tx_size, int *above_lines,
                                         int *left_lines, int is_top_sb_boundary,
                                         int *ref_width, int *ref_height);
-// fetch neighboring chroma samples for multi hypothesis cross component
-// prediction
-void mhccp_implicit_fetch_neighbor_chroma(MACROBLOCKD *const xd, int plane,
-                                          int row, int col, TX_SIZE tx_size,
-                                          int above_lines, int left_lines,
-                                          int is_top_sb_boundary, int ref_width,
-                                          int ref_height);
 
 static AVM_INLINE void set_have_top_and_left(int *have_top, int *have_left,
                                              const MACROBLOCKD *xd, int row_off,
diff --git a/av2/common/x86/cfl_avx2.c b/av2/common/x86/cfl_avx2.c
index 5835dda..b97fea5 100644
--- a/av2/common/x86/cfl_avx2.c
+++ b/av2/common/x86/cfl_avx2.c
@@ -1823,3 +1823,211 @@
         1 << MHCCP_DECIM_BITS;
   }
 }
+
+// AVX2 implementation for av2_mhccp_implicit_fetch_neighbor_chroma_c
+void av2_mhccp_implicit_fetch_neighbor_chroma_avx2(
+    const uint16_t *input, int input_stride, TX_SIZE tx_size, int above_lines,
+    int left_lines, int is_top_sb_boundary, int ref_width, int ref_height,
+    uint16_t *output_q3) {
+  (void)is_top_sb_boundary;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+
+  int output_stride = CFL_BUF_LINE * 2;
+  if (above_lines || left_lines) {
+    for (int h = 0; h < ref_height; ++h) {
+      assert(LINE_NUM == 1);
+      int ref_h_offset = (above_lines == (LINE_NUM + 1) && h == 0) ? 1 : 0;
+      int h_offset_stride = ref_h_offset * input_stride;
+      int w = 0;
+      for (; w < ref_width; w += 16) {
+        if ((h >= above_lines && w >= left_lines + width) ||
+            (h >= above_lines + height && w >= left_lines))
+          continue;
+        __m256i l_vec =
+            _mm256_loadu_si256((const __m256i *)(input + h_offset_stride + w));
+        if (left_lines == (LINE_NUM + 1) && (w == 0)) {
+          __m128i lo = _mm256_castsi256_si128(l_vec);
+          lo = _mm_shufflelo_epi16(lo, _MM_SHUFFLE(3, 2, 1, 1));
+          l_vec = _mm256_inserti128_si256(l_vec, lo, 0);
+        }
+        _mm256_storeu_si256((__m256i *)(output_q3 + w), l_vec);
+      }
+      output_q3 += output_stride;
+      input += input_stride;
+    }
+  }
+}
+
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_index[32]) = { 0,  1,  4, 5,  8,  9, 12, 13, 0,  1, 4,
+                                       5,  8,  9, 12, 13, 0, 1,  4,  5,  8, 9,
+                                       12, 13, 0, 1,  4,  5, 8,  9,  12, 13 };
+DECLARE_ALIGNED(32, static const uint16_t,
+                blend_index[16]) = { 0xFFFF, 0, 0, 0, 0, 0, 0, 0,
+                                     0xFFFF, 0, 0, 0, 0, 0, 0, 0 };
+
+// AVX2 implementation for av2_mhccp_implicit_fetch_neighbor_luma_420_c
+void av2_mhccp_implicit_fetch_neighbor_luma_420_avx2(
+    const uint16_t *input, int input_stride, int above_lines, int left_lines,
+    int is_top_sb_boundary, int ref_width, int ref_height, int sub_y,
+    uint8_t cfl_ds_filter_index, int width, int height, uint16_t *output_q3,
+    int output_stride) {
+  for (int h = 0; h < ref_height; h += 2) {
+    int ref_h_t_off = 0;
+    int ref_h_c_off = 0;
+    int ref_h_b_off = 0;
+    int ref_w_off = 0;
+    if (above_lines == ((LINE_NUM + 1) << sub_y)) {
+      if (is_top_sb_boundary && (h < above_lines)) {
+        // For the top boundary of the superblock, we need to offset the
+        // reference region
+        ref_h_t_off = h != 0 ? ((LINE_NUM + 1) << sub_y) - h
+                             : ((LINE_NUM + 1) << sub_y) - (h + 1);
+        ref_h_c_off = ((LINE_NUM + 1) << sub_y) - (h + 1);
+        ref_h_b_off = ((LINE_NUM + 1) << sub_y) - (h + 2);
+      }
+    }
+    int h_c_offset_stride = ref_h_c_off * input_stride;
+    int h_b_offset_stride = ref_h_b_off * input_stride + input_stride;
+    for (int w = 0; w < ref_width; w += 32) {
+      if ((h >= above_lines && w >= left_lines + width) ||
+          (h >= above_lines + height && w >= left_lines))
+        continue;
+
+      // h_c0, h_c1, ... h_c15
+      __m256i h_c0 =
+          _mm256_loadu_si256((const __m256i *)(input + h_c_offset_stride + w));
+
+      // h_c16, h_c17, ... h_c31
+      __m256i h_c16 = _mm256_loadu_si256(
+          (const __m256i *)(input + h_c_offset_stride + w + 16));
+
+      // h_b0, h_b1, ..... h_b15
+      __m256i h_b0 =
+          _mm256_loadu_si256((const __m256i *)(input + h_b_offset_stride + w));
+
+      // h_b16, h_b17, ..... h_b31
+      __m256i h_b16 = _mm256_loadu_si256(
+          (const __m256i *)(input + h_b_offset_stride + w + 16));
+
+      if (cfl_ds_filter_index == 1 || cfl_ds_filter_index == 2) {
+        __m256i blend_mask = _mm256_load_si256((__m256i *)blend_index);
+        __m256i h_cm1;
+        if (w > 0) {
+          h_cm1 = _mm256_loadu_si256(
+              (const __m256i *)(input + h_c_offset_stride + ref_w_off + w - 1));
+        } else {
+          // h_c7, h_c0, h_c1, h_c2, h_c3, h_c4, h_c5, h_c6, h_c15, h_c8, h_c9,
+          // h_c10, h_c11, h_c12, h_c13, h_c14
+          __m256i w_reorder1 = _mm256_alignr_epi8(h_c0, h_c0, 14);
+          // h_c0, h_c1, h_c2, h_c3, h_c4, h_c5, h_c6, h_c7, h_c7, h_c0, h_c1,
+          // h_c2, h_c3, h_c4, h_c5, h_c6
+          __m256i w_reorder2 =
+              _mm256_permute2x128_si256(h_c0, w_reorder1, 0x20);
+          // h_c0, h_c0, h_c1, h_c2, h_c3, h_c4, h_c5, h_c6, h_c7, h_c8, h_c9,
+          // h_c10, h_c11, h_c12, h_c13, h_c14
+          h_cm1 = _mm256_blendv_epi8(w_reorder1, w_reorder2, blend_mask);
+        }
+
+        // h_c15, h_c16, ... h_c30
+        __m256i h_c15 = _mm256_loadu_si256(
+            (const __m256i *)(input + h_c_offset_stride + w + 15));
+
+        if (cfl_ds_filter_index == 1) {
+          __m256i h_bm1;
+          if (w > 0) {
+            h_bm1 =
+                _mm256_loadu_si256((const __m256i *)(input + h_b_offset_stride +
+                                                     ref_w_off + w - 1));
+          } else {
+            // h_b7, h_b0, h_b1, h_b2, h_b3, h_b4, h_b5, h_b6, h_b15, h_b8,
+            // h_b9, h_b10, h_b11, h_b12, h_b13, h_b14
+            __m256i b_reorder1 = _mm256_alignr_epi8(h_b0, h_b0, 14);
+            // h_b0, h_b1, h_b2, h_b3, h_b4, h_b5, h_b6, h_b7, h_b7, h_b0, h_b1,
+            // h_b2, h_b3, h_b4, h_b5, h_b6
+            __m256i b_reorder2 =
+                _mm256_permute2x128_si256(h_b0, b_reorder1, 0x20);
+
+            // h_b0, h_b0, h_b1, h_b2, h_b3, h_b4, h_b5, h_b6, h_b7, h_b8, h_b9,
+            // h_b10, h_b11, h_b12, h_b13, h_b14
+            h_bm1 = _mm256_blendv_epi8(b_reorder1, b_reorder2, blend_mask);
+          }
+
+          // h_b15, h_b16, ..... h_b30
+          __m256i h_b15 = _mm256_loadu_si256(
+              (const __m256i *)(input + h_b_offset_stride + w + 15));
+
+          __m256i h_cm1_c0 = _mm256_add_epi16(h_cm1, h_c0);
+          __m256i h_bm1_b0 = _mm256_add_epi16(h_bm1, h_b0);
+          __m256i h_c15_c16 = _mm256_add_epi16(h_c15, h_c16);
+          __m256i h_b15_b16 = _mm256_add_epi16(h_b15, h_b16);
+
+          __m256i h_cm1_c0_bm1_b0 = _mm256_add_epi16(h_cm1_c0, h_bm1_b0);
+          __m256i h_c15_c16_b15_b16 = _mm256_add_epi16(h_c15_c16, h_b15_b16);
+
+          __m256i res = _mm256_hadd_epi16(h_cm1_c0_bm1_b0, h_c15_c16_b15_b16);
+
+          res = _mm256_permute4x64_epi64(res, 0xD8);
+
+          _mm256_storeu_si256((__m256i *)(output_q3 + (w >> 1)), res);
+        } else if (cfl_ds_filter_index == 2) {
+          const int top = h != 0 ? (-input_stride) : 0;
+          int h_t_offset_stride = top + ref_h_t_off * input_stride;
+
+          __m256i h_t0 = _mm256_loadu_si256(
+              (const __m256i *)(input + h_t_offset_stride + w));
+          __m256i h_t16 = _mm256_loadu_si256(
+              (const __m256i *)(input + h_t_offset_stride + w + 16));
+
+          const __m256i shuffle_mask =
+              _mm256_load_si256((__m256i *)shuffle_index);
+          // h_c0, h_c2, h_c4, ....
+          __m256i h_c0_even = _mm256_shuffle_epi8(h_c0, shuffle_mask);
+          // h_b0, h_b2, h_b4, ....
+          __m256i h_b0_even = _mm256_shuffle_epi8(h_b0, shuffle_mask);
+          __m256i b0_c0_even = _mm256_unpacklo_epi16(h_b0_even, h_c0_even);
+          // h_t0, h_t2, h_t4, ....
+          __m256i h_t0_even = _mm256_shuffle_epi8(h_t0, shuffle_mask);
+          __m256i t0_c0_even = _mm256_unpacklo_epi16(h_t0_even, h_c0_even);
+
+          // h_c16, h_c18, h_c20, ....
+          __m256i h_c16_even = _mm256_shuffle_epi8(h_c16, shuffle_mask);
+          // h_b16, h_b18, h_b20, ....
+          __m256i h_b16_even = _mm256_shuffle_epi8(h_b16, shuffle_mask);
+          __m256i b16_c16_even = _mm256_unpacklo_epi16(h_b16_even, h_c16_even);
+          // h_t16, h_t18, h_t20, ....
+          __m256i h_t16_even = _mm256_shuffle_epi8(h_t16, shuffle_mask);
+          __m256i t16_c16_even = _mm256_unpacklo_epi16(h_t16_even, h_c16_even);
+
+          __m256i h_cm1_c0 = _mm256_add_epi16(h_cm1, h_c0);
+          __m256i bc0_tc0_even = _mm256_add_epi16(b0_c0_even, t0_c0_even);
+          __m256i h_c15_c16 = _mm256_add_epi16(h_c15, h_c16);
+          __m256i bc16_tc16_even = _mm256_add_epi16(b16_c16_even, t16_c16_even);
+
+          __m256i h_cm1_c0_bc0_tc0_even =
+              _mm256_add_epi16(h_cm1_c0, bc0_tc0_even);
+          __m256i h_c15_c16_bc16_tc16_even =
+              _mm256_add_epi16(h_c15_c16, bc16_tc16_even);
+
+          __m256i res = _mm256_hadd_epi16(h_cm1_c0_bc0_tc0_even,
+                                          h_c15_c16_bc16_tc16_even);
+
+          res = _mm256_permute4x64_epi64(res, 0xD8);
+          _mm256_storeu_si256((__m256i *)(output_q3 + (w >> 1)), res);
+        }
+      } else {
+        __m256i h_c0_b0 = _mm256_add_epi16(h_c0, h_b0);
+        __m256i h_c16_b16 = _mm256_add_epi16(h_c16, h_b16);
+
+        __m256i res = _mm256_hadd_epi16(h_c0_b0, h_c16_b16);
+        res = _mm256_permute4x64_epi64(res, 0xD8);
+        res = _mm256_slli_epi16(res, 1);
+
+        _mm256_storeu_si256((__m256i *)(output_q3 + (w >> 1)), res);
+      }
+    }
+    output_q3 += output_stride;
+    input += (input_stride << 1);
+  }
+}
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 452291d..11ea695 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -883,4 +883,283 @@
 INSTANTIATE_TEST_SUITE_P(VSX, CFLSubAvgTest,
                          ::testing::ValuesIn(sub_avg_sizes_vsx));
 #endif
+
+#if HAVE_AVX2
+std::array<std::array<int, 2>, 3> chroma_above_left_line_pairs = {
+  { { 0, 2 }, { 2, 0 }, { 2, 2 } }
+};
+
+std::array<std::array<int, 2>, 3> luma_above_left_line_pairs = {
+  { { 0, 4 }, { 4, 0 }, { 4, 4 } }
+};
+#endif  // HAVE_AVX2
+
+const int chroma_tx_size[] = { 0, 1,  2,  3,  5,  6,  7,  8,
+                               9, 10, 13, 14, 15, 16, 19, 20 };
+
+typedef void (*av2_mhccp_implicit_fetch_neighbor_chroma_fn)(
+    const uint16_t *dst, int input_stride, TX_SIZE tx_size, int above_lines,
+    int left_lines, int is_top_sb_boundary, int ref_width, int ref_height,
+    uint16_t *output_q3);
+
+typedef std::tuple<std::array<int, 2>, int, int,
+                   av2_mhccp_implicit_fetch_neighbor_chroma_fn>
+    mhccp_derive_param_chroma;
+
+class MhccpImplicitFetchNeighbourChromaTest
+    : public ::testing::TestWithParam<mhccp_derive_param_chroma> {
+ public:
+  void SetUp() override {
+    std::array<int, 2> arrlin = std::get<0>(GetParam());
+    above_lines_ = arrlin[0];
+    left_lines_ = arrlin[1];
+    tx_size_ = std::get<1>(GetParam());
+    bd_ = std::get<2>(GetParam());
+
+    tgt_fn_ = std::get<3>(GetParam());
+    ref_fn_ = av2_mhccp_implicit_fetch_neighbor_chroma_c;
+
+    ref_width_ = AVMMIN((tx_size_wide[tx_size_] + left_lines_), 128);
+    ref_height_ = AVMMIN((tx_size_high[tx_size_] + above_lines_), 128);
+
+    initData();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  av2_mhccp_implicit_fetch_neighbor_chroma_fn tgt_fn_;
+  av2_mhccp_implicit_fetch_neighbor_chroma_fn ref_fn_;
+  int above_lines_, left_lines_, ref_width_, ref_height_, bd_;
+  TX_SIZE tx_size_;
+  uint16_t *output_q3_ref_, *output_q3_tgt_, *input_;
+  ACMRandom rnd_;
+
+  void initData() {
+    int num_w, num_h;
+    num_w = num_h = (CFL_BUF_LINE << 1);
+    const uint16_t mask = (1 << bd_) - 1;
+    input_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+    for (int i = 0; i < num_h; ++i) {
+      for (int j = 0; j < num_w; ++j) {
+        uint16_t val = this->rnd_.Rand16() & mask;
+        input_[i * num_w + j] = val;
+      }
+    }
+    output_q3_ref_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+    output_q3_tgt_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+  }
+
+  void TearDown() override {
+    free(input_);
+    free(output_q3_ref_);
+    free(output_q3_tgt_);
+  }
+
+  void assertMhccpChromaEqParams(int above_l, int left_l, int width, int height,
+                                 int ref_w, int ref_h, int stride) {
+    uint16_t *ref_ptr = output_q3_ref_;
+    uint16_t *tgt_ptr = output_q3_tgt_;
+    for (int i = 0; i < ref_h; ++i) {
+      for (int j = 0; j < ref_w; ++j) {
+        if ((i >= above_l && j >= left_l + width) ||
+            (i >= above_l + height && j >= left_l))
+          continue;
+        const uint16_t ref_val = ref_ptr[i * stride + j];
+        const uint16_t tgt_val = tgt_ptr[i * stride + j];
+        ASSERT_EQ(ref_val, tgt_val)
+            << "Mismatch at index[" << (i * stride + j) << "] ref=" << ref_val
+            << " tgt=" << tgt_val;
+      }
+    }
+  }
+};
+
+TEST_P(MhccpImplicitFetchNeighbourChromaTest, CompareCAndAVX2) {
+  const int is_top_sb_boundary = rnd_(2);
+  int dst_stride_ = CFL_BUF_LINE << 1;
+  ref_fn_(input_, dst_stride_, tx_size_, above_lines_, left_lines_,
+          is_top_sb_boundary, ref_width_, ref_height_, output_q3_ref_);
+
+  tgt_fn_(input_, dst_stride_, tx_size_, above_lines_, left_lines_,
+          is_top_sb_boundary, ref_width_, ref_height_, output_q3_tgt_);
+
+  assertMhccpChromaEqParams(above_lines_, left_lines_, tx_size_wide[tx_size_],
+                            tx_size_high[tx_size_], ref_width_, ref_height_,
+                            dst_stride_);
+}
+
+TEST_P(MhccpImplicitFetchNeighbourChromaTest, DISABLED_SpeedTest) {
+  const int is_top_sb_boundary = 0;
+  avm_usec_timer ref_timer, tgt_timer;
+  int dst_stride_ = CFL_BUF_LINE << 1;
+
+  avm_usec_timer_start(&ref_timer);
+  for (int i = 0; i < NUM_ITERATIONS_SPEED; ++i) {
+    ref_fn_(input_, dst_stride_, tx_size_, above_lines_, left_lines_,
+            is_top_sb_boundary, ref_width_, ref_height_, output_q3_ref_);
+  }
+  avm_usec_timer_mark(&ref_timer);
+  const int ref_time = (int)avm_usec_timer_elapsed(&ref_timer);
+
+  avm_usec_timer_start(&tgt_timer);
+  for (int i = 0; i < NUM_ITERATIONS_SPEED; ++i) {
+    tgt_fn_(input_, dst_stride_, tx_size_, above_lines_, left_lines_,
+            is_top_sb_boundary, ref_width_, ref_height_, output_q3_tgt_);
+  }
+  avm_usec_timer_mark(&tgt_timer);
+  const int tgt_time = (int)avm_usec_timer_elapsed(&tgt_timer);
+
+  printSpeed(ref_time, tgt_time, tx_size_wide[tx_size_],
+             tx_size_high[tx_size_]);
+  assertFaster(ref_time, tgt_time);
+}
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, MhccpImplicitFetchNeighbourChromaTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(chroma_above_left_line_pairs),
+        ::testing::ValuesIn(chroma_tx_size),  // tx_size
+        ::testing::Values(8, 10, 12),         // bd
+        ::testing::Values(av2_mhccp_implicit_fetch_neighbor_chroma_avx2)));
+#endif  // HAVE_AVX2
+
+typedef void (*av2_mhccp_implicit_fetch_neighbor_luma_420_fn)(
+    const uint16_t *input, int input_stride, int above_lines, int left_lines,
+    int is_top_sb_boundary, int ref_width, int ref_height, int sub_y,
+    uint8_t cfl_ds_filter_index, int width, int height, uint16_t *output_q3,
+    int output_stride);
+
+typedef std::tuple<std::array<int, 2>, int, int, int, int,
+                   av2_mhccp_implicit_fetch_neighbor_luma_420_fn>
+    mhccp_derive_param_luma;
+
+class MhccpImplicitFetchNeighbourLumaTest
+    : public ::testing::TestWithParam<mhccp_derive_param_luma> {
+ public:
+  void SetUp() override {
+    std::array<int, 2> arrlin = std::get<0>(GetParam());
+    above_lines_ = arrlin[0];
+    left_lines_ = arrlin[1];
+    cfl_ds_filter_index_ = std::get<1>(GetParam());
+    width_ = std::get<2>(GetParam()) << 1;
+    height_ = std::get<3>(GetParam()) << 1;
+    bd_ = std::get<4>(GetParam());
+
+    tgt_fn_ = std::get<5>(GetParam());
+    ref_fn_ = av2_mhccp_implicit_fetch_neighbor_luma_420_c;
+
+    initData();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  av2_mhccp_implicit_fetch_neighbor_luma_420_fn tgt_fn_;
+  av2_mhccp_implicit_fetch_neighbor_luma_420_fn ref_fn_;
+  int above_lines_, left_lines_, ref_width_, ref_height_, bd_;
+  int sub_y_, input_stride_, output_stride_;
+  int width_, height_;
+  uint16_t *output_q3_ref_, *output_q3_tgt_, *input_;
+  uint8_t cfl_ds_filter_index_;
+  ACMRandom rnd_;
+
+  void initData() {
+    int num_w, num_h;
+    sub_y_ = 1;
+    input_stride_ = output_stride_ = (CFL_BUF_LINE << 1);
+    ref_width_ = AVMMIN(left_lines_ + width_, 128);
+    ref_height_ = AVMMIN(above_lines_ + height_, 128);
+    num_w = num_h = (CFL_BUF_LINE << 1);
+    const uint16_t mask = (1 << bd_) - 1;
+    input_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+    for (int i = 0; i < num_h; ++i) {
+      for (int j = 0; j < num_w; ++j) {
+        uint16_t val = this->rnd_.Rand16() & mask;
+        input_[i * num_w + j] = val;
+      }
+    }
+    output_q3_ref_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+    output_q3_tgt_ = (uint16_t *)calloc(1, sizeof(uint16_t) * (num_h * num_w));
+  }
+
+  void TearDown() override {
+    free(input_);
+    free(output_q3_ref_);
+    free(output_q3_tgt_);
+  }
+
+  void assertMhccpLumaEqParams(int above_l, int left_l, int width, int height,
+                               int ref_w, int ref_h) {
+    uint16_t *ref_ptr = output_q3_ref_;
+    uint16_t *tgt_ptr = output_q3_tgt_;
+    for (int i = 0; i < ref_h; i += 2) {
+      for (int j = 0; j < ref_w; j += 2) {
+        if ((i >= above_l && j >= left_l + width) ||
+            (i >= above_l + height && j >= left_l))
+          continue;
+        const uint16_t ref_val = ref_ptr[(i >> 1) * output_stride_ + (j >> 1)];
+        const uint16_t tgt_val = tgt_ptr[(i >> 1) * output_stride_ + (j >> 1)];
+        ASSERT_EQ(ref_val, tgt_val)
+            << "Mismatch at index[" << ((i >> 1) * output_stride_ + (j >> 1))
+            << "] ref=" << ref_val << " tgt=" << tgt_val;
+      }
+    }
+  }
+};
+
+TEST_P(MhccpImplicitFetchNeighbourLumaTest, CompareCAndAVX2) {
+  const int is_top_sb_boundary = rnd_(2);
+  ref_fn_(input_, input_stride_, above_lines_, left_lines_, is_top_sb_boundary,
+          ref_width_, ref_height_, sub_y_, cfl_ds_filter_index_, width_,
+          height_, output_q3_ref_, output_stride_);
+
+  tgt_fn_(input_, input_stride_, above_lines_, left_lines_, is_top_sb_boundary,
+          ref_width_, ref_height_, sub_y_, cfl_ds_filter_index_, width_,
+          height_, output_q3_tgt_, output_stride_);
+
+  assertMhccpLumaEqParams(above_lines_, left_lines_, width_, height_,
+                          ref_width_, ref_height_);
+}
+
+TEST_P(MhccpImplicitFetchNeighbourLumaTest, DISABLED_SpeedTest) {
+  const int is_top_sb_boundary = 0;
+  avm_usec_timer ref_timer, tgt_timer;
+  avm_usec_timer_start(&ref_timer);
+  for (int i = 0; i < NUM_ITERATIONS_SPEED; ++i) {
+    ref_fn_(input_, input_stride_, above_lines_, left_lines_,
+            is_top_sb_boundary, ref_width_, ref_height_, sub_y_,
+            cfl_ds_filter_index_, width_, height_, output_q3_ref_,
+            output_stride_);
+  }
+  avm_usec_timer_mark(&ref_timer);
+  const int ref_time = (int)avm_usec_timer_elapsed(&ref_timer);
+
+  avm_usec_timer_start(&tgt_timer);
+  for (int i = 0; i < NUM_ITERATIONS_SPEED; ++i) {
+    tgt_fn_(input_, input_stride_, above_lines_, left_lines_,
+            is_top_sb_boundary, ref_width_, ref_height_, sub_y_,
+            cfl_ds_filter_index_, width_, height_, output_q3_tgt_,
+            output_stride_);
+  }
+  avm_usec_timer_mark(&tgt_timer);
+  const int tgt_time = (int)avm_usec_timer_elapsed(&tgt_timer);
+
+  printSpeed(ref_time, tgt_time, (width_ >> 1), (height_ >> 1));
+  assertFaster(ref_time, tgt_time);
+}
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, MhccpImplicitFetchNeighbourLumaTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(luma_above_left_line_pairs),
+        ::testing::Values(0, 1, 2),           // cfl_ds_filter_index_
+        ::testing::Values(4, 8, 16, 32, 64),  // width_
+        ::testing::Values(4, 8, 16, 32, 64),  // height_
+        ::testing::Values(8, 10, 12),         // bd
+        ::testing::Values(av2_mhccp_implicit_fetch_neighbor_luma_420_avx2)));
+#endif  // HAVE_AVX2
+
 }  // namespace