/*
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/x86/synonyms_avx2.h"
#include "aom_ports/mem.h"

// SAD
static AOM_FORCE_INLINE unsigned int get_sad_from_mm256_epi32(
    const __m256i *v) {
  // Input: eight 32-bit partial sums.
  __m128i lo128, hi128;
  __m256i u = _mm256_srli_si256(*v, 8);
  u = _mm256_add_epi32(u, *v);

  // Reduce to four 32-bit sums.
  hi128 = _mm256_extracti128_si256(u, 1);
  lo128 = _mm256_castsi256_si128(u);
  lo128 = _mm_add_epi32(hi128, lo128);

  // Reduce to two 32-bit sums.
  hi128 = _mm_srli_si128(lo128, 4);
  lo128 = _mm_add_epi32(lo128, hi128);

  return (unsigned int)_mm_cvtsi128_si32(lo128);
}

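// The *_core helpers below share one pattern: 16-bit absolute differences are
// summed in 16-bit lanes, widened to 32 bits with an unpack against zero, and
// added into the running accumulator. The _ds suffix marks the cores used by
// the downsampled (every-other-row) paths. As a rough headroom check (an
// observation, not a guarantee taken from the original code): with samples of
// at most 12 bits, each absolute difference fits in 12 bits, so even the
// five-way 16-bit sum in highbd_sad20x4_core_avx2 stays below
// 5 * 4095 = 20475 < 32767 before the widening unpack.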
static AOM_FORCE_INLINE void highbd_sad16x4_core_ds_avx2(__m256i *s, __m256i *r,
                                                         __m256i *sad_acc) {
  const __m256i zero = _mm256_setzero_si256();
  int i;
  for (i = 0; i < 2; i++) {
    s[i] = _mm256_sub_epi16(s[i], r[i]);
    s[i] = _mm256_abs_epi16(s[i]);
  }

  s[0] = _mm256_add_epi16(s[0], s[1]);

  r[0] = _mm256_unpacklo_epi16(s[0], zero);
  r[1] = _mm256_unpackhi_epi16(s[0], zero);

  r[0] = _mm256_add_epi32(r[0], r[1]);
  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
}

static AOM_FORCE_INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r,
                                                      __m256i *sad_acc) {
  const __m256i zero = _mm256_setzero_si256();
  int i;
  __m256i res[4];
  for (i = 0; i < 4; i++) {
    res[i] = _mm256_sub_epi16(s[i], r[i]);
    res[i] = _mm256_abs_epi16(res[i]);
  }

  res[0] = _mm256_add_epi16(res[0], res[1]);
  res[0] = _mm256_add_epi16(res[0], res[2]);
  res[0] = _mm256_add_epi16(res[0], res[3]);

  r[0] = _mm256_unpacklo_epi16(res[0], zero);
  r[1] = _mm256_unpackhi_epi16(res[0], zero);

  r[0] = _mm256_add_epi32(r[0], r[1]);
  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
}

static AOM_FORCE_INLINE void highbd_sad16x2_core_avx2(__m256i *s, __m256i *r,
                                                      __m256i *sad_acc) {
  const __m256i zero = _mm256_setzero_si256();
  int i;
  __m256i res[2];
  for (i = 0; i < 2; i++) {
    res[i] = _mm256_sub_epi16(s[i], r[i]);
    res[i] = _mm256_abs_epi16(res[i]);
  }

  res[0] = _mm256_add_epi16(res[0], res[1]);

  r[0] = _mm256_unpacklo_epi16(res[0], zero);
  r[1] = _mm256_unpackhi_epi16(res[0], zero);

  r[0] = _mm256_add_epi32(r[0], r[1]);
  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
}

static AOM_FORCE_INLINE void highbd_sad20x4_core_ds_avx2(__m256i *s, __m256i *r,
                                                         __m256i *sad_acc) {
  const __m256i zero = _mm256_setzero_si256();
  int i;
  for (i = 0; i < 3; i++) {
    s[i] = _mm256_sub_epi16(s[i], r[i]);
    s[i] = _mm256_abs_epi16(s[i]);
  }

  s[0] = _mm256_add_epi16(s[0], s[1]);
  s[0] = _mm256_add_epi16(s[0], s[2]);

  r[0] = _mm256_unpacklo_epi16(s[0], zero);
  r[1] = _mm256_unpackhi_epi16(s[0], zero);

  r[0] = _mm256_add_epi32(r[0], r[1]);
  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
}

static AOM_FORCE_INLINE void highbd_sad12x4_core_ds_avx2(__m256i *s, __m256i *r,
                                                         __m256i *sad_acc) {
  const __m256i zero = _mm256_setzero_si256();
  int i;
  for (i = 0; i < 2; i++) {
    s[i] = _mm256_sub_epi16(s[i], r[i]);
    s[i] = _mm256_abs_epi16(s[i]);
  }

  s[0] = _mm256_add_epi16(s[0], s[1]);

  r[0] = _mm256_unpacklo_epi16(s[0], zero);
  r[1] = _mm256_unpackhi_epi16(s[0], zero);

  r[0] = _mm256_add_epi32(r[0], r[1]);
  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
}

static AOM_FORCE_INLINE void highbd_sad20x4_core_avx2(__m256i *s, __m256i *r,
                                                      __m256i *sad_acc) {
  const __m256i zero = _mm256_setzero_si256();
  int i;
  for (i = 0; i < 5; i++) {
    s[i] = _mm256_sub_epi16(s[i], r[i]);
    s[i] = _mm256_abs_epi16(s[i]);
  }

  s[0] = _mm256_add_epi16(s[0], s[1]);
  s[0] = _mm256_add_epi16(s[0], s[2]);
  s[0] = _mm256_add_epi16(s[0], s[3]);
  s[0] = _mm256_add_epi16(s[0], s[4]);

  r[0] = _mm256_unpacklo_epi16(s[0], zero);
  r[1] = _mm256_unpackhi_epi16(s[0], zero);

  r[0] = _mm256_add_epi32(r[0], r[1]);
  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
}

static AOM_FORCE_INLINE void highbd_sad12x4_core_avx2(__m256i *s, __m256i *r,
                                                      __m256i *sad_acc) {
  const __m256i zero = _mm256_setzero_si256();
  int i;
  for (i = 0; i < 3; i++) {
    s[i] = _mm256_sub_epi16(s[i], r[i]);
    s[i] = _mm256_abs_epi16(s[i]);
  }

  s[0] = _mm256_add_epi16(s[0], s[1]);
  s[0] = _mm256_add_epi16(s[0], s[2]);

  r[0] = _mm256_unpacklo_epi16(s[0], zero);
  r[1] = _mm256_unpackhi_epi16(s[0], zero);

  r[0] = _mm256_add_epi32(r[0], r[1]);
  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
}

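// The *_ds loaders below read rows 0 and 2 of each four-row group (note the
// 2 * stride offsets), i.e. the block is sampled at every other row before
// the absolute differences are accumulated.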
static AOM_FORCE_INLINE void sad16x4_ds(const uint16_t *src_ptr, int src_stride,
                                        const uint16_t *ref_ptr, int ref_stride,
                                        __m256i *sad_acc) {
  __m256i s[2], r[2];
  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));

  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));

  highbd_sad16x4_core_ds_avx2(s, r, sad_acc);
}

static AOM_FORCE_INLINE void sad8x4_ds(const uint16_t *src_ptr, int src_stride,
                                       const uint16_t *ref_ptr, int ref_stride,
                                       __m256i *sad_acc) {
  __m128i tmp[2];
  __m256i tmp2[2];
  __m256i s, r;
  tmp[0] = _mm_loadu_si128((const __m128i *)src_ptr);
  tmp[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
  s = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp[0]), tmp[1], 1);

  tmp[0] = _mm_loadu_si128((const __m128i *)ref_ptr);
  tmp[1] = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
  r = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp[0]), tmp[1], 1);

  const __m256i zero = _mm256_setzero_si256();
  s = _mm256_sub_epi16(s, r);
  s = _mm256_abs_epi16(s);

  tmp2[0] = _mm256_unpacklo_epi16(s, zero);
  tmp2[1] = _mm256_unpackhi_epi16(s, zero);

  tmp2[0] = _mm256_add_epi32(tmp2[0], tmp2[1]);
  *sad_acc = _mm256_add_epi32(*sad_acc, tmp2[0]);
}

// If sec_ptr is NULL, calculate the regular SAD; otherwise calculate the SAD
// against the average of ref and sec.
static AOM_FORCE_INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
                                     const uint16_t *ref_ptr, int ref_stride,
                                     const uint16_t *sec_ptr,
                                     __m256i *sad_acc) {
  __m256i s[4], r[4];
  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));

  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));

  if (sec_ptr) {
    r[0] =
        _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
    r[1] = _mm256_avg_epu16(
        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
    r[2] = _mm256_avg_epu16(
        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
    r[3] = _mm256_avg_epu16(
        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
  }
  highbd_sad16x4_core_avx2(s, r, sad_acc);
}

static AOM_FORCE_INLINE void sad16x2(const uint16_t *src_ptr, int src_stride,
                                     const uint16_t *ref_ptr, int ref_stride,
                                     const uint16_t *sec_ptr,
                                     __m256i *sad_acc) {
  __m256i s[2], r[2];
  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));

  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));

  if (sec_ptr) {
    r[0] =
        _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
    r[1] = _mm256_avg_epu16(
        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
  }
  highbd_sad16x4_core_ds_avx2(s, r, sad_acc);
}

static AOM_FORCE_INLINE void sad16x4_4d(__m256i *s, const uint16_t *ref_ptr,
                                        int ref_stride, __m256i *sad_acc) {
  __m256i r[4];
  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));

  highbd_sad16x4_core_avx2(s, r, sad_acc);
}

static AOM_FORCE_INLINE void sad16x2_4d(__m256i *s, const uint16_t *ref_ptr,
                                        int ref_stride, __m256i *sad_acc) {
  __m256i r[2];
  const __m128i a = _mm_loadu_si128((const __m128i *)ref_ptr);
  const __m128i b = _mm_loadu_si128((const __m128i *)(ref_ptr + ref_stride));
  const __m128i c =
      _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
  const __m128i d =
      _mm_loadu_si128((const __m128i *)(ref_ptr + 3 * ref_stride));

  r[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 0x1);
  r[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(c), d, 0x1);

  highbd_sad16x2_core_avx2(s, r, sad_acc);
}

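// Widths of 12 and 20 do not fill the vector registers evenly: each row is
// loaded as an 8- or 16-wide vector plus four leftover pixels. The _ds
// variants below gather those four pixels with 64-bit loads, while the
// full-rate sad12x4/sad20x4 further down assemble them with
// _mm256_set_epi16.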
static AOM_FORCE_INLINE void sad20x4_ds(const uint16_t *src_ptr, int src_stride,
                                        const uint16_t *ref_ptr, int ref_stride,
                                        __m256i *sad_acc) {
  __m256i s[3], r[3];
  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
  const __m256i s_1 =
      _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(&src_ptr[16])));
  const __m128i s_2 =
      _mm_loadl_epi64((const __m128i *)(&src_ptr[16 + 2 * src_stride]));
  s[2] = _mm256_inserti128_si256(s_1, s_2, 1);
  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
  const __m256i r_1 =
      _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(&ref_ptr[16])));
  const __m128i r_2 =
      _mm_loadl_epi64((const __m128i *)(&ref_ptr[16 + 2 * ref_stride]));
  r[2] = _mm256_inserti128_si256(r_1, r_2, 1);
  highbd_sad20x4_core_ds_avx2(s, r, sad_acc);
}

static AOM_FORCE_INLINE void sad12x4_ds(const uint16_t *src_ptr, int src_stride,
                                        const uint16_t *ref_ptr, int ref_stride,
                                        __m256i *sad_acc) {
  __m128i tmp[2];
  __m256i s[2], r[2];
  tmp[0] = _mm_loadu_si128((const __m128i *)src_ptr);
  tmp[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
  s[0] = _mm256_castsi128_si256(tmp[0]);
  s[0] = _mm256_insertf128_si256(s[0], tmp[1], 1);
  const __m256i s_1 =
      _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(&src_ptr[8])));
  const __m128i s_2 =
      _mm_loadl_epi64((const __m128i *)(&src_ptr[8 + 2 * src_stride]));
  s[1] = _mm256_inserti128_si256(s_1, s_2, 1);

  tmp[0] = _mm_loadu_si128((const __m128i *)ref_ptr);
  tmp[1] = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
  r[0] = _mm256_castsi128_si256(tmp[0]);
  r[0] = _mm256_insertf128_si256(r[0], tmp[1], 1);
  const __m256i r_1 =
      _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(&ref_ptr[8])));
  const __m128i r_2 =
      _mm_loadl_epi64((const __m128i *)(&ref_ptr[8 + 2 * ref_stride]));
  r[1] = _mm256_inserti128_si256(r_1, r_2, 1);

  highbd_sad12x4_core_ds_avx2(s, r, sad_acc);
}

static AOM_FORCE_INLINE void sad20x4(const uint16_t *src_ptr, int src_stride,
                                     const uint16_t *ref_ptr, int ref_stride,
                                     __m256i *sad_acc) {
  __m256i s[5], r[5];
  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
  // Note: _mm256_set_epi16 takes its arguments from the highest lane down, so
  // the four leftover pixels of each row are gathered in reverse order. This
  // is harmless here (and in sad12x4 below) because ref is gathered in the
  // identical order and the SAD reduction does not depend on lane order.
  s[4] = _mm256_set_epi16(
      src_ptr[16], src_ptr[16 + 1], src_ptr[16 + 2], src_ptr[16 + 3],
      src_ptr[16 + src_stride], src_ptr[16 + src_stride + 1],
      src_ptr[16 + src_stride + 2], src_ptr[16 + src_stride + 3],
      src_ptr[16 + 2 * src_stride], src_ptr[16 + 2 * src_stride + 1],
      src_ptr[16 + 2 * src_stride + 2], src_ptr[16 + 2 * src_stride + 3],
      src_ptr[16 + 3 * src_stride], src_ptr[16 + 3 * src_stride + 1],
      src_ptr[16 + 3 * src_stride + 2], src_ptr[16 + 3 * src_stride + 3]);

  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
  r[4] = _mm256_set_epi16(
      ref_ptr[16], ref_ptr[16 + 1], ref_ptr[16 + 2], ref_ptr[16 + 3],
      ref_ptr[16 + ref_stride], ref_ptr[16 + ref_stride + 1],
      ref_ptr[16 + ref_stride + 2], ref_ptr[16 + ref_stride + 3],
      ref_ptr[16 + 2 * ref_stride], ref_ptr[16 + 2 * ref_stride + 1],
      ref_ptr[16 + 2 * ref_stride + 2], ref_ptr[16 + 2 * ref_stride + 3],
      ref_ptr[16 + 3 * ref_stride], ref_ptr[16 + 3 * ref_stride + 1],
      ref_ptr[16 + 3 * ref_stride + 2], ref_ptr[16 + 3 * ref_stride + 3]);

  highbd_sad20x4_core_avx2(s, r, sad_acc);
}

static AOM_FORCE_INLINE void sad12x4(const uint16_t *src_ptr, int src_stride,
                                     const uint16_t *ref_ptr, int ref_stride,
                                     __m256i *sad_acc) {
  __m128i tmp[2];
  __m256i s[3], r[3];
  tmp[0] = _mm_loadu_si128((const __m128i *)src_ptr);
  tmp[1] = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  s[0] = _mm256_castsi128_si256(tmp[0]);
  s[0] = _mm256_insertf128_si256(s[0], tmp[1], 1);

  tmp[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
  tmp[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_stride));
  s[1] = _mm256_castsi128_si256(tmp[0]);
  s[1] = _mm256_insertf128_si256(s[1], tmp[1], 1);
  s[2] = _mm256_set_epi16(
      src_ptr[8], src_ptr[8 + 1], src_ptr[8 + 2], src_ptr[8 + 3],
      src_ptr[8 + src_stride], src_ptr[8 + src_stride + 1],
      src_ptr[8 + src_stride + 2], src_ptr[8 + src_stride + 3],
      src_ptr[8 + 2 * src_stride], src_ptr[8 + 2 * src_stride + 1],
      src_ptr[8 + 2 * src_stride + 2], src_ptr[8 + 2 * src_stride + 3],
      src_ptr[8 + 3 * src_stride], src_ptr[8 + 3 * src_stride + 1],
      src_ptr[8 + 3 * src_stride + 2], src_ptr[8 + 3 * src_stride + 3]);

  tmp[0] = _mm_loadu_si128((const __m128i *)ref_ptr);
  tmp[1] = _mm_loadu_si128((const __m128i *)(ref_ptr + ref_stride));
  r[0] = _mm256_castsi128_si256(tmp[0]);
  r[0] = _mm256_insertf128_si256(r[0], tmp[1], 1);

  tmp[0] = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
  tmp[1] = _mm_loadu_si128((const __m128i *)(ref_ptr + 3 * ref_stride));
  r[1] = _mm256_castsi128_si256(tmp[0]);
  r[1] = _mm256_insertf128_si256(r[1], tmp[1], 1);
  r[2] = _mm256_set_epi16(
      ref_ptr[8], ref_ptr[8 + 1], ref_ptr[8 + 2], ref_ptr[8 + 3],
      ref_ptr[8 + ref_stride], ref_ptr[8 + ref_stride + 1],
      ref_ptr[8 + ref_stride + 2], ref_ptr[8 + ref_stride + 3],
      ref_ptr[8 + 2 * ref_stride], ref_ptr[8 + 2 * ref_stride + 1],
      ref_ptr[8 + 2 * ref_stride + 2], ref_ptr[8 + 2 * ref_stride + 3],
      ref_ptr[8 + 3 * ref_stride], ref_ptr[8 + 3 * ref_stride + 1],
      ref_ptr[8 + 3 * ref_stride + 2], ref_ptr[8 + 3 * ref_stride + 3]);

  highbd_sad12x4_core_avx2(s, r, sad_acc);
}

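// The xN helpers below walk a block four (or two) rows at a time;
// src_stride << 2 advances the pointers by four rows. For example,
// aom_highbd_sad16xN_avx2(8, ...) runs the 16x4 kernel twice before reducing
// the 32-bit accumulator with get_sad_from_mm256_epi32.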
static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(
    int N, const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
    int ref_stride) {
  int i;
  __m256i sad = _mm256_setzero_si256();
  for (i = 0; i < N; i += 4) {
    sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad);
    src_ptr += src_stride << 2;
    ref_ptr += ref_stride << 2;
  }
  return (unsigned int)get_sad_from_mm256_epi32(&sad);
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_2rows_avx2(
    int N, const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
    int ref_stride) {
  __m256i sad = _mm256_setzero_si256();
  for (int i = 0; i < N; i += 2) {
    sad16x2(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad);
    src_ptr += src_stride << 1;
    ref_ptr += ref_stride << 1;
  }
  return (unsigned int)get_sad_from_mm256_epi32(&sad);
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_ds_avx2(
    int N, const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
    int ref_stride) {
  int i;
  __m256i sad = _mm256_setzero_si256();
  for (i = 0; i < N; i += 4) {
    sad16x4_ds(src_ptr, src_stride, ref_ptr, ref_stride, &sad);
    src_ptr += src_stride << 2;
    ref_ptr += ref_stride << 2;
  }
  return (unsigned int)get_sad_from_mm256_epi32(&sad);
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad8xN_ds_avx2(
    int N, const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
    int ref_stride) {
  int i;
  __m256i sad = _mm256_setzero_si256();
  for (i = 0; i < N; i += 4) {
    sad8x4_ds(src_ptr, src_stride, ref_ptr, ref_stride, &sad);
    src_ptr += src_stride << 2;
    ref_ptr += ref_stride << 2;
  }
  return (unsigned int)get_sad_from_mm256_epi32(&sad);
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad20xN_ds_avx2(
    int N, const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
    int ref_stride) {
  int i;
  __m256i sad = _mm256_setzero_si256();
  for (i = 0; i < N; i += 4) {
    sad20x4_ds(src_ptr, src_stride, ref_ptr, ref_stride, &sad);
    src_ptr += src_stride << 2;
    ref_ptr += ref_stride << 2;
  }
  return (unsigned int)get_sad_from_mm256_epi32(&sad);
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad12xN_ds_avx2(
    int N, const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
    int ref_stride) {
  int i;
  __m256i sad = _mm256_setzero_si256();
  for (i = 0; i < N; i += 4) {
    sad12x4_ds(src_ptr, src_stride, ref_ptr, ref_stride, &sad);
    src_ptr += src_stride << 2;
    ref_ptr += ref_stride << 2;
  }
  return (unsigned int)get_sad_from_mm256_epi32(&sad);
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad20xN_avx2(
    int N, const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
    int ref_stride) {
  int i;
  __m256i sad = _mm256_setzero_si256();
  for (i = 0; i < N; i += 4) {
    sad20x4(src_ptr, src_stride, ref_ptr, ref_stride, &sad);
    src_ptr += src_stride << 2;
    ref_ptr += ref_stride << 2;
  }
  return (unsigned int)get_sad_from_mm256_epi32(&sad);
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad12xN_avx2(
    int N, const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
    int ref_stride) {
  int i;
  __m256i sad = _mm256_setzero_si256();
  for (i = 0; i < N; i += 4) {
    sad12x4(src_ptr, src_stride, ref_ptr, ref_stride, &sad);
    src_ptr += src_stride << 2;
    ref_ptr += ref_stride << 2;
  }
  return (unsigned int)get_sad_from_mm256_epi32(&sad);
}

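// A 32-pixel row spans two 256-bit registers, so a pair of rows fills the
// four s[]/r[] vectors expected by highbd_sad16x4_core_avx2; sad32x4 below
// therefore covers four rows by running that core twice.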
static AOM_FORCE_INLINE void sad32x4(const uint16_t *src_ptr, int src_stride,
                                     const uint16_t *ref_ptr, int ref_stride,
                                     const uint16_t *sec_ptr,
                                     __m256i *sad_acc) {
  __m256i s[4], r[4];
  int row_sections = 0;

  while (row_sections < 2) {
    s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
    s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
    s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
    s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));

    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
    r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
    r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
    r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));

    if (sec_ptr) {
      r[0] =
          _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
      r[1] = _mm256_avg_epu16(
          r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
      r[2] = _mm256_avg_epu16(
          r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
      r[3] = _mm256_avg_epu16(
          r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
      sec_ptr += 32 << 1;
    }
    highbd_sad16x4_core_avx2(s, r, sad_acc);

    row_sections += 1;
    src_ptr += src_stride << 1;
    ref_ptr += ref_stride << 1;
  }
}

static AOM_FORCE_INLINE void sad32x2(const uint16_t *src_ptr, int src_stride,
                                     const uint16_t *ref_ptr, int ref_stride,
                                     const uint16_t *sec_ptr,
                                     __m256i *sad_acc) {
  __m256i s[4], r[4];

  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));

  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));

  if (sec_ptr) {
    r[0] =
        _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
    r[1] = _mm256_avg_epu16(
        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
    r[2] = _mm256_avg_epu16(
        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
    r[3] = _mm256_avg_epu16(
        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
  }
  highbd_sad16x4_core_avx2(s, r, sad_acc);
}

static AOM_FORCE_INLINE void sad32x4_4d(__m256i *s, const uint16_t *ref_ptr,
                                        int ref_stride, __m256i *sad_acc) {
  __m256i r[4];
  int row_sections = 0;

  while (row_sections < 2) {
    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
    r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
    r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
    r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));

    highbd_sad16x4_core_avx2(s + 4 * row_sections, r, sad_acc);
    row_sections += 1;
    ref_ptr += ref_stride << 1;
  }
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(
    int N, const uint16_t *src, int src_stride, const uint16_t *ref,
    int ref_stride) {
  __m256i sad = _mm256_setzero_si256();
  const int left_shift = 2;
  int i;

  for (i = 0; i < N; i += 4) {
    sad32x4(src, src_stride, ref, ref_stride, NULL, &sad);
    src += src_stride << left_shift;
    ref += ref_stride << left_shift;
  }
  return get_sad_from_mm256_epi32(&sad);
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_2rows_avx2(
    int N, const uint16_t *src, int src_stride, const uint16_t *ref,
    int ref_stride) {
  __m256i sad = _mm256_setzero_si256();
  const int left_shift = 1;

  for (int i = 0; i < N; i += 2) {
    sad32x2(src, src_stride, ref, ref_stride, NULL, &sad);
    src += src_stride << left_shift;
    ref += ref_stride << left_shift;
  }
  return get_sad_from_mm256_epi32(&sad);
}

static AOM_FORCE_INLINE void sad64x2(const uint16_t *src_ptr, int src_stride,
                                     const uint16_t *ref_ptr, int ref_stride,
                                     const uint16_t *sec_ptr,
                                     __m256i *sad_acc) {
  __m256i s[4], r[4];
  int i;
  for (i = 0; i < 2; i++) {
    s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
    s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
    s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
    s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));

    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
    r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
    r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
    r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
    if (sec_ptr) {
      r[0] =
          _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
      r[1] = _mm256_avg_epu16(
          r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
      r[2] = _mm256_avg_epu16(
          r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
      r[3] = _mm256_avg_epu16(
          r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
      sec_ptr += 64;
    }
    highbd_sad16x4_core_avx2(s, r, sad_acc);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
}

static AOM_FORCE_INLINE void sad64x2_4d(__m256i *s, const uint16_t *ref_ptr,
                                        int ref_stride, __m256i *sad_acc) {
  __m256i r[4];
  int i;
  for (i = 0; i < 2; i++) {
    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
    r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
    r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
    r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));

    highbd_sad16x4_core_avx2(s + 4 * i, r, sad_acc);
    ref_ptr += ref_stride;
  }
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(
    int N, const uint16_t *src, int src_stride, const uint16_t *ref,
    int ref_stride) {
  __m256i sad = _mm256_setzero_si256();
  const int left_shift = 1;
  int i;
  for (i = 0; i < N; i += 2) {
    sad64x2(src, src_stride, ref, ref_stride, NULL, &sad);
    src += src_stride << left_shift;
    ref += ref_stride << left_shift;
  }
  return get_sad_from_mm256_epi32(&sad);
}

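// MAKE_SAD_WX1(w) generates a sadWx1 helper that accumulates one row of
// width w in 64-pixel chunks (four 256-bit vectors per chunk), optionally
// averaging the reference with sec_ptr first. The assert restricts w to
// multiples of 64; this file instantiates w = 128 and w = 256.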
#define MAKE_SAD_WX1(w)                                                        \
  static AOM_FORCE_INLINE void sad##w##x1(                                     \
      const uint16_t *src_ptr, const uint16_t *ref_ptr,                        \
      const uint16_t *sec_ptr, __m256i *sad_acc) {                             \
    assert(w % 64 == 0);                                                       \
    __m256i s[4], r[4];                                                        \
    int i;                                                                     \
    for (i = 0; i < w / 64; i++) {                                             \
      s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);                     \
      s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));              \
      s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));              \
      s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));              \
      r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);                     \
      r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));              \
      r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));              \
      r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));              \
      if (sec_ptr) {                                                           \
        r[0] = _mm256_avg_epu16(r[0],                                          \
                                _mm256_loadu_si256((const __m256i *)sec_ptr)); \
        r[1] = _mm256_avg_epu16(                                               \
            r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));        \
        r[2] = _mm256_avg_epu16(                                               \
            r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));        \
        r[3] = _mm256_avg_epu16(                                               \
            r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));        \
        sec_ptr += 64;                                                         \
      }                                                                        \
      highbd_sad16x4_core_avx2(s, r, sad_acc);                                 \
      src_ptr += 64;                                                           \
      ref_ptr += 64;                                                           \
    }                                                                          \
  }

MAKE_SAD_WX1(128);
MAKE_SAD_WX1(256);

#define MAKE_SAD_WX1_4D(w)                                        \
  static AOM_FORCE_INLINE void sad##w##x1_4d(                     \
      __m256i *s, const uint16_t *ref_ptr, __m256i *sad_acc) {    \
    assert(w % 64 == 0);                                          \
    __m256i r[4];                                                 \
    int i;                                                        \
    for (i = 0; i < w / 64; i++) {                                \
      r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);        \
      r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); \
      r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); \
      r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); \
      highbd_sad16x4_core_avx2(s + i * 4, r, sad_acc);            \
      ref_ptr += 64;                                              \
    }                                                             \
  }

MAKE_SAD_WX1_4D(128);
MAKE_SAD_WX1_4D(256);

static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2(
    int N, const uint16_t *src, int src_stride, const uint16_t *ref,
    int ref_stride) {
  __m256i sad = _mm256_setzero_si256();
  int row = 0;
  while (row < N) {
    sad128x1(src, ref, NULL, &sad);
    src += src_stride;
    ref += ref_stride;
    row++;
  }
  return get_sad_from_mm256_epi32(&sad);
}

static AOM_FORCE_INLINE unsigned int aom_highbd_sad256xN_avx2(
    int N, const uint16_t *src, int src_stride, const uint16_t *ref,
    int ref_stride) {
  __m256i sad = _mm256_setzero_si256();
  int row = 0;
  while (row < N) {
    sad256x1(src, ref, NULL, &sad);
    src += src_stride;
    ref += ref_stride;
    row++;
  }
  return get_sad_from_mm256_epi32(&sad);
}

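// The macros below stamp out the exported entry points. For example,
// highbd_sadMxN_avx2(16, 8) expands to:
//   unsigned int aom_highbd_sad16x8_avx2(const uint16_t *src, int src_stride,
//                                        const uint16_t *ref, int ref_stride) {
//     return aom_highbd_sad16xN_avx2(8, src, src_stride, ref, ref_stride);
//   }
// The skip variants halve the row count, double the strides, and scale the
// result by 2 to approximate the full SAD from half the rows.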
#define highbd_sadMxN_avx2(m, n)                                            \
  unsigned int aom_highbd_sad##m##x##n##_avx2(                              \
      const uint16_t *src, int src_stride, const uint16_t *ref,             \
      int ref_stride) {                                                     \
    return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \
  }

#define highbd_sadMxN_ds_avx2(m, n)                                            \
  unsigned int aom_highbd_sad##m##x##n##_ds_avx2(                              \
      const uint16_t *src, int src_stride, const uint16_t *ref,                \
      int ref_stride) {                                                        \
    return aom_highbd_sad##m##xN_ds_avx2(n, src, src_stride, ref, ref_stride); \
  }

#define highbd_sad_skip_MxN_avx2(m, n)                                       \
  unsigned int aom_highbd_sad_skip_##m##x##n##_avx2(                         \
      const uint16_t *src, int src_stride, const uint16_t *ref,              \
      int ref_stride) {                                                      \
    return 2 * aom_highbd_sad##m##xN_avx2((n / 2), src, 2 * src_stride, ref, \
                                          2 * ref_stride);                   \
  }

// Handle height 4 cases where only 2 rows are processed.
#define highbd_sad_skip_MxN_2rows_avx2(m, n)                                  \
  unsigned int aom_highbd_sad_skip_##m##x##n##_avx2(                          \
      const uint16_t *src, int src_stride, const uint16_t *ref,               \
      int ref_stride) {                                                       \
    return 2 * aom_highbd_sad##m##xN_2rows_avx2((n / 2), src, 2 * src_stride, \
                                                ref, 2 * ref_stride);         \
  }

highbd_sadMxN_avx2(16, 4);
highbd_sadMxN_avx2(16, 8);
highbd_sadMxN_avx2(16, 16);
highbd_sadMxN_avx2(16, 32);
highbd_sadMxN_avx2(16, 64);

highbd_sadMxN_ds_avx2(16, 8);
highbd_sadMxN_ds_avx2(16, 16);
highbd_sadMxN_ds_avx2(8, 8);
highbd_sadMxN_ds_avx2(8, 16);
highbd_sadMxN_ds_avx2(12, 20);
highbd_sadMxN_ds_avx2(20, 12);
highbd_sadMxN_ds_avx2(12, 12);
highbd_sadMxN_ds_avx2(20, 20);

highbd_sadMxN_avx2(20, 20);
highbd_sadMxN_avx2(20, 12);
highbd_sadMxN_avx2(12, 20);
highbd_sadMxN_avx2(12, 12);

highbd_sadMxN_avx2(32, 4);
highbd_sadMxN_avx2(32, 8);
highbd_sadMxN_avx2(32, 16);
highbd_sadMxN_avx2(32, 32);
highbd_sadMxN_avx2(32, 64);

highbd_sadMxN_avx2(64, 4);
highbd_sadMxN_avx2(64, 8);
highbd_sadMxN_avx2(64, 16);
highbd_sadMxN_avx2(64, 32);
highbd_sadMxN_avx2(64, 64);
highbd_sadMxN_avx2(64, 128);

highbd_sadMxN_avx2(128, 64);
highbd_sadMxN_avx2(128, 128);
highbd_sadMxN_avx2(128, 256);

highbd_sadMxN_avx2(256, 128);
highbd_sadMxN_avx2(256, 256);

highbd_sad_skip_MxN_2rows_avx2(16, 4);
highbd_sad_skip_MxN_avx2(16, 8);
highbd_sad_skip_MxN_avx2(16, 16);
highbd_sad_skip_MxN_avx2(16, 32);
highbd_sad_skip_MxN_avx2(16, 64);

highbd_sad_skip_MxN_2rows_avx2(32, 4);
highbd_sad_skip_MxN_avx2(32, 8);
highbd_sad_skip_MxN_avx2(32, 16);
highbd_sad_skip_MxN_avx2(32, 32);
highbd_sad_skip_MxN_avx2(32, 64);

highbd_sad_skip_MxN_avx2(64, 4);
highbd_sad_skip_MxN_avx2(64, 8);
highbd_sad_skip_MxN_avx2(64, 16);
highbd_sad_skip_MxN_avx2(64, 32);
highbd_sad_skip_MxN_avx2(64, 64);
highbd_sad_skip_MxN_avx2(64, 128);

highbd_sad_skip_MxN_avx2(128, 64);
highbd_sad_skip_MxN_avx2(128, 128);
highbd_sad_skip_MxN_avx2(128, 256);

highbd_sad_skip_MxN_avx2(256, 128);
highbd_sad_skip_MxN_avx2(256, 256);

unsigned int aom_highbd_sad16x4_avg_avx2(const uint16_t *src, int src_stride,
                                         const uint16_t *ref, int ref_stride,
                                         const uint16_t *second_pred) {
  __m256i sad = _mm256_setzero_si256();
  sad16x4(src, src_stride, ref, ref_stride, second_pred, &sad);

  return get_sad_from_mm256_epi32(&sad);
}

unsigned int aom_highbd_sad16x8_avg_avx2(const uint16_t *src, int src_stride,
                                         const uint16_t *ref, int ref_stride,
                                         const uint16_t *second_pred) {
  __m256i sad = _mm256_setzero_si256();

  sad16x4(src, src_stride, ref, ref_stride, second_pred, &sad);

  // Next 4 rows
  src += src_stride << 2;
  ref += ref_stride << 2;
  second_pred += 64;
  sad16x4(src, src_stride, ref, ref_stride, second_pred, &sad);
  return get_sad_from_mm256_epi32(&sad);
}

unsigned int aom_highbd_sad16x16_avg_avx2(const uint16_t *src, int src_stride,
                                          const uint16_t *ref, int ref_stride,
                                          const uint16_t *second_pred) {
  const int left_shift = 3;
  uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
                                             second_pred);
  src += src_stride << left_shift;
  ref += ref_stride << left_shift;
  second_pred += 16 << left_shift;
  sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
                                     second_pred);
  return sum;
}

unsigned int aom_highbd_sad16x32_avg_avx2(const uint16_t *src, int src_stride,
                                          const uint16_t *ref, int ref_stride,
                                          const uint16_t *second_pred) {
  const int left_shift = 4;
  uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
                                              second_pred);
  src += src_stride << left_shift;
  ref += ref_stride << left_shift;
  second_pred += 16 << left_shift;
  sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
                                      second_pred);
  return sum;
}

unsigned int aom_highbd_sad16x64_avg_avx2(const uint16_t *src, int src_stride,
                                          const uint16_t *ref, int ref_stride,
                                          const uint16_t *second_pred) {
  const int left_shift = 5;
  uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride,
                                              second_pred);
  src += src_stride << left_shift;
  ref += ref_stride << left_shift;
  second_pred += 16 << left_shift;
  sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride,
                                      second_pred);
  return sum;
}

unsigned int aom_highbd_sad32x8_avg_avx2(const uint16_t *src, int src_stride,
                                         const uint16_t *ref, int ref_stride,
                                         const uint16_t *second_pred) {
  __m256i sad = _mm256_setzero_si256();
  const int left_shift = 2;
  int row_section = 0;

  while (row_section < 2) {
    sad32x4(src, src_stride, ref, ref_stride, second_pred, &sad);
    src += src_stride << left_shift;
    ref += ref_stride << left_shift;
    second_pred += 32 << left_shift;
    row_section += 1;
  }
  return get_sad_from_mm256_epi32(&sad);
}

unsigned int aom_highbd_sad32x16_avg_avx2(const uint16_t *src, int src_stride,
                                          const uint16_t *ref, int ref_stride,
                                          const uint16_t *second_pred) {
  __m256i sad = _mm256_setzero_si256();
  const int left_shift = 2;
  int row_section = 0;

  while (row_section < 4) {
    sad32x4(src, src_stride, ref, ref_stride, second_pred, &sad);
    src += src_stride << left_shift;
    ref += ref_stride << left_shift;
    second_pred += 32 << left_shift;
    row_section += 1;
  }
  return get_sad_from_mm256_epi32(&sad);
}

unsigned int aom_highbd_sad32x32_avg_avx2(const uint16_t *src, int src_stride,
                                          const uint16_t *ref, int ref_stride,
                                          const uint16_t *second_pred) {
  const int left_shift = 4;
  uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
                                              second_pred);
  src += src_stride << left_shift;
  ref += ref_stride << left_shift;
  second_pred += 32 << left_shift;
  sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
                                      second_pred);
  return sum;
}

unsigned int aom_highbd_sad32x64_avg_avx2(const uint16_t *src, int src_stride,
                                          const uint16_t *ref, int ref_stride,
                                          const uint16_t *second_pred) {
  const int left_shift = 5;
  uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
                                              second_pred);
  src += src_stride << left_shift;
  ref += ref_stride << left_shift;
  second_pred += 32 << left_shift;
  sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
                                      second_pred);
  return sum;
}

unsigned int aom_highbd_sad64x16_avg_avx2(const uint16_t *src, int src_stride,
                                          const uint16_t *ref, int ref_stride,
                                          const uint16_t *second_pred) {
  __m256i sad = _mm256_setzero_si256();
  const int left_shift = 1;
  int row_section = 0;

  while (row_section < 8) {
    sad64x2(src, src_stride, ref, ref_stride, second_pred, &sad);
    src += src_stride << left_shift;
    ref += ref_stride << left_shift;
    second_pred += 64 << left_shift;
    row_section += 1;
  }
  return get_sad_from_mm256_epi32(&sad);
}

unsigned int aom_highbd_sad64x32_avg_avx2(const uint16_t *src, int src_stride,
                                          const uint16_t *ref, int ref_stride,
                                          const uint16_t *second_pred) {
  __m256i sad = _mm256_setzero_si256();
  const int left_shift = 1;
  int row_section = 0;

  while (row_section < 16) {
    sad64x2(src, src_stride, ref, ref_stride, second_pred, &sad);
    src += src_stride << left_shift;
    ref += ref_stride << left_shift;
    second_pred += 64 << left_shift;
    row_section += 1;
  }
  return get_sad_from_mm256_epi32(&sad);
}

unsigned int aom_highbd_sad64x64_avg_avx2(const uint16_t *src, int src_stride,
                                          const uint16_t *ref, int ref_stride,
                                          const uint16_t *second_pred) {
  const int left_shift = 5;
  uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
                                              second_pred);
  src += src_stride << left_shift;
  ref += ref_stride << left_shift;
  second_pred += 64 << left_shift;
  sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
                                      second_pred);
  return sum;
}

unsigned int aom_highbd_sad64x128_avg_avx2(const uint16_t *src, int src_stride,
                                           const uint16_t *ref, int ref_stride,
                                           const uint16_t *second_pred) {
  const int left_shift = 6;
  uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
                                              second_pred);
  src += src_stride << left_shift;
  ref += ref_stride << left_shift;
  second_pred += 64 << left_shift;
  sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
                                      second_pred);
  return sum;
}

unsigned int aom_highbd_sad128x64_avg_avx2(const uint16_t *src, int src_stride,
                                           const uint16_t *ref, int ref_stride,
                                           const uint16_t *second_pred) {
  __m256i sad = _mm256_setzero_si256();
  int row = 0;
  while (row < 64) {
    sad128x1(src, ref, second_pred, &sad);
    src += src_stride;
    ref += ref_stride;
    second_pred += 16 << 3;
    row += 1;
  }
  return get_sad_from_mm256_epi32(&sad);
}

unsigned int aom_highbd_sad128x128_avg_avx2(const uint16_t *src, int src_stride,
                                            const uint16_t *ref, int ref_stride,
                                            const uint16_t *second_pred) {
  unsigned int sum;
  const int left_shift = 6;

  sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
                                      second_pred);
  src += src_stride << left_shift;
  ref += ref_stride << left_shift;
  second_pred += 128 << left_shift;
  sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
                                       second_pred);
  return sum;
}

unsigned int aom_highbd_sad128x256_avg_avx2(const uint16_t *src, int src_stride,
                                            const uint16_t *ref, int ref_stride,
                                            const uint16_t *second_pred) {
  unsigned int sum;
  const int left_shift = 7;

  sum = aom_highbd_sad128x128_avg_avx2(src, src_stride, ref, ref_stride,
                                       second_pred);
  src += src_stride << left_shift;
  ref += ref_stride << left_shift;
  second_pred += 128 << left_shift;
  sum += aom_highbd_sad128x128_avg_avx2(src, src_stride, ref, ref_stride,
                                        second_pred);
  return sum;
}

unsigned int aom_highbd_sad256x128_avg_avx2(const uint16_t *src, int src_stride,
                                            const uint16_t *ref, int ref_stride,
                                            const uint16_t *second_pred) {
  __m256i sad = _mm256_setzero_si256();
  int row = 0;
  while (row < 128) {
    sad256x1(src, ref, second_pred, &sad);
    src += src_stride;
    ref += ref_stride;
    second_pred += 256;
    row += 1;
  }
  return get_sad_from_mm256_epi32(&sad);
}

unsigned int aom_highbd_sad256x256_avg_avx2(const uint16_t *src, int src_stride,
                                            const uint16_t *ref, int ref_stride,
                                            const uint16_t *second_pred) {
  unsigned int sum;
  const int left_shift = 7;

  sum = aom_highbd_sad256x128_avg_avx2(src, src_stride, ref, ref_stride,
                                       second_pred);
  src += src_stride << left_shift;
  ref += ref_stride << left_shift;
  second_pred += 256 << left_shift;
  sum += aom_highbd_sad256x128_avg_avx2(src, src_stride, ref, ref_stride,
                                        second_pred);
  return sum;
}

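// An informal lane walk-through of get_4d_sad_from_mm256_epi32 below (lane
// labels are illustrative): v[k] holds eight 32-bit partial sums a0..a7 for
// reference k. The srli/add/mask steps leave a0+a1, a2+a3, a4+a5, a6+a7 in
// the even 32-bit lanes; the slli/or pairs interleave two references per
// register; the unpacklo/unpackhi plus add folds the pairs; and the final
// 128-bit add produces {sad0, sad1, sad2, sad3}, which is stored to res.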
// SAD 4D
// Combine four __m256i input vectors v into uint32_t res[4].
static AOM_FORCE_INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
                                                         uint32_t *res) {
  __m256i u0, u1, u2, u3;
  const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
  __m128i sad;

  // Eight 32-bit partial sums per reference.
  u0 = _mm256_srli_si256(v[0], 4);
  u1 = _mm256_srli_si256(v[1], 4);
  u2 = _mm256_srli_si256(v[2], 4);
  u3 = _mm256_srli_si256(v[3], 4);

  u0 = _mm256_add_epi32(u0, v[0]);
  u1 = _mm256_add_epi32(u1, v[1]);
  u2 = _mm256_add_epi32(u2, v[2]);
  u3 = _mm256_add_epi32(u3, v[3]);

  u0 = _mm256_and_si256(u0, mask);
  u1 = _mm256_and_si256(u1, mask);
  u2 = _mm256_and_si256(u2, mask);
  u3 = _mm256_and_si256(u3, mask);
  // Four 32-bit sums per reference, in the even lanes.

  u1 = _mm256_slli_si256(u1, 4);
  u3 = _mm256_slli_si256(u3, 4);

  u0 = _mm256_or_si256(u0, u1);
  u2 = _mm256_or_si256(u2, u3);
  // Eight 32-bit sums, two references interleaved per register.

  u1 = _mm256_unpacklo_epi64(u0, u2);
  u3 = _mm256_unpackhi_epi64(u0, u2);

  u0 = _mm256_add_epi32(u1, u3);
  sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1),
                      _mm256_castsi256_si128(u0));
  _mm_storeu_si128((__m128i *)res, sad);
}

static AOM_FORCE_INLINE void init_sad(__m256i *s) {
  s[0] = _mm256_setzero_si256();
  s[1] = _mm256_setzero_si256();
  s[2] = _mm256_setzero_si256();
  s[3] = _mm256_setzero_si256();
}

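// Each x4d helper evaluates one source block against four reference blocks
// (ref_array[0..3]), keeping a separate 32-bit accumulator per reference in
// sad_vec[]. For 8-wide blocks, two 8-pixel rows are packed into a single
// 256-bit register so the shared 16-lane cores can be reused.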
static AOM_FORCE_INLINE void aom_highbd_sad8xNx4d_avx2(
    int N, const uint16_t *src, int src_stride,
    const uint16_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
  __m256i sad_vec[4];
  __m256i s[2];
  int i, j;

  init_sad(sad_vec);

  const uint16_t *src16 = src;
  for (j = 0; j < N; j += 4) {
    const __m128i a = _mm_loadu_si128((const __m128i *)(src16));
    const __m128i b = _mm_loadu_si128((const __m128i *)(src16 + src_stride));
    const __m128i c =
        _mm_loadu_si128((const __m128i *)(src16 + 2 * src_stride));
    const __m128i d =
        _mm_loadu_si128((const __m128i *)(src16 + 3 * src_stride));

    s[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 0x1);
    s[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(c), d, 0x1);

    for (i = 0; i < 4; ++i) {
      sad16x2_4d(s, ref_array[i] + j * ref_stride, ref_stride, &sad_vec[i]);
    }
    src16 += 4 * src_stride;
  }
  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
}

static AOM_FORCE_INLINE void aom_highbd_sad16xNx4d_2rows_avx2(
    int N, const uint16_t *src, int src_stride,
    const uint16_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
  __m256i sad_vec[4];
  const int shift_for_2_rows = 1;

  init_sad(sad_vec);

  for (int i = 0; i < 4; ++i) {
    const uint16_t *srcp = src;
    const uint16_t *refp = ref_array[i];

    for (int j = 0; j < N; j += 2) {
      sad16x2(srcp, src_stride, refp, ref_stride, 0, &sad_vec[i]);
      srcp += src_stride << shift_for_2_rows;
      refp += ref_stride << shift_for_2_rows;
    }
  }
  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
}

static AOM_FORCE_INLINE void aom_highbd_sad16xNx4d_avx2(
    int N, const uint16_t *src, int src_stride,
    const uint16_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
  __m256i sad_vec[4];
  __m256i s[4];
  int i, j;

  init_sad(sad_vec);

  const uint16_t *src16 = src;
  for (j = 0; j < N; j += 4) {
    s[0] = _mm256_loadu_si256((const __m256i *)(src16));
    s[1] = _mm256_loadu_si256((const __m256i *)(src16 + src_stride));
    s[2] = _mm256_loadu_si256((const __m256i *)(src16 + 2 * src_stride));
    s[3] = _mm256_loadu_si256((const __m256i *)(src16 + 3 * src_stride));
    for (i = 0; i < 4; ++i) {
      sad16x4_4d(s, ref_array[i] + j * ref_stride, ref_stride, &sad_vec[i]);
    }
    src16 += 4 * src_stride;
  }
  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
}

static AOM_FORCE_INLINE void aom_highbd_sad32xNx4d_avx2(
    int N, const uint16_t *src, int src_stride,
    const uint16_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
  __m256i sad_vec[4];
  __m256i s[8];
  int i, j;

  init_sad(sad_vec);

  const uint16_t *src16 = src;
  for (j = 0; j < N; j += 4) {
    s[0] = _mm256_loadu_si256((const __m256i *)(src16));
    s[1] = _mm256_loadu_si256((const __m256i *)(src16 + 16));
    src16 += src_stride;
    s[2] = _mm256_loadu_si256((const __m256i *)(src16));
    s[3] = _mm256_loadu_si256((const __m256i *)(src16 + 16));
    src16 += src_stride;
    s[4] = _mm256_loadu_si256((const __m256i *)(src16));
    s[5] = _mm256_loadu_si256((const __m256i *)(src16 + 16));
    src16 += src_stride;
    s[6] = _mm256_loadu_si256((const __m256i *)(src16));
    s[7] = _mm256_loadu_si256((const __m256i *)(src16 + 16));
    for (i = 0; i < 4; ++i) {
      sad32x4_4d(s, ref_array[i] + j * ref_stride, ref_stride, &sad_vec[i]);
    }
    src16 += src_stride;
  }
  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
}

static AOM_FORCE_INLINE void aom_highbd_sad32xNx4d_2rows_avx2(
    int N, const uint16_t *src, int src_stride,
    const uint16_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
  __m256i sad_vec[4];
  const int shift_for_2_rows = 1;

  init_sad(sad_vec);

  for (int i = 0; i < 4; ++i) {
    const uint16_t *srcp = src;
    const uint16_t *refp = ref_array[i];

    for (int r = 0; r < N; r += 2) {
      sad32x2(srcp, src_stride, refp, ref_stride, 0, &sad_vec[i]);
      srcp += src_stride << shift_for_2_rows;
      refp += ref_stride << shift_for_2_rows;
    }
  }
  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
}

static AOM_FORCE_INLINE void aom_highbd_sad64xNx4d_avx2(
    int N, const uint16_t *src, int src_stride,
    const uint16_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
  __m256i sad_vec[4];
  __m256i s[8];
  int i, j;

  init_sad(sad_vec);

  const uint16_t *src16 = src;
  for (j = 0; j < N; j += 2) {
    s[0] = _mm256_loadu_si256((const __m256i *)(src16));
    s[1] = _mm256_loadu_si256((const __m256i *)(src16 + 16));
    s[2] = _mm256_loadu_si256((const __m256i *)(src16 + 32));
    s[3] = _mm256_loadu_si256((const __m256i *)(src16 + 48));
    src16 += src_stride;
    s[4] = _mm256_loadu_si256((const __m256i *)(src16));
    s[5] = _mm256_loadu_si256((const __m256i *)(src16 + 16));
    s[6] = _mm256_loadu_si256((const __m256i *)(src16 + 32));
    s[7] = _mm256_loadu_si256((const __m256i *)(src16 + 48));
    for (i = 0; i < 4; ++i) {
      sad64x2_4d(s, ref_array[i] + j * ref_stride, ref_stride, &sad_vec[i]);
    }
    src16 += src_stride;
  }
  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
}

static AOM_FORCE_INLINE void aom_highbd_sad128xNx4d_avx2(
    int N, const uint16_t *src, int src_stride,
    const uint16_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
  __m256i sad_vec[4];
  __m256i s[8];
  int i, j;

  init_sad(sad_vec);

  const uint16_t *src16 = src;
  for (j = 0; j < N; j++) {
    s[0] = _mm256_loadu_si256((const __m256i *)(src16));
    s[1] = _mm256_loadu_si256((const __m256i *)(src16 + 16));
    s[2] = _mm256_loadu_si256((const __m256i *)(src16 + 32));
    s[3] = _mm256_loadu_si256((const __m256i *)(src16 + 48));
    s[4] = _mm256_loadu_si256((const __m256i *)(src16 + 64));
    s[5] = _mm256_loadu_si256((const __m256i *)(src16 + 80));
    s[6] = _mm256_loadu_si256((const __m256i *)(src16 + 96));
    s[7] = _mm256_loadu_si256((const __m256i *)(src16 + 112));
    for (i = 0; i < 4; ++i) {
      sad128x1_4d(s, ref_array[i] + j * ref_stride, &sad_vec[i]);
    }
    src16 += src_stride;
  }
  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
}

static AOM_FORCE_INLINE void aom_highbd_sad256xNx4d_avx2(
    int N, const uint16_t *src, int src_stride,
    const uint16_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
  __m256i sad_vec[4];
  __m256i s[16];
  int i, j;

  init_sad(sad_vec);

  const uint16_t *src16 = src;
  for (j = 0; j < N; j++) {
    s[0] = _mm256_loadu_si256((const __m256i *)(src16));
    s[1] = _mm256_loadu_si256((const __m256i *)(src16 + 16));
    s[2] = _mm256_loadu_si256((const __m256i *)(src16 + 32));
    s[3] = _mm256_loadu_si256((const __m256i *)(src16 + 48));
    s[4] = _mm256_loadu_si256((const __m256i *)(src16 + 64));
    s[5] = _mm256_loadu_si256((const __m256i *)(src16 + 80));
    s[6] = _mm256_loadu_si256((const __m256i *)(src16 + 96));
    s[7] = _mm256_loadu_si256((const __m256i *)(src16 + 112));
    s[8] = _mm256_loadu_si256((const __m256i *)(src16 + 128));
    s[9] = _mm256_loadu_si256((const __m256i *)(src16 + 144));
    s[10] = _mm256_loadu_si256((const __m256i *)(src16 + 160));
    s[11] = _mm256_loadu_si256((const __m256i *)(src16 + 176));
    s[12] = _mm256_loadu_si256((const __m256i *)(src16 + 192));
    s[13] = _mm256_loadu_si256((const __m256i *)(src16 + 208));
    s[14] = _mm256_loadu_si256((const __m256i *)(src16 + 224));
    s[15] = _mm256_loadu_si256((const __m256i *)(src16 + 240));
    for (i = 0; i < 4; ++i) {
      sad256x1_4d(s, ref_array[i] + j * ref_stride, &sad_vec[i]);
    }
    src16 += src_stride;
  }
  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
}

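// As with the single-reference skip wrappers above, the skip x4d macros
// evaluate only every other row (n / 2 rows at doubled strides) and then
// double each of the four results to approximate the full-height SAD.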
#define highbd_sadMxNx4d_avx2(m, n)                                           \
  void aom_highbd_sad##m##x##n##x4d_avx2(                                     \
      const uint16_t *src, int src_stride, const uint16_t *const ref_array[], \
      int ref_stride, uint32_t *sad_array) {                                  \
    aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride,  \
                                  sad_array);                                 \
  }

#define highbd_sad_skip_MxNx4d_avx2(m, n)                                     \
  void aom_highbd_sad_skip_##m##x##n##x4d_avx2(                               \
      const uint16_t *src, int src_stride, const uint16_t *const ref_array[], \
      int ref_stride, uint32_t *sad_array) {                                  \
    aom_highbd_sad##m##xNx4d_avx2((n / 2), src, 2 * src_stride, ref_array,    \
                                  2 * ref_stride, sad_array);                 \
    sad_array[0] <<= 1;                                                       \
    sad_array[1] <<= 1;                                                       \
    sad_array[2] <<= 1;                                                       \
    sad_array[3] <<= 1;                                                       \
  }

// Handle height 4 cases where only 2 rows are processed.
#define highbd_sad_skip_MxNx4d_2rows_avx2(m, n)                                \
  void aom_highbd_sad_skip_##m##x##n##x4d_avx2(                                \
      const uint16_t *src, int src_stride, const uint16_t *const ref_array[],  \
      int ref_stride, uint32_t *sad_array) {                                   \
    aom_highbd_sad##m##xNx4d_2rows_avx2((n / 2), src, 2 * src_stride,          \
                                        ref_array, 2 * ref_stride, sad_array); \
    sad_array[0] <<= 1;                                                        \
    sad_array[1] <<= 1;                                                        \
    sad_array[2] <<= 1;                                                        \
    sad_array[3] <<= 1;                                                        \
  }

highbd_sadMxNx4d_avx2(8, 4);
highbd_sadMxNx4d_avx2(8, 8);
highbd_sadMxNx4d_avx2(8, 16);
highbd_sadMxNx4d_avx2(8, 32);
highbd_sadMxNx4d_avx2(8, 64);

highbd_sadMxNx4d_avx2(16, 4);
highbd_sadMxNx4d_avx2(16, 8);
highbd_sadMxNx4d_avx2(16, 16);
highbd_sadMxNx4d_avx2(16, 32);
highbd_sadMxNx4d_avx2(16, 64);

highbd_sadMxNx4d_avx2(32, 4);
highbd_sadMxNx4d_avx2(32, 8);
highbd_sadMxNx4d_avx2(32, 16);
highbd_sadMxNx4d_avx2(32, 32);
highbd_sadMxNx4d_avx2(32, 64);

highbd_sadMxNx4d_avx2(64, 4);
highbd_sadMxNx4d_avx2(64, 8);
highbd_sadMxNx4d_avx2(64, 16);
highbd_sadMxNx4d_avx2(64, 32);
highbd_sadMxNx4d_avx2(64, 64);
highbd_sadMxNx4d_avx2(64, 128);

highbd_sadMxNx4d_avx2(128, 64);
highbd_sadMxNx4d_avx2(128, 128);
highbd_sadMxNx4d_avx2(128, 256);

highbd_sadMxNx4d_avx2(256, 128);
highbd_sadMxNx4d_avx2(256, 256);

highbd_sad_skip_MxNx4d_avx2(8, 8);
highbd_sad_skip_MxNx4d_avx2(8, 16);
highbd_sad_skip_MxNx4d_avx2(8, 32);
highbd_sad_skip_MxNx4d_avx2(8, 64);

highbd_sad_skip_MxNx4d_2rows_avx2(16, 4);
highbd_sad_skip_MxNx4d_avx2(16, 8);
highbd_sad_skip_MxNx4d_avx2(16, 16);
highbd_sad_skip_MxNx4d_avx2(16, 32);
highbd_sad_skip_MxNx4d_avx2(16, 64);

highbd_sad_skip_MxNx4d_2rows_avx2(32, 4);
highbd_sad_skip_MxNx4d_avx2(32, 8);
highbd_sad_skip_MxNx4d_avx2(32, 16);
highbd_sad_skip_MxNx4d_avx2(32, 32);
highbd_sad_skip_MxNx4d_avx2(32, 64);

highbd_sad_skip_MxNx4d_avx2(64, 4);
highbd_sad_skip_MxNx4d_avx2(64, 8);
highbd_sad_skip_MxNx4d_avx2(64, 16);
highbd_sad_skip_MxNx4d_avx2(64, 32);
highbd_sad_skip_MxNx4d_avx2(64, 64);
highbd_sad_skip_MxNx4d_avx2(64, 128);

highbd_sad_skip_MxNx4d_avx2(128, 64);
highbd_sad_skip_MxNx4d_avx2(128, 128);
highbd_sad_skip_MxNx4d_avx2(128, 256);

highbd_sad_skip_MxNx4d_avx2(256, 128);
highbd_sad_skip_MxNx4d_avx2(256, 256);