av1/common/x86/optflow_refine_sse4.c - avm - Git at Google

 /*
  * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */

 #include <assert.h>
 #include <emmintrin.h>  // SSE2
 #include <smmintrin.h>  /* SSE4.1 */

 #include "aom/aom_integer.h"
 #include "av1/common/blockd.h"
 #include "av1/common/reconinter.h"
 #include "aom_dsp/x86/synonyms.h"

 static INLINE __m128i round_power_of_two_signed_epi64(__m128i in, int n) {
   __m128i sign_mask = _mm_srai_epi32(in, 31);
   sign_mask = _mm_or_si128(sign_mask, _mm_srli_epi64(sign_mask, 4));
   __m128i abs_vec = _mm_xor_si128(in, sign_mask);
   abs_vec = _mm_sub_epi64(abs_vec, sign_mask);

   __m128i rounding_offset = _mm_set_epi32(0, (1 << n) >> 1, 0, (1 << n) >> 1);
   __m128i add_vec = _mm_add_epi64(abs_vec, rounding_offset);
   __m128i rounded_vec = _mm_srli_epi64(add_vec, n);

   // Restore the sign
   rounded_vec = _mm_xor_si128(rounded_vec, sign_mask);
   rounded_vec = _mm_sub_epi64(rounded_vec, sign_mask);
   return rounded_vec;
 }

 static INLINE __m128i pack_and_round_epi32(__m128i temp1, __m128i temp2,
                                            const __m128i v_bias_d,
                                            const __m128i ones, const int bits) {
   __m128i v_sign_d = _mm_sign_epi32(ones, temp1);
   __m128i reg = _mm_mullo_epi32(temp1, v_sign_d);
   reg = _mm_srli_epi32(_mm_add_epi32(reg, v_bias_d), bits);
   temp1 = _mm_mullo_epi32(reg, v_sign_d);

   v_sign_d = _mm_sign_epi32(ones, temp2);
   reg = _mm_mullo_epi32(temp2, v_sign_d);
   reg = _mm_srli_epi32(_mm_add_epi32(reg, v_bias_d), bits);
   temp2 = _mm_mullo_epi32(reg, v_sign_d);
   return (_mm_packs_epi32(temp1, temp2));
 }

 static INLINE __m128i clamp_epi16(__m128i val, const int min_val,
                                   const int max_val) {
   const __m128i min = _mm_set1_epi16(min_val);
   const __m128i max = _mm_set1_epi16(max_val);
   return _mm_min_epi16(_mm_max_epi16(val, min), max);
 }

 void av1_bicubic_grad_interpolation_highbd_sse4_1(const int16_t *pred_src,
                                                   int16_t *x_grad,
                                                   int16_t *y_grad,
                                                   const int stride,
                                                   const int bw, const int bh) {
 #if OPFL_BICUBIC_GRAD
   assert(bw % 8 == 0);
   assert(bh % 8 == 0);

   __m128i coeff_bi[4][2];
   coeff_bi[0][0] = _mm_set1_epi32(coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][0][0]);
   coeff_bi[0][1] = _mm_set1_epi32(coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][1][0]);
   coeff_bi[1][0] = _mm_set1_epi32(coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][0][1]);
   coeff_bi[1][1] = _mm_set1_epi32(coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][1][1]);
   coeff_bi[2][0] = _mm_insert_epi32(
       coeff_bi[0][0], coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][0][1], 0);
   coeff_bi[2][1] = _mm_insert_epi32(
       coeff_bi[0][1], coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][1][1], 0);
   coeff_bi[3][0] = _mm_insert_epi32(
       coeff_bi[0][0], coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][0][1], 3);
   coeff_bi[3][1] = _mm_insert_epi32(
       coeff_bi[0][1], coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][1][1], 3);
   const __m128i v_bias_d = _mm_set1_epi32((1 << bicubic_bits) >> 1);
   const __m128i ones = _mm_set1_epi32(1);

   if (bw < 16) {
     for (int col = 0; col < bh; col++) {
       const int is_y_boundary = (col + 1 > bh - 1 || col - 1 < 0);
       const int id_prev1 = AOMMAX(col - 1, 0);
       const int id_prev2 = AOMMAX(col - 2, 0);
       const int id_next1 = AOMMIN(col + 1, bh - 1);
       const int id_next2 = AOMMIN(col + 2, bh - 1);

       for (int row = 0; row < bw; row += 8) {
         __m128i vpred_next1, vpred_prev1, vpred_next2, vpred_prev2;
         __m128i temp1, temp2, sub1, sub2, sub3, sub4;
         const int16_t *src = &pred_src[col * bw + row];
         vpred_prev1 =
             _mm_set_epi16(*(src + 6), *(src + 5), *(src + 4), *(src + 3),
                           *(src + 2), *(src + 1), *src, *src);
         vpred_prev2 = _mm_set_epi16(*(src + 5), *(src + 4), *(src + 3),
                                     *(src + 2), *(src + 1), *src, *src, *src);
         vpred_next1 =
             _mm_set_epi16(*(src + 7), *(src + 7), *(src + 6), *(src + 5),
                           *(src + 4), *(src + 3), *(src + 2), *(src + 1));
         vpred_next2 =
             _mm_set_epi16(*(src + 7), *(src + 7), *(src + 7), *(src + 6),
                           *(src + 5), *(src + 4), *(src + 3), *(src + 2));

         sub1 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next1),
                              _mm_cvtepi16_epi32(vpred_prev1));
         sub2 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next1, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev1, 8)));
         sub3 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next2),
                              _mm_cvtepi16_epi32(vpred_prev2));
         sub4 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next2, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev2, 8)));

         temp1 = _mm_add_epi32(_mm_mullo_epi32(sub1, coeff_bi[2][0]),
                               _mm_mullo_epi32(sub3, coeff_bi[2][1]));
         temp2 = _mm_add_epi32(_mm_mullo_epi32(sub2, coeff_bi[3][0]),
                               _mm_mullo_epi32(sub4, coeff_bi[3][1]));

         temp1 =
             pack_and_round_epi32(temp1, temp2, v_bias_d, ones, bicubic_bits);
         temp1 = clamp_epi16(temp1, -OPFL_GRAD_CLAMP_VAL, OPFL_GRAD_CLAMP_VAL);
         const int idx = col * stride + row;
         xx_storeu_128(x_grad + idx, temp1);

         src = pred_src + row;
         vpred_prev1 = xx_loadu_128(src + id_prev1 * stride);
         vpred_prev2 = xx_loadu_128(src + id_prev2 * stride);
         vpred_next1 = xx_loadu_128(src + id_next1 * stride);
         vpred_next2 = xx_loadu_128(src + id_next2 * stride);

         sub1 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next1),
                              _mm_cvtepi16_epi32(vpred_prev1));
         sub2 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next1, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev1, 8)));
         sub3 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next2),
                              _mm_cvtepi16_epi32(vpred_prev2));
         sub4 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next2, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev2, 8)));

         temp1 =
             _mm_add_epi32(_mm_mullo_epi32(sub1, coeff_bi[is_y_boundary][0]),
                           _mm_mullo_epi32(sub3, coeff_bi[is_y_boundary][1]));
         temp2 =
             _mm_add_epi32(_mm_mullo_epi32(sub2, coeff_bi[is_y_boundary][0]),
                           _mm_mullo_epi32(sub4, coeff_bi[is_y_boundary][1]));

         temp1 =
             pack_and_round_epi32(temp1, temp2, v_bias_d, ones, bicubic_bits);
         temp1 = clamp_epi16(temp1, -OPFL_GRAD_CLAMP_VAL, OPFL_GRAD_CLAMP_VAL);
         xx_storeu_128(y_grad + idx, temp1);
       }
     }
   } else {
     for (int col = 0; col < bh; col++) {
       const int is_y_boundary = (col + 1 > bh - 1 || col - 1 < 0);
       const int id_prev = AOMMAX(col - 1, 0);
       const int id_prev2 = AOMMAX(col - 2, 0);
       const int id_next = AOMMIN(col + 1, bh - 1);
       const int id_next2 = AOMMIN(col + 2, bh - 1);
       for (int row = 0; row < bw; row += 16) {
         __m128i vpred_next1_1, vpred_prev1_1, vpred_next2_1, vpred_prev2_1;
         __m128i vpred_next1_2, vpred_prev1_2, vpred_next2_2, vpred_prev2_2;
         __m128i temp1, temp2;
         __m128i sub1, sub2, sub3, sub4;

         const int16_t *src = &pred_src[col * stride + row];

         if (row - 1 < 0) {
           vpred_prev1_1 =
               _mm_set_epi16(*(src + 6), *(src + 5), *(src + 4), *(src + 3),
                             *(src + 2), *(src + 1), *src, *src);
           vpred_prev2_1 =
               _mm_set_epi16(*(src + 5), *(src + 4), *(src + 3), *(src + 2),
                             *(src + 1), *src, *src, *src);
         } else {
           vpred_prev1_1 = xx_loadu_128((__m128i *)(src - 1));
           vpred_prev2_1 = xx_loadu_128((__m128i *)(src - 2));
         }
         if (row + 16 > bw - 1) {
           vpred_next1_2 =
               _mm_set_epi16(*(src + 15), *(src + 15), *(src + 14), *(src + 13),
                             *(src + 12), *(src + 11), *(src + 10), *(src + 9));
           vpred_next2_2 =
               _mm_set_epi16(*(src + 15), *(src + 15), *(src + 15), *(src + 14),
                             *(src + 13), *(src + 12), *(src + 11), *(src + 10));
         } else {
           vpred_next1_2 = xx_loadu_128(src + 9);
           vpred_next2_2 = xx_loadu_128(src + 10);
         }
         vpred_prev1_2 = xx_loadu_128(src + 7);
         vpred_prev2_2 = xx_loadu_128(src + 6);
         vpred_next1_1 = xx_loadu_128(src + 1);
         vpred_next2_1 = xx_loadu_128(src + 2);

         sub1 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next1_1),
                              _mm_cvtepi16_epi32(vpred_prev1_1));
         sub2 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next1_1, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev1_1, 8)));
         sub3 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next2_1),
                              _mm_cvtepi16_epi32(vpred_prev2_1));
         sub4 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next2_1, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev2_1, 8)));

         const int is_left_boundary = row - 1 < 0 ? 2 : 0;
         const int is_right_boundary = row + 16 > bw - 1 ? 3 : 0;
         temp1 =
             _mm_add_epi32(_mm_mullo_epi32(sub1, coeff_bi[is_left_boundary][0]),
                           _mm_mullo_epi32(sub3, coeff_bi[is_left_boundary][1]));
         temp2 = _mm_add_epi32(_mm_mullo_epi32(sub2, coeff_bi[0][0]),
                               _mm_mullo_epi32(sub4, coeff_bi[0][1]));

         temp1 =
             pack_and_round_epi32(temp1, temp2, v_bias_d, ones, bicubic_bits);
         temp1 = clamp_epi16(temp1, -OPFL_GRAD_CLAMP_VAL, OPFL_GRAD_CLAMP_VAL);
         const int idx = col * stride + row;
         xx_storeu_128(x_grad + idx, temp1);

         sub1 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next1_2),
                              _mm_cvtepi16_epi32(vpred_prev1_2));
         sub2 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next1_2, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev1_2, 8)));
         sub3 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next2_2),
                              _mm_cvtepi16_epi32(vpred_prev2_2));
         sub4 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next2_2, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev2_2, 8)));

         temp1 = _mm_add_epi32(_mm_mullo_epi32(sub1, coeff_bi[0][0]),
                               _mm_mullo_epi32(sub3, coeff_bi[0][1]));
         temp2 = _mm_add_epi32(
             _mm_mullo_epi32(sub2, coeff_bi[is_right_boundary][0]),
             _mm_mullo_epi32(sub4, coeff_bi[is_right_boundary][1]));

         temp1 =
             pack_and_round_epi32(temp1, temp2, v_bias_d, ones, bicubic_bits);
         temp1 = clamp_epi16(temp1, -OPFL_GRAD_CLAMP_VAL, OPFL_GRAD_CLAMP_VAL);
         xx_storeu_128(x_grad + idx + 8, temp1);

         src = pred_src + row;
         vpred_prev1_1 = xx_loadu_128(src + stride * id_prev);
         vpred_prev2_1 = xx_loadu_128(src + stride * id_prev2);
         vpred_next1_1 = xx_loadu_128(src + id_next * stride);
         vpred_next2_1 = xx_loadu_128(src + id_next2 * stride);

         vpred_prev1_2 = xx_loadu_128(src + stride * id_prev + 8);
         vpred_prev2_2 = xx_loadu_128(src + stride * id_prev2 + 8);
         vpred_next1_2 = xx_loadu_128(src + id_next * stride + 8);
         vpred_next2_2 = xx_loadu_128(src + id_next2 * stride + 8);

         sub1 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next1_1),
                              _mm_cvtepi16_epi32(vpred_prev1_1));
         sub2 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next1_1, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev1_1, 8)));
         sub3 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next2_1),
                              _mm_cvtepi16_epi32(vpred_prev2_1));
         sub4 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next2_1, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev2_1, 8)));

         temp1 =
             _mm_add_epi32(_mm_mullo_epi32(sub1, coeff_bi[is_y_boundary][0]),
                           _mm_mullo_epi32(sub3, coeff_bi[is_y_boundary][1]));
         temp2 =
             _mm_add_epi32(_mm_mullo_epi32(sub2, coeff_bi[is_y_boundary][0]),
                           _mm_mullo_epi32(sub4, coeff_bi[is_y_boundary][1]));

         temp1 =
             pack_and_round_epi32(temp1, temp2, v_bias_d, ones, bicubic_bits);
         temp1 = clamp_epi16(temp1, -OPFL_GRAD_CLAMP_VAL, OPFL_GRAD_CLAMP_VAL);
         xx_storeu_128(y_grad + idx, temp1);

         sub1 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next1_2),
                              _mm_cvtepi16_epi32(vpred_prev1_2));
         sub2 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next1_2, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev1_2, 8)));
         sub3 = _mm_sub_epi32(_mm_cvtepi16_epi32(vpred_next2_2),
                              _mm_cvtepi16_epi32(vpred_prev2_2));
         sub4 =
             _mm_sub_epi32(_mm_cvtepi16_epi32(_mm_srli_si128(vpred_next2_2, 8)),
                           _mm_cvtepi16_epi32(_mm_srli_si128(vpred_prev2_2, 8)));
         temp1 =
             _mm_add_epi32(_mm_mullo_epi32(sub1, coeff_bi[is_y_boundary][0]),
                           _mm_mullo_epi32(sub3, coeff_bi[is_y_boundary][1]));
         temp2 =
             _mm_add_epi32(_mm_mullo_epi32(sub2, coeff_bi[is_y_boundary][0]),
                           _mm_mullo_epi32(sub4, coeff_bi[is_y_boundary][1]));
         temp1 =
             pack_and_round_epi32(temp1, temp2, v_bias_d, ones, bicubic_bits);
         temp1 = clamp_epi16(temp1, -OPFL_GRAD_CLAMP_VAL, OPFL_GRAD_CLAMP_VAL);
         xx_storeu_128(y_grad + idx + 8, temp1);
       }
     }
   }
 #else
   (void)pred_src;
   (void)x_grad;
   (void)y_grad;
   (void)bw;
   (void)bh;
 #endif  // OPFL_BICUBIC_GRAD
 }

 static INLINE __m128i LoadUnaligned16(const void *a) {
   return _mm_loadu_si128((const __m128i *)a);
 }

 static AOM_FORCE_INLINE void multiply_and_accum(
     __m128i a_lo_0, __m128i b_lo_0, __m128i a_hi_0, __m128i b_hi_0,
     __m128i a_lo1, __m128i b_lo1, __m128i a_hi1, __m128i b_hi1,
     const int grad_bits_lo, const int grad_bits_hi, __m128i *t1, __m128i *t2) {
   const __m128i reg_lo_0 = round_power_of_two_signed_epi64(
       _mm_mul_epi32(a_lo_0, b_lo_0), grad_bits_lo);
   const __m128i reg_hi_0 = round_power_of_two_signed_epi64(
       _mm_mul_epi32(a_hi_0, b_hi_0), grad_bits_hi);
   const __m128i reg_lo1 = round_power_of_two_signed_epi64(
       _mm_mul_epi32(a_lo1, b_lo1), grad_bits_lo);
   const __m128i reg_hi1 = round_power_of_two_signed_epi64(
       _mm_mul_epi32(a_hi1, b_hi1), grad_bits_hi);
   *t1 = _mm_add_epi64(reg_lo_0, reg_lo1);
   *t2 = _mm_add_epi64(reg_hi_0, reg_hi1);
 }

 static void opfl_mv_refinement_8x4_sse4_1(const int16_t *pdiff, int pstride,
                                           const int16_t *gx, const int16_t *gy,
                                           int gstride, int d0, int d1,
                                           int grad_prec_bits, int mv_prec_bits,
                                           int *vx0, int *vy0, int *vx1,
                                           int *vy1) {
   int bHeight = 4;
   const int bits = mv_prec_bits + grad_prec_bits;
   const int rls_alpha = OPFL_RLS_PARAM;
   __m128i u2_lo, v2_lo, uv_lo, uw_lo, vw_lo;
   __m128i u2_hi, v2_hi, uv_hi, uw_hi, vw_hi;
   int32_t su2_hi = 0;
   int32_t sv2_hi = 0;
   int32_t suv_hi = 0;
   int32_t suw_hi = 0;
   int32_t svw_hi = 0;
   int32_t su2_lo = 0;
   int32_t sv2_lo = 0;
   int32_t suv_lo = 0;
   int32_t suw_lo = 0;
   int32_t svw_lo = 0;
   // TODO(kslu) clean up all grad_bits if later it is still not needed
   int grad_bits_lo = 0;
   int grad_bits_hi = 0;
   do {
     __m128i gradX = LoadUnaligned16(gx);
     __m128i gradY = LoadUnaligned16(gy);
     __m128i pred = LoadUnaligned16(pdiff);
     // The precision of gx, gy and pred (i.e. d0*p0-d1*p1) buffers is signed
     // 16bit and there are cases where these buffers can be filled with extreme
     // values. Hence, the accumulation here needs to be done at 64-bit precision
     // to avoid overflow issues.
     const __m128i gradX_lo_0 = _mm_cvtepi16_epi32(gradX);
     const __m128i gradY_lo_0 = _mm_cvtepi16_epi32(gradY);
     const __m128i pred_lo_0 = _mm_cvtepi16_epi32(pred);
     const __m128i gradX_hi_0 = _mm_cvtepi16_epi32(_mm_srli_si128(gradX, 8));
     const __m128i gradY_hi_0 = _mm_cvtepi16_epi32(_mm_srli_si128(gradY, 8));
     const __m128i pred_hi_0 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));

     const __m128i gradX_lo1 = _mm_srli_si128(gradX_lo_0, 4);
     const __m128i gradX_hi1 = _mm_srli_si128(gradX_hi_0, 4);
     const __m128i gradY_lo1 = _mm_srli_si128(gradY_lo_0, 4);
     const __m128i gradY_hi1 = _mm_srli_si128(gradY_hi_0, 4);
     const __m128i pred_lo1 = _mm_srli_si128(pred_lo_0, 4);
     const __m128i pred_hi1 = _mm_srli_si128(pred_hi_0, 4);

     multiply_and_accum(gradX_lo_0, gradX_lo_0, gradX_hi_0, gradX_hi_0,
                        gradX_lo1, gradX_lo1, gradX_hi1, gradX_hi1, grad_bits_lo,
                        grad_bits_hi, &u2_lo, &u2_hi);
     multiply_and_accum(gradY_lo_0, gradY_lo_0, gradY_hi_0, gradY_hi_0,
                        gradY_lo1, gradY_lo1, gradY_hi1, gradY_hi1, grad_bits_lo,
                        grad_bits_hi, &v2_lo, &v2_hi);
     multiply_and_accum(gradX_lo_0, gradY_lo_0, gradX_hi_0, gradY_hi_0,
                        gradX_lo1, gradY_lo1, gradX_hi1, gradY_hi1, grad_bits_lo,
                        grad_bits_hi, &uv_lo, &uv_hi);
     multiply_and_accum(gradX_lo_0, pred_lo_0, gradX_hi_0, pred_hi_0, gradX_lo1,
                        pred_lo1, gradX_hi1, pred_hi1, grad_bits_lo,
                        grad_bits_hi, &uw_lo, &uw_hi);
     multiply_and_accum(gradY_lo_0, pred_lo_0, gradY_hi_0, pred_hi_0, gradY_lo1,
                        pred_lo1, gradY_hi1, pred_hi1, grad_bits_lo,
                        grad_bits_hi, &vw_lo, &vw_hi);

     gx += gstride;
     gy += gstride;
     pdiff += pstride;
     bHeight -= 1;

     int64_t temp;
     xx_storel_64(&temp, _mm_add_epi64(u2_lo, _mm_srli_si128(u2_lo, 8)));
     su2_lo += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(v2_lo, _mm_srli_si128(v2_lo, 8)));
     sv2_lo += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(uv_lo, _mm_srli_si128(uv_lo, 8)));
     suv_lo += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(uw_lo, _mm_srli_si128(uw_lo, 8)));
     suw_lo += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(vw_lo, _mm_srli_si128(vw_lo, 8)));
     svw_lo += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(u2_hi, _mm_srli_si128(u2_hi, 8)));
     su2_hi += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(v2_hi, _mm_srli_si128(v2_hi, 8)));
     sv2_hi += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(uv_hi, _mm_srli_si128(uv_hi, 8)));
     suv_hi += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(uw_hi, _mm_srli_si128(uw_hi, 8)));
     suw_hi += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(vw_hi, _mm_srli_si128(vw_hi, 8)));
     svw_hi += (int32_t)temp;
 #if !CONFIG_F107_GRADIENT_SIMPLIFY
     // Do a range check and add a downshift if range is getting close to the bit
     // depth cap.
     if (bHeight % 2 == 0) {
       if (get_msb_signed(AOMMAX(AOMMAX(su2_lo, sv2_lo),
                                 AOMMAX(abs(suw_lo), abs(svw_lo)))) >=
           MAX_OPFL_AUTOCORR_BITS - 2) {
         su2_lo = ROUND_POWER_OF_TWO_SIGNED(su2_lo, 1);
         sv2_lo = ROUND_POWER_OF_TWO_SIGNED(sv2_lo, 1);
         suv_lo = ROUND_POWER_OF_TWO_SIGNED(suv_lo, 1);
         suw_lo = ROUND_POWER_OF_TWO_SIGNED(suw_lo, 1);
         svw_lo = ROUND_POWER_OF_TWO_SIGNED(svw_lo, 1);
         grad_bits_lo++;
       }
       if (get_msb_signed(AOMMAX(AOMMAX(su2_hi, sv2_hi),
                                 AOMMAX(abs(suw_hi), abs(svw_hi)))) >=
           MAX_OPFL_AUTOCORR_BITS - 2) {
         su2_hi = ROUND_POWER_OF_TWO_SIGNED(su2_hi, 1);
         sv2_hi = ROUND_POWER_OF_TWO_SIGNED(sv2_hi, 1);
         suv_hi = ROUND_POWER_OF_TWO_SIGNED(suv_hi, 1);
         suw_hi = ROUND_POWER_OF_TWO_SIGNED(suw_hi, 1);
         svw_hi = ROUND_POWER_OF_TWO_SIGNED(svw_hi, 1);
         grad_bits_hi++;
       }
     }
 #endif  // !CONFIG_F107_GRADIENT_SIMPLIFY
   } while (bHeight != 0);

   calc_mv_process(su2_lo, sv2_lo, suv_lo, suw_lo, svw_lo, d0, d1, bits,
                   rls_alpha, vx0, vy0, vx1, vy1);
   calc_mv_process(su2_hi, sv2_hi, suv_hi, suw_hi, svw_hi, d0, d1, bits,
                   rls_alpha, vx0 + 1, vy0 + 1, vx1 + 1, vy1 + 1);
 }

 static void opfl_mv_refinement_8x8_sse4_1(const int16_t *pdiff, int pstride,
                                           const int16_t *gx, const int16_t *gy,
                                           int gstride, int d0, int d1,
                                           int grad_prec_bits, int mv_prec_bits,
                                           int *vx0, int *vy0, int *vx1,
                                           int *vy1) {
   int bHeight = 8;
   const int rls_alpha = 4 * OPFL_RLS_PARAM;
   const int bits = mv_prec_bits + grad_prec_bits;
   __m128i u2, v2, uv, uw, vw;
   int32_t su2 = 0;
   int32_t sv2 = 0;
   int32_t suv = 0;
   int32_t suw = 0;
   int32_t svw = 0;
   // TODO(kslu) clean up all grad_bits if later it is still not needed
   int grad_bits = 0;
   do {
     __m128i gradX = LoadUnaligned16(gx);
     __m128i gradY = LoadUnaligned16(gy);
     __m128i pred = LoadUnaligned16(pdiff);
     // The precision of gx, gy and pred (i.e. d0*p0-d1*p1) buffers is signed
     // 16bit and there are cases where these buffers can be filled with extreme
     // values. Hence, the accumulation here needs to be done at 64bit to avoid
     // overflow issues.
     const __m128i gradX_lo_0 = _mm_cvtepi16_epi32(gradX);
     const __m128i gradY_lo_0 = _mm_cvtepi16_epi32(gradY);
     const __m128i pred_lo_0 = _mm_cvtepi16_epi32(pred);
     const __m128i gradX_hi_0 = _mm_cvtepi16_epi32(_mm_srli_si128(gradX, 8));
     const __m128i gradY_hi_0 = _mm_cvtepi16_epi32(_mm_srli_si128(gradY, 8));
     const __m128i pred_hi_0 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));

     const __m128i gradX_lo1 = _mm_srli_si128(gradX_lo_0, 4);
     const __m128i gradX_hi1 = _mm_srli_si128(gradX_hi_0, 4);
     const __m128i gradY_lo1 = _mm_srli_si128(gradY_lo_0, 4);
     const __m128i gradY_hi1 = _mm_srli_si128(gradY_hi_0, 4);
     const __m128i pred_lo1 = _mm_srli_si128(pred_lo_0, 4);
     const __m128i pred_hi1 = _mm_srli_si128(pred_hi_0, 4);
     __m128i t1, t2;

     multiply_and_accum(gradX_lo_0, gradX_lo_0, gradX_hi_0, gradX_hi_0,
                        gradX_lo1, gradX_lo1, gradX_hi1, gradX_hi1, grad_bits,
                        grad_bits, &t1, &t2);
     u2 = _mm_add_epi64(t1, t2);

     multiply_and_accum(gradY_lo_0, gradY_lo_0, gradY_hi_0, gradY_hi_0,
                        gradY_lo1, gradY_lo1, gradY_hi1, gradY_hi1, grad_bits,
                        grad_bits, &t1, &t2);
     v2 = _mm_add_epi64(t1, t2);

     multiply_and_accum(gradX_lo_0, gradY_lo_0, gradX_hi_0, gradY_hi_0,
                        gradX_lo1, gradY_lo1, gradX_hi1, gradY_hi1, grad_bits,
                        grad_bits, &t1, &t2);
     uv = _mm_add_epi64(t1, t2);

     multiply_and_accum(gradX_lo_0, pred_lo_0, gradX_hi_0, pred_hi_0, gradX_lo1,
                        pred_lo1, gradX_hi1, pred_hi1, grad_bits, grad_bits, &t1,
                        &t2);
     uw = _mm_add_epi64(t1, t2);

     multiply_and_accum(gradY_lo_0, pred_lo_0, gradY_hi_0, pred_hi_0, gradY_lo1,
                        pred_lo1, gradY_hi1, pred_hi1, grad_bits, grad_bits, &t1,
                        &t2);
     vw = _mm_add_epi64(t1, t2);
     gx += gstride;
     gy += gstride;
     pdiff += pstride;
     bHeight -= 1;
     int64_t temp;
     xx_storel_64(&temp, _mm_add_epi64(u2, _mm_srli_si128(u2, 8)));
     su2 += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(v2, _mm_srli_si128(v2, 8)));
     sv2 += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(uv, _mm_srli_si128(uv, 8)));
     suv += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(uw, _mm_srli_si128(uw, 8)));
     suw += (int32_t)temp;
     xx_storel_64(&temp, _mm_add_epi64(vw, _mm_srli_si128(vw, 8)));
     svw += (int32_t)temp;
 #if !CONFIG_F107_GRADIENT_SIMPLIFY
     // For every 8 pixels, do a range check and add a downshift if range is
     // getting close to the max allowed bit depth
     if (get_msb_signed(AOMMAX(AOMMAX(su2, sv2), AOMMAX(abs(suw), abs(svw)))) >=
         MAX_OPFL_AUTOCORR_BITS - 2) {
       su2 = ROUND_POWER_OF_TWO_SIGNED(su2, 1);
       sv2 = ROUND_POWER_OF_TWO_SIGNED(sv2, 1);
       suv = ROUND_POWER_OF_TWO_SIGNED(suv, 1);
       suw = ROUND_POWER_OF_TWO_SIGNED(suw, 1);
       svw = ROUND_POWER_OF_TWO_SIGNED(svw, 1);
       grad_bits++;
     }
 #endif  // !CONFIG_F107_GRADIENT_SIMPLIFY
   } while (bHeight != 0);

   calc_mv_process(su2, sv2, suv, suw, svw, d0, d1, bits, rls_alpha, vx0, vy0,
                   vx1, vy1);
 }

 static AOM_INLINE void opfl_mv_refinement_sse4_1(
     const int16_t *pdiff, int pstride, const int16_t *gx, const int16_t *gy,
     int gstride, int bw, int bh, int d0, int d1, int grad_prec_bits,
     int mv_prec_bits, int *vx0, int *vy0, int *vx1, int *vy1) {
   (void)bh;
   if (bw == 4)
     opfl_mv_refinement_8x4_sse4_1(pdiff, pstride, gx, gy, gstride, d0, d1,
                                   grad_prec_bits, mv_prec_bits, vx0, vy0, vx1,
                                   vy1);
   else
     opfl_mv_refinement_8x8_sse4_1(pdiff, pstride, gx, gy, gstride, d0, d1,
                                   grad_prec_bits, mv_prec_bits, vx0, vy0, vx1,
                                   vy1);
 }

 // Function to compute optical flow offsets in nxn blocks
 int av1_opfl_mv_refinement_nxn_sse4_1(
     const int16_t *pdiff, int pstride, const int16_t *gx, const int16_t *gy,
     int gstride, int bw, int bh, int n, int d0, int d1, int grad_prec_bits,
     int mv_prec_bits, int mi_x, int mi_y, int mi_cols, int mi_rows,
     int build_for_decode, int *vx0, int *vy0, int *vx1, int *vy1) {
   assert(bw % n == 0 && bh % n == 0);
   int n_blocks = 0;
   const int num_blocks = (n == 4) ? 2 : 1;
   for (int i = 0; i < bh; i += n) {
     for (int j = 0; j < bw; j += 8) {
       if (is_subblock_outside(mi_x + j, mi_y + i, mi_cols, mi_rows,
                               build_for_decode)) {
         n_blocks += num_blocks;
         continue;
       }
       opfl_mv_refinement_sse4_1(pdiff + (i * pstride + j), pstride,
                                 gx + (i * gstride + j), gy + (i * gstride + j),
                                 gstride, n, n, d0, d1, grad_prec_bits,
                                 mv_prec_bits, vx0 + n_blocks, vy0 + n_blocks,
                                 vx1 + n_blocks, vy1 + n_blocks);
       n_blocks += (n == 4) ? 2 : 1;
     }
   }
   return n_blocks;
 }

 // This round shift function has only been tested for the case d0 = 1, d1 = -1
 // that is used in optical flow MV search. To use centered=1 option for more
 // general d0 and d1, this function needs to be extended.
 static INLINE __m128i round_power_of_two_signed_epi16(__m128i temp1,
                                                       const int bits) {
   __m128i ones = _mm_set1_epi16(1);
   __m128i v_sign_d = _mm_sign_epi16(ones, temp1);
   __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
   __m128i reg = _mm_mullo_epi16(temp1, v_sign_d);
   reg = _mm_srli_epi16(_mm_add_epi16(reg, v_bias_d), bits);
   return _mm_mullo_epi16(reg, v_sign_d);
 }

 static AOM_FORCE_INLINE void compute_pred_using_interp_grad_highbd_sse4_1(
     const uint16_t *src1, const uint16_t *src2, int src_stride, int16_t *dst1,
     int16_t *dst2, int bw, int bh, int d0, int d1, int bd, int centered) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i mul1 = _mm_set1_epi16(d0);
   const __m128i mul2 = _mm_sub_epi16(zero, _mm_set1_epi16(d1));
   const __m128i mul_val1 = _mm_unpacklo_epi16(mul1, mul2);

   for (int i = 0; i < bh; i++) {
     const uint16_t *inp1 = src1 + i * src_stride;
     const uint16_t *inp2 = src2 + i * src_stride;
     int16_t *out1 = dst1 + i * bw;
     int16_t *out2 = dst2 ? (dst2 + i * bw) : NULL;
     for (int j = 0; j < bw; j = j + 8) {
       const __m128i src_buf1 = xx_load_128(inp1 + j);
       const __m128i src_buf2 = xx_load_128(inp2 + j);

       __m128i temp1, temp2;
       __m128i reg1 = _mm_unpacklo_epi16(src_buf1, src_buf2);
       __m128i reg2 = _mm_unpackhi_epi16(src_buf1, src_buf2);

       temp1 = _mm_madd_epi16(reg1, mul_val1);
       temp2 = _mm_madd_epi16(reg2, mul_val1);
       temp1 = _mm_packs_epi32(temp1, temp2);
       if (centered) temp1 = round_power_of_two_signed_epi16(temp1, 1);
       temp1 = round_power_of_two_signed_epi16(temp1, bd - 8);
       temp1 = clamp_epi16(temp1, -OPFL_PRED_MAX, OPFL_PRED_MAX);
       xx_store_128(out1 + j, temp1);

       if (dst2) {
         // src_bufs are pixels up to 12 bits. So subtraction should not
         // overflow.
         temp2 = _mm_sub_epi16(src_buf1, src_buf2);
         temp2 = round_power_of_two_signed_epi16(temp2, bd - 8);
         temp2 = clamp_epi16(temp2, -OPFL_PRED_MAX, OPFL_PRED_MAX);
         xx_store_128(out2 + j, temp2);
       }
     }
   }
 }

 void av1_copy_pred_array_highbd_sse4_1(const uint16_t *src1,
                                        const uint16_t *src2, int src_stride,
                                        int16_t *dst1, int16_t *dst2, int bw,
                                        int bh, int d0, int d1, int bd,
                                        int centered) {
   compute_pred_using_interp_grad_highbd_sse4_1(
       src1, src2, src_stride, dst1, dst2, bw, bh, d0, d1, bd, centered);
 }