/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <smmintrin.h>
#include <string.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"

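// The scaled 2D convolution is done in two passes. The horizontal pass
// (hfilter8 / highbd_hfilter8) filters into a temporary buffer, writing the
// w x im_h intermediate image transposed so that the eight vertical taps for
// each output pixel sit contiguously in memory. The vertical pass
// (vfilter8 / highbd_vfilter8) can then fetch them with a single 16-byte
// load per output and applies the compound-average and final rounding logic.
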
// A specialised version of hfilter, the horizontal filter for
// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
                     int h, int subpel_x_qn, int x_step_qn,
                     const InterpFilterParams *filter_params, unsigned round) {
  const int bd = 8;
  const int ntaps = 8;

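  // The filter kernel is centred on the output position, so back the source
  // pointer up by (ntaps / 2 - 1) samples; src_col below then points at the
  // first of the eight taps.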
  src -= ntaps / 2 - 1;

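  // round_add32 folds two terms into one constant: the rounding bias for the
  // shift by `round`, and an offset of 1 << (bd + FILTER_BITS - 1) that keeps
  // the intermediate values non-negative (the vertical pass removes it
  // again). For instance, with bd == 8 and round == 3 this is
  // 4 + (1 << 14) = 16388.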
  int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
  const __m128i round_add = _mm_set1_epi32(round_add32);
  const __m128i round_shift = _mm_cvtsi32_si128(round);

  int x_qn = subpel_x_qn;
  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
    const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
    const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    assert(filter_idx < SUBPEL_SHIFTS);
    const int16_t *filter =
        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);

    // Load the filter coefficients
    const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
    const __m128i zero = _mm_setzero_si128();

    int y;
    for (y = 0; y <= h - 4; y += 4) {
      const uint8_t *const src0 = src_col + y * src_stride;
      const uint8_t *const src1 = src0 + 1 * src_stride;
      const uint8_t *const src2 = src0 + 2 * src_stride;
      const uint8_t *const src3 = src0 + 3 * src_stride;

      // Load up source data. This is 8-bit input data, so each load fills
      // only the lower half of the register but fetches all 8 pixels we need.
      const __m128i data08 = _mm_loadl_epi64((__m128i *)src0);
      const __m128i data18 = _mm_loadl_epi64((__m128i *)src1);
      const __m128i data28 = _mm_loadl_epi64((__m128i *)src2);
      const __m128i data38 = _mm_loadl_epi64((__m128i *)src3);

      // Zero-extend to 16-bit precision by interleaving with zeros; only the
      // low halves matter, since the loads left the upper halves zero.
      const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
      const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
      const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
      const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);

      // Multiply by coefficients
      const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
      const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
      const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
      const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);

      // Reduce horizontally and add
      const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
      const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
      const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);

      // Divide down by (1 << round), rounding to nearest.
      __m128i shifted =
          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);

      shifted = _mm_packus_epi32(shifted, shifted);
      // Write transposed to the output: dst is laid out column-major
      // (dst[y + x * h]), so the vertical taps are contiguous for the
      // second pass.
      _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
    }
    // Handle the remaining rows (h % 4 of them) with plain C code.
    for (; y < h; ++y) {
      const uint8_t *const src_row = src_col + y * src_stride;

      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < ntaps; ++k) {
        sum += filter[k] * src_row[k];
      }

      dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
    }
  }
}

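// Multiply-accumulate one row of eight 16-bit samples against the eight
// filter taps. The result holds four 32-bit lanes, each the sum of a pair of
// adjacent products; callers finish the reduction with _mm_hadd_epi32.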
static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
  const __m128i data = _mm_loadu_si128((__m128i *)src);
  return _mm_madd_epi16(data, coeff);
}

// A specialised version of vfilter, the vertical filter for
// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
                     int dst_stride, int w, int h, int subpel_y_qn,
                     int y_step_qn, const InterpFilterParams *filter_params,
                     const ConvolveParams *conv_params, int bd) {
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int ntaps = 8;

  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);

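  // sub32 removes the offsets accumulated so far: the 1 << offset_bits added
  // below via res_add_const, plus the horizontal pass's per-sample offset,
  // which the vertical convolution scales up to 1 << (offset_bits - 1).
  // After the shift by round_1 those are exactly the two powers of two
  // summed here.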
  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
                         (1 << (offset_bits - conv_params->round_1 - 1)));
  const __m128i sub = _mm_set1_epi16(sub32);

  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
  const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
  const __m128i round_shift_add =
      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);

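  // For the dist-weighted compound average, the two 16-bit weights are
  // interleaved so that one _mm_madd_epi16 against interleaved (ref, src)
  // pairs computes w0 * ref + w1 * src in each 32-bit lane.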
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16(w0);
  const __m128i wt1 = _mm_set1_epi16(w1);
  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);

  int y_qn = subpel_y_qn;
  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    assert(filter_idx < SUBPEL_SHIFTS);
    const int16_t *filter =
        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);

    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
    int x;
    for (x = 0; x <= w - 4; x += 4) {
      const int16_t *const src0 = src_y + x * src_stride;
      const int16_t *const src1 = src0 + 1 * src_stride;
      const int16_t *const src2 = src0 + 2 * src_stride;
      const int16_t *const src3 = src0 + 3 * src_stride;

      // Load and convolve the source data for four output pixels; each call
      // yields four 32-bit pair-sums (conv0..conv3) that are reduced below.
      const __m128i conv0 = convolve_16_8(src0, coeff0716);
      const __m128i conv1 = convolve_16_8(src1, coeff0716);
      const __m128i conv2 = convolve_16_8(src2, coeff0716);
      const __m128i conv3 = convolve_16_8(src3, coeff0716);

      // Now reduce horizontally to get one lane for each result
      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
      __m128i conv = _mm_hadd_epi32(conv01, conv23);

      conv = _mm_add_epi32(conv, res_add_const);
      // Divide down by (1 << round_1), rounding to nearest; sub32 is
      // subtracted later, in the store paths below.
      __m128i shifted =
          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);

      uint8_t *dst_x = dst + y * dst_stride + x;
      CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
      __m128i result;
      __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);

      if (conv_params->is_compound) {
        if (conv_params->do_average) {
          const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
          if (conv_params->use_dist_wtd_comp_avg) {
            const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
            const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
            const __m128i shifted_32 =
                _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
            shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
          } else {
            shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
          }
          const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
          result =
              _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
          const __m128i result_8 = _mm_packus_epi16(result, result);
          *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
        } else {
          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
        }
      } else {
        const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
        result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
        const __m128i result_8 = _mm_packus_epi16(result, result);
        *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
      }
    }
    // Handle the remaining columns with plain C code.
    for (; x < w; ++x) {
      const int16_t *src_x = src_y + x * src_stride;
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);

      if (conv_params->is_compound) {
        if (conv_params->do_average) {
          int32_t tmp = dst16[y * dst16_stride + x];
          if (conv_params->use_dist_wtd_comp_avg) {
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
            tmp = tmp >> DIST_PRECISION_BITS;
          } else {
            tmp += res;
            tmp = tmp >> 1;
          }
          /* Subtract round offset and convolve round */
          tmp = tmp - sub32;
          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
        } else {
          dst16[y * dst16_stride + x] = res;
        }
      } else {
        /* Subtract round offset and convolve round */
        int32_t tmp = res - sub32;
        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
      }
    }
  }
}

void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
                                  uint8_t *dst8, int dst8_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_x_qn, const int x_step_qn,
                                  const int subpel_y_qn, const int y_step_qn,
                                  ConvolveParams *conv_params) {
  // TODO(yaowu): remove unnecessary initializations
  int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 };
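  // The intermediate image must cover every row the vertical pass can touch:
  // the largest subpel position is (h - 1) * y_step_qn + subpel_y_qn, whose
  // integer part locates the last filter window, plus the filter length.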
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
             filter_params_y->taps;

  const int xtaps = filter_params_x->taps;
  const int ytaps = filter_params_y->taps;
  const int fo_vert = ytaps / 2 - 1;
  assert((xtaps == 8) && (ytaps == 8));
  (void)xtaps;

  // horizontal filter
  hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
           x_step_qn, filter_params_x, conv_params->round_0);

  // vertical filter (input is transposed)
  vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
           filter_params_y, conv_params, 8);
}

// A specialised version of hfilter, the horizontal filter for
// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
// filters.
static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
                            int w, int h, int subpel_x_qn, int x_step_qn,
                            const InterpFilterParams *filter_params,
                            unsigned round, int bd) {
  const int ntaps = 8;

  src -= ntaps / 2 - 1;

  // As in hfilter8, round_add32 combines the rounding bias with the
  // non-negativity offset that the vertical pass removes again.
  int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
  const __m128i round_add = _mm_set1_epi32(round_add32);
  const __m128i round_shift = _mm_cvtsi32_si128(round);

  int x_qn = subpel_x_qn;
  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
    const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
    const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    assert(filter_idx < SUBPEL_SHIFTS);
    const int16_t *filter =
        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);

    // Load the filter coefficients
    const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);

    int y;
    for (y = 0; y <= h - 4; y += 4) {
      const uint16_t *const src0 = src_col + y * src_stride;
      const uint16_t *const src1 = src0 + 1 * src_stride;
      const uint16_t *const src2 = src0 + 2 * src_stride;
      const uint16_t *const src3 = src0 + 3 * src_stride;

      // Load up source data. This is 16-bit input data, so each load gets
      // the 8 pixels we need.
      const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
      const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
      const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
      const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);

      // Multiply by coefficients
      const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
      const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
      const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
      const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);

      // Reduce horizontally and add
      const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
      const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
      const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);

      // Divide down by (1 << round), rounding to nearest.
      __m128i shifted =
          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);

      shifted = _mm_packus_epi32(shifted, shifted);
      // Write transposed to the output
      _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
    }
    for (; y < h; ++y) {
      const uint16_t *const src_row = src_col + y * src_stride;

      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < ntaps; ++k) {
        sum += filter[k] * src_row[k];
      }

      dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
    }
  }
}

// A specialised version of vfilter, the vertical filter for
// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
// filters.
static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
                            int dst_stride, int w, int h, int subpel_y_qn,
                            int y_step_qn,
                            const InterpFilterParams *filter_params,
                            const ConvolveParams *conv_params, int bd) {
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int ntaps = 8;

  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);

  // sub32 removes the accumulated offsets, exactly as in vfilter8, but is
  // kept in 32-bit lanes here.
  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
                         (1 << (offset_bits - conv_params->round_1 - 1)));
  const __m128i sub = _mm_set1_epi32(sub32);

  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const __m128i clip_pixel_ =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
  const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1));
  const __m128i round_shift_add =
      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
  // Note that round_bits is the same quantity as bits; the two sets of
  // constants simply mirror the two store paths below.
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  const __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));

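  // Unlike the 8-bit path, the high-bitdepth average is computed entirely in
  // 32-bit lanes, so the weights are broadcast per 32-bit lane and applied
  // with _mm_mullo_epi32 rather than interleaved for _mm_madd_epi16.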
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  int y_qn = subpel_y_qn;
  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    assert(filter_idx < SUBPEL_SHIFTS);
    const int16_t *filter =
        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);

    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
    int x;
    for (x = 0; x <= w - 4; x += 4) {
      const int16_t *const src0 = src_y + x * src_stride;
      const int16_t *const src1 = src0 + 1 * src_stride;
      const int16_t *const src2 = src0 + 2 * src_stride;
      const int16_t *const src3 = src0 + 3 * src_stride;

      // Load and convolve the source data for four output pixels; each call
      // yields four 32-bit pair-sums (conv0..conv3) that are reduced below.
      const __m128i conv0 = convolve_16_8(src0, coeff0716);
      const __m128i conv1 = convolve_16_8(src1, coeff0716);
      const __m128i conv2 = convolve_16_8(src2, coeff0716);
      const __m128i conv3 = convolve_16_8(src3, coeff0716);

      // Now reduce horizontally to get one lane for each result
      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
      __m128i conv = _mm_hadd_epi32(conv01, conv23);
      conv = _mm_add_epi32(conv, res_add_const);

      // Divide down by (1 << round_1), rounding to nearest; sub32 is
      // subtracted later, in the store paths below.
      __m128i shifted =
          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);

      uint16_t *dst_x = dst + y * dst_stride + x;
      CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;

      __m128i result;
      if (conv_params->is_compound) {
        if (conv_params->do_average) {
          __m128i p_32 =
              _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));

          if (conv_params->use_dist_wtd_comp_avg) {
            shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                    _mm_mullo_epi32(shifted, wt1));
            shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
          } else {
            shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1);
          }
          __m128i res32 = _mm_sub_epi32(shifted, sub);
          res32 = _mm_sra_epi32(_mm_add_epi32(res32, round_bits_const),
                                round_bits_shift);

          __m128i res16 = _mm_packus_epi32(res32, res32);
          res16 = _mm_min_epi16(res16, clip_pixel_);
          _mm_storel_epi64((__m128i *)dst_x, res16);
        } else {
          __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
        }
      } else {
        // Everything here is still in 32-bit lanes, so shift with
        // _mm_sra_epi32 (not the 16-bit form) before packing down.
        const __m128i subbed = _mm_sub_epi32(shifted, sub);
        result = _mm_sra_epi32(_mm_add_epi32(subbed, bits_const), bits_shift);
        result = _mm_packus_epi32(result, result);
        result = _mm_min_epi16(result, clip_pixel_);
        _mm_storel_epi64((__m128i *)dst_x, result);
      }
    }

    // Handle the remaining columns with plain C code.
    for (; x < w; ++x) {
      const int16_t *src_x = src_y + x * src_stride;
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->is_compound) {
        if (conv_params->do_average) {
          int32_t tmp = dst16[y * dst16_stride + x];
          if (conv_params->use_dist_wtd_comp_avg) {
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
            tmp = tmp >> DIST_PRECISION_BITS;
          } else {
            tmp += res;
            tmp = tmp >> 1;
          }
          /* Subtract round offset and convolve round */
          tmp = tmp - sub32;
          dst[y * dst_stride + x] =
              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
        } else {
          dst16[y * dst16_stride + x] = res;
        }
      } else {
        /* Subtract round offset and convolve round */
        int32_t tmp = res - sub32;
        dst[y * dst_stride + x] =
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
      }
    }
  }
}

void av1_highbd_convolve_2d_scale_sse4_1(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params, int bd) {
  // TODO(yaowu): Move this out of stack
  DECLARE_ALIGNED(16, int16_t,
                  tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
             filter_params_y->taps;
  const int xtaps = filter_params_x->taps;
  const int ytaps = filter_params_y->taps;
  const int fo_vert = ytaps / 2 - 1;

  memset(tmp, 0, sizeof(tmp));
  assert((xtaps == 8) && (ytaps == 8));
  (void)xtaps;

  // horizontal filter
  highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
                  subpel_x_qn, x_step_qn, filter_params_x,
                  conv_params->round_0, bd);

  // vertical filter (input is transposed)
  highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
                  filter_params_y, conv_params, bd);
}