| /* | 
 |  * Copyright (c) 2016, Alliance for Open Media. All rights reserved | 
 |  * | 
 |  * This source code is subject to the terms of the BSD 2 Clause License and | 
 |  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License | 
 |  * was not distributed with this source code in the LICENSE file, you can | 
 |  * obtain it at www.aomedia.org/license/software. If the Alliance for Open | 
 |  * Media Patent License 1.0 was not distributed with this source code in the | 
 |  * PATENTS file, you can obtain it at www.aomedia.org/license/patent. | 
 |  */ | 
 |  | 
 | #include <assert.h> | 
 | #include <smmintrin.h> | 
 |  | 
 | #include "./av1_rtcd.h" | 
 | #include "av1/common/filter.h" | 
 | #include "av1/common/x86/av1_highbd_convolve_filters_sse4.h" | 
 |  | 
 | typedef const int16_t (*HbdSubpelFilterCoeffs)[8]; | 
 |  | 
 | typedef void (*TransposeSave)(const int width, int pixelsNum, uint32_t *src, | 
 |                               int src_stride, uint16_t *dst, int dst_stride, | 
 |                               int bd); | 
 |  | 
 | static INLINE HbdSubpelFilterCoeffs | 
 | hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) { | 
 | #if CONFIG_DUAL_FILTER | 
 |   if (p.interp_filter == MULTITAP_SHARP) { | 
 |     return &sub_pel_filters_12sharp_highbd_ver_signal_dir[index][0]; | 
 |   } | 
 | #endif | 
 | #if USE_TEMPORALFILTER_12TAP | 
 |   if (p.interp_filter == TEMPORALFILTER_12TAP) { | 
 |     return &sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[index][0]; | 
 |   } | 
 | #endif | 
 |   (void)p; | 
 |   (void)index; | 
 |   return NULL; | 
 | } | 
 |  | 
 | // pixelsNum 0: write all 4 pixels | 
 | //           1/2/3: residual pixels 1/2/3 | 
 | static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst, | 
 |                        int dst_stride) { | 
 |   if (2 == width) { | 
 |     if (0 == pixelsNum) { | 
 |       *(int *)dst = _mm_cvtsi128_si32(u[0]); | 
 |       *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); | 
 |       *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]); | 
 |       *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]); | 
 |     } else if (1 == pixelsNum) { | 
 |       *(int *)dst = _mm_cvtsi128_si32(u[0]); | 
 |     } else if (2 == pixelsNum) { | 
 |       *(int *)dst = _mm_cvtsi128_si32(u[0]); | 
 |       *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); | 
 |     } else if (3 == pixelsNum) { | 
 |       *(int *)dst = _mm_cvtsi128_si32(u[0]); | 
 |       *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); | 
 |       *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]); | 
 |     } | 
 |   } else { | 
 |     if (0 == pixelsNum) { | 
 |       _mm_storel_epi64((__m128i *)dst, u[0]); | 
 |       _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); | 
 |       _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]); | 
 |       _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]); | 
 |     } else if (1 == pixelsNum) { | 
 |       _mm_storel_epi64((__m128i *)dst, u[0]); | 
 |     } else if (2 == pixelsNum) { | 
 |       _mm_storel_epi64((__m128i *)dst, u[0]); | 
 |       _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); | 
 |     } else if (3 == pixelsNum) { | 
 |       _mm_storel_epi64((__m128i *)dst, u[0]); | 
 |       _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); | 
 |       _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]); | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | // 16-bit pixels clip with bd (10/12) | 
 | static void highbd_clip(__m128i *p, int numVecs, int bd) { | 
 |   const __m128i zero = _mm_setzero_si128(); | 
 |   const __m128i one = _mm_set1_epi16(1); | 
 |   const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); | 
 |   __m128i clamped, mask; | 
 |   int i; | 
 |  | 
 |   for (i = 0; i < numVecs; i++) { | 
 |     mask = _mm_cmpgt_epi16(p[i], max); | 
 |     clamped = _mm_andnot_si128(mask, p[i]); | 
 |     mask = _mm_and_si128(mask, max); | 
 |     clamped = _mm_or_si128(mask, clamped); | 
 |     mask = _mm_cmpgt_epi16(clamped, zero); | 
 |     p[i] = _mm_and_si128(clamped, mask); | 
 |   } | 
 | } | 
 |  | 
 | static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) { | 
 |   __m128i v0, v1; | 
 |   __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); | 
 |  | 
 |   u[0] = _mm_loadu_si128((__m128i const *)src); | 
 |   u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride)); | 
 |   u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); | 
 |   u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); | 
 |  | 
 |   u[0] = _mm_add_epi32(u[0], rnd); | 
 |   u[1] = _mm_add_epi32(u[1], rnd); | 
 |   u[2] = _mm_add_epi32(u[2], rnd); | 
 |   u[3] = _mm_add_epi32(u[3], rnd); | 
 |  | 
 |   u[0] = _mm_srai_epi32(u[0], FILTER_BITS); | 
 |   u[1] = _mm_srai_epi32(u[1], FILTER_BITS); | 
 |   u[2] = _mm_srai_epi32(u[2], FILTER_BITS); | 
 |   u[3] = _mm_srai_epi32(u[3], FILTER_BITS); | 
 |  | 
 |   u[0] = _mm_packus_epi32(u[0], u[1]); | 
 |   u[1] = _mm_packus_epi32(u[2], u[3]); | 
 |  | 
 |   highbd_clip(u, 2, bd); | 
 |  | 
 |   v0 = _mm_unpacklo_epi16(u[0], u[1]); | 
 |   v1 = _mm_unpackhi_epi16(u[0], u[1]); | 
 |  | 
 |   u[0] = _mm_unpacklo_epi16(v0, v1); | 
 |   u[2] = _mm_unpackhi_epi16(v0, v1); | 
 |  | 
 |   u[1] = _mm_srli_si128(u[0], 8); | 
 |   u[3] = _mm_srli_si128(u[2], 8); | 
 | } | 
 |  | 
 | // pixelsNum = 0     : all 4 rows of pixels will be saved. | 
 | // pixelsNum = 1/2/3 : residual 1/2/4 rows of pixels will be saved. | 
 | void trans_save_4x4(const int width, int pixelsNum, uint32_t *src, | 
 |                     int src_stride, uint16_t *dst, int dst_stride, int bd) { | 
 |   __m128i u[4]; | 
 |   transClipPixel(src, src_stride, u, bd); | 
 |   writePixel(u, width, pixelsNum, dst, dst_stride); | 
 | } | 
 |  | 
 | void trans_accum_save_4x4(const int width, int pixelsNum, uint32_t *src, | 
 |                           int src_stride, uint16_t *dst, int dst_stride, | 
 |                           int bd) { | 
 |   __m128i u[4], v[4]; | 
 |   const __m128i ones = _mm_set1_epi16(1); | 
 |  | 
 |   transClipPixel(src, src_stride, u, bd); | 
 |  | 
 |   v[0] = _mm_loadl_epi64((__m128i const *)dst); | 
 |   v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride)); | 
 |   v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride)); | 
 |   v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride)); | 
 |  | 
 |   u[0] = _mm_add_epi16(u[0], v[0]); | 
 |   u[1] = _mm_add_epi16(u[1], v[1]); | 
 |   u[2] = _mm_add_epi16(u[2], v[2]); | 
 |   u[3] = _mm_add_epi16(u[3], v[3]); | 
 |  | 
 |   u[0] = _mm_add_epi16(u[0], ones); | 
 |   u[1] = _mm_add_epi16(u[1], ones); | 
 |   u[2] = _mm_add_epi16(u[2], ones); | 
 |   u[3] = _mm_add_epi16(u[3], ones); | 
 |  | 
 |   u[0] = _mm_srai_epi16(u[0], 1); | 
 |   u[1] = _mm_srai_epi16(u[1], 1); | 
 |   u[2] = _mm_srai_epi16(u[2], 1); | 
 |   u[3] = _mm_srai_epi16(u[3], 1); | 
 |  | 
 |   writePixel(u, width, pixelsNum, dst, dst_stride); | 
 | } | 
 |  | 
 | static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 }; | 
 |  | 
 | static INLINE void transpose_pair(__m128i *in, __m128i *out) { | 
 |   __m128i x0, x1; | 
 |  | 
 |   x0 = _mm_unpacklo_epi32(in[0], in[1]); | 
 |   x1 = _mm_unpacklo_epi32(in[2], in[3]); | 
 |  | 
 |   out[0] = _mm_unpacklo_epi64(x0, x1); | 
 |   out[1] = _mm_unpackhi_epi64(x0, x1); | 
 |  | 
 |   x0 = _mm_unpackhi_epi32(in[0], in[1]); | 
 |   x1 = _mm_unpackhi_epi32(in[2], in[3]); | 
 |  | 
 |   out[2] = _mm_unpacklo_epi64(x0, x1); | 
 |   out[3] = _mm_unpackhi_epi64(x0, x1); | 
 |  | 
 |   x0 = _mm_unpacklo_epi32(in[4], in[5]); | 
 |   x1 = _mm_unpacklo_epi32(in[6], in[7]); | 
 |  | 
 |   out[4] = _mm_unpacklo_epi64(x0, x1); | 
 |   out[5] = _mm_unpackhi_epi64(x0, x1); | 
 | } | 
 |  | 
 | static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f, | 
 |                                 int tapsNum, uint32_t *buf) { | 
 |   __m128i u[8], v[6]; | 
 |  | 
 |   if (tapsNum == 10) { | 
 |     src -= 1; | 
 |   } | 
 |  | 
 |   u[0] = _mm_loadu_si128((__m128i const *)src); | 
 |   u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride)); | 
 |   u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); | 
 |   u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); | 
 |  | 
 |   u[4] = _mm_loadu_si128((__m128i const *)(src + 8)); | 
 |   u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8)); | 
 |   u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8)); | 
 |   u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8)); | 
 |  | 
 |   transpose_pair(u, v); | 
 |  | 
 |   u[0] = _mm_madd_epi16(v[0], f[0]); | 
 |   u[1] = _mm_madd_epi16(v[1], f[1]); | 
 |   u[2] = _mm_madd_epi16(v[2], f[2]); | 
 |   u[3] = _mm_madd_epi16(v[3], f[3]); | 
 |   u[4] = _mm_madd_epi16(v[4], f[4]); | 
 |   u[5] = _mm_madd_epi16(v[5], f[5]); | 
 |  | 
 |   u[6] = _mm_min_epi32(u[2], u[3]); | 
 |   u[7] = _mm_max_epi32(u[2], u[3]); | 
 |  | 
 |   u[0] = _mm_add_epi32(u[0], u[1]); | 
 |   u[0] = _mm_add_epi32(u[0], u[5]); | 
 |   u[0] = _mm_add_epi32(u[0], u[4]); | 
 |   u[0] = _mm_add_epi32(u[0], u[6]); | 
 |   u[0] = _mm_add_epi32(u[0], u[7]); | 
 |  | 
 |   _mm_storeu_si128((__m128i *)buf, u[0]); | 
 | } | 
 |  | 
 | void av1_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride, | 
 |                                       uint16_t *dst, int dst_stride, int w, | 
 |                                       int h, | 
 |                                       const InterpFilterParams filter_params, | 
 |                                       const int subpel_x_q4, int x_step_q4, | 
 |                                       int avg, int bd) { | 
 |   DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]); | 
 |   __m128i verf[6]; | 
 |   HbdSubpelFilterCoeffs vCoeffs; | 
 |   const uint16_t *srcPtr; | 
 |   const int tapsNum = filter_params.taps; | 
 |   int i, col, count, blkResidu, blkHeight; | 
 |   TransposeSave transSave = transSaveTab[avg]; | 
 |   (void)x_step_q4; | 
 |  | 
 |   if (0 == subpel_x_q4 || 16 != x_step_q4) { | 
 |     av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, | 
 |                                 filter_params, subpel_x_q4, x_step_q4, avg, bd); | 
 |     return; | 
 |   } | 
 |  | 
 |   vCoeffs = | 
 |       hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1); | 
 |   if (!vCoeffs) { | 
 |     av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, | 
 |                                 filter_params, subpel_x_q4, x_step_q4, avg, bd); | 
 |     return; | 
 |   } | 
 |  | 
 |   verf[0] = *((const __m128i *)(vCoeffs)); | 
 |   verf[1] = *((const __m128i *)(vCoeffs + 1)); | 
 |   verf[2] = *((const __m128i *)(vCoeffs + 2)); | 
 |   verf[3] = *((const __m128i *)(vCoeffs + 3)); | 
 |   verf[4] = *((const __m128i *)(vCoeffs + 4)); | 
 |   verf[5] = *((const __m128i *)(vCoeffs + 5)); | 
 |  | 
 |   src -= (tapsNum >> 1) - 1; | 
 |   srcPtr = src; | 
 |  | 
 |   count = 0; | 
 |   blkHeight = h >> 2; | 
 |   blkResidu = h & 3; | 
 |  | 
 |   while (blkHeight != 0) { | 
 |     for (col = 0; col < w; col += 4) { | 
 |       for (i = 0; i < 4; ++i) { | 
 |         highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4)); | 
 |         srcPtr += 1; | 
 |       } | 
 |       transSave(w, 0, temp, 4, dst + col, dst_stride, bd); | 
 |     } | 
 |     count++; | 
 |     srcPtr = src + count * src_stride * 4; | 
 |     dst += dst_stride * 4; | 
 |     blkHeight--; | 
 |   } | 
 |  | 
 |   if (blkResidu == 0) return; | 
 |  | 
 |   for (col = 0; col < w; col += 4) { | 
 |     for (i = 0; i < 4; ++i) { | 
 |       highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4)); | 
 |       srcPtr += 1; | 
 |     } | 
 |     transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd); | 
 |   } | 
 | } | 
 |  | 
 | // Vertical convolutional filter | 
 |  | 
 | typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst); | 
 |  | 
 | static void highbdRndingPacks(__m128i *u) { | 
 |   __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); | 
 |   u[0] = _mm_add_epi32(u[0], rnd); | 
 |   u[0] = _mm_srai_epi32(u[0], FILTER_BITS); | 
 |   u[0] = _mm_packus_epi32(u[0], u[0]); | 
 | } | 
 |  | 
 | static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) { | 
 |   highbdRndingPacks(u); | 
 |   highbd_clip(u, 1, bd); | 
 |   *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]); | 
 | } | 
 |  | 
 | static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) { | 
 |   __m128i v = _mm_loadl_epi64((__m128i const *)dst); | 
 |   const __m128i ones = _mm_set1_epi16(1); | 
 |  | 
 |   highbdRndingPacks(u); | 
 |   highbd_clip(u, 1, bd); | 
 |  | 
 |   v = _mm_add_epi16(v, u[0]); | 
 |   v = _mm_add_epi16(v, ones); | 
 |   v = _mm_srai_epi16(v, 1); | 
 |   *(uint32_t *)dst = _mm_cvtsi128_si32(v); | 
 | } | 
 |  | 
 | WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum }; | 
 |  | 
 | static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) { | 
 |   highbdRndingPacks(u); | 
 |   highbd_clip(u, 1, bd); | 
 |   _mm_storel_epi64((__m128i *)dst, u[0]); | 
 | } | 
 |  | 
 | static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) { | 
 |   __m128i v = _mm_loadl_epi64((__m128i const *)dst); | 
 |   const __m128i ones = _mm_set1_epi16(1); | 
 |  | 
 |   highbdRndingPacks(u); | 
 |   highbd_clip(u, 1, bd); | 
 |  | 
 |   v = _mm_add_epi16(v, u[0]); | 
 |   v = _mm_add_epi16(v, ones); | 
 |   v = _mm_srai_epi16(v, 1); | 
 |   _mm_storel_epi64((__m128i *)dst, v); | 
 | } | 
 |  | 
 | WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum }; | 
 |  | 
 | static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride, | 
 |                                        const __m128i *f, int taps, | 
 |                                        uint16_t *dst, WritePixels saveFunc, | 
 |                                        int bd) { | 
 |   __m128i s[12]; | 
 |   __m128i zero = _mm_setzero_si128(); | 
 |   int i = 0; | 
 |   int r = 0; | 
 |  | 
 |   // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case | 
 |   if (10 == taps) { | 
 |     i += 1; | 
 |     s[0] = zero; | 
 |   } | 
 |   while (i < 12) { | 
 |     s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride)); | 
 |     i += 1; | 
 |     r += 1; | 
 |   } | 
 |  | 
 |   s[0] = _mm_unpacklo_epi16(s[0], s[1]); | 
 |   s[2] = _mm_unpacklo_epi16(s[2], s[3]); | 
 |   s[4] = _mm_unpacklo_epi16(s[4], s[5]); | 
 |   s[6] = _mm_unpacklo_epi16(s[6], s[7]); | 
 |   s[8] = _mm_unpacklo_epi16(s[8], s[9]); | 
 |   s[10] = _mm_unpacklo_epi16(s[10], s[11]); | 
 |  | 
 |   s[0] = _mm_madd_epi16(s[0], f[0]); | 
 |   s[2] = _mm_madd_epi16(s[2], f[1]); | 
 |   s[4] = _mm_madd_epi16(s[4], f[2]); | 
 |   s[6] = _mm_madd_epi16(s[6], f[3]); | 
 |   s[8] = _mm_madd_epi16(s[8], f[4]); | 
 |   s[10] = _mm_madd_epi16(s[10], f[5]); | 
 |  | 
 |   s[1] = _mm_min_epi32(s[4], s[6]); | 
 |   s[3] = _mm_max_epi32(s[4], s[6]); | 
 |  | 
 |   s[0] = _mm_add_epi32(s[0], s[2]); | 
 |   s[0] = _mm_add_epi32(s[0], s[10]); | 
 |   s[0] = _mm_add_epi32(s[0], s[8]); | 
 |   s[0] = _mm_add_epi32(s[0], s[1]); | 
 |   s[0] = _mm_add_epi32(s[0], s[3]); | 
 |  | 
 |   saveFunc(s, bd, dst); | 
 | } | 
 |  | 
 | static void highbd_filter_vert_compute_large(const uint16_t *src, | 
 |                                              int src_stride, const __m128i *f, | 
 |                                              int taps, int w, int h, | 
 |                                              uint16_t *dst, int dst_stride, | 
 |                                              int avg, int bd) { | 
 |   int col; | 
 |   int rowIndex = 0; | 
 |   const uint16_t *src_ptr = src; | 
 |   uint16_t *dst_ptr = dst; | 
 |   const int step = 4; | 
 |   WritePixels write4pixels = write4pixelsTab[avg]; | 
 |  | 
 |   do { | 
 |     for (col = 0; col < w; col += step) { | 
 |       filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr, | 
 |                                  write4pixels, bd); | 
 |       src_ptr += step; | 
 |       dst_ptr += step; | 
 |     } | 
 |     rowIndex++; | 
 |     src_ptr = src + rowIndex * src_stride; | 
 |     dst_ptr = dst + rowIndex * dst_stride; | 
 |   } while (rowIndex < h); | 
 | } | 
 |  | 
 | static void highbd_filter_vert_compute_small(const uint16_t *src, | 
 |                                              int src_stride, const __m128i *f, | 
 |                                              int taps, int w, int h, | 
 |                                              uint16_t *dst, int dst_stride, | 
 |                                              int avg, int bd) { | 
 |   int rowIndex = 0; | 
 |   WritePixels write2pixels = write2pixelsTab[avg]; | 
 |   (void)w; | 
 |  | 
 |   do { | 
 |     filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd); | 
 |     rowIndex++; | 
 |     src += src_stride; | 
 |     dst += dst_stride; | 
 |   } while (rowIndex < h); | 
 | } | 
 |  | 
 | void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride, | 
 |                                      uint16_t *dst, int dst_stride, int w, | 
 |                                      int h, | 
 |                                      const InterpFilterParams filter_params, | 
 |                                      const int subpel_y_q4, int y_step_q4, | 
 |                                      int avg, int bd) { | 
 |   __m128i verf[6]; | 
 |   HbdSubpelFilterCoeffs vCoeffs; | 
 |   const int tapsNum = filter_params.taps; | 
 |  | 
 |   if (0 == subpel_y_q4 || 16 != y_step_q4) { | 
 |     av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, | 
 |                                filter_params, subpel_y_q4, y_step_q4, avg, bd); | 
 |     return; | 
 |   } | 
 |  | 
 |   vCoeffs = | 
 |       hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1); | 
 |   if (!vCoeffs) { | 
 |     av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, | 
 |                                filter_params, subpel_y_q4, y_step_q4, avg, bd); | 
 |     return; | 
 |   } | 
 |  | 
 |   verf[0] = *((const __m128i *)(vCoeffs)); | 
 |   verf[1] = *((const __m128i *)(vCoeffs + 1)); | 
 |   verf[2] = *((const __m128i *)(vCoeffs + 2)); | 
 |   verf[3] = *((const __m128i *)(vCoeffs + 3)); | 
 |   verf[4] = *((const __m128i *)(vCoeffs + 4)); | 
 |   verf[5] = *((const __m128i *)(vCoeffs + 5)); | 
 |  | 
 |   src -= src_stride * ((tapsNum >> 1) - 1); | 
 |  | 
 |   if (w > 2) { | 
 |     highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst, | 
 |                                      dst_stride, avg, bd); | 
 |   } else { | 
 |     highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst, | 
 |                                      dst_stride, avg, bd); | 
 |   } | 
 | } |