| /* |
| * Copyright (c) 2016 The WebM project authors. All Rights Reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| #include <assert.h> |
| #include <smmintrin.h> |
| |
| #include "./vp10_rtcd.h" |
| #include "vp10/common/filter.h" |
| |
// Signature of the 4x4 transpose-and-store routines dispatched through
// transSaveTab. Arguments, in order: block width, residual-row selector
// (pixelsNum), 32-bit source buffer and its stride, 16-bit destination and
// its stride, and the bit depth.
typedef void (*TransposeSave)(const int width,
                              int pixelsNum,
                              uint32_t *src,
                              int src_stride,
                              uint16_t *dst,
                              int dst_stride,
                              int bd);
| |
// Stores rows of the 4x4 block held in u[0..3] to dst.
//   pixelsNum == 0     : all 4 rows are written.
//   pixelsNum == 1/2/3 : only that many leading rows are written.
//   any other value    : nothing is written.
// When width == 2 the low 32 bits of each row vector (two 16-bit pixels) are
// stored; for any other width the low 64 bits (four pixels) are stored.
static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
                       int dst_stride) {
  int rowCount;
  int row;

  if (0 == pixelsNum) {
    rowCount = 4;
  } else if (pixelsNum >= 1 && pixelsNum <= 3) {
    rowCount = pixelsNum;
  } else {
    return;
  }

  for (row = 0; row < rowCount; ++row) {
    uint16_t *const out = dst + row * dst_stride;
    if (2 == width) {
      *(int *)out = _mm_cvtsi128_si32(u[row]);
    } else {
      _mm_storel_epi64((__m128i *)out, u[row]);
    }
  }
}
| |
// Clamps every 16-bit lane of p[0..numVecs-1], treated as a signed value, to
// the range [0, (1 << bd) - 1] (bd is expected to be 10 or 12).
static void highbd_clip(__m128i *p, int numVecs, int bd) {
  const __m128i lower = _mm_setzero_si128();
  const __m128i unit = _mm_set1_epi16(1);
  // Largest legal pixel value, built lane-wise as (1 << bd) - 1.
  const __m128i upper = _mm_sub_epi16(_mm_slli_epi16(unit, bd), unit);
  int idx;

  for (idx = 0; idx < numVecs; ++idx) {
    // Signed min against the ceiling, then signed max against zero.
    p[idx] = _mm_max_epi16(_mm_min_epi16(p[idx], upper), lower);
  }
}
| |
| static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) { |
| __m128i v0, v1; |
| __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); |
| |
| u[0] = _mm_loadu_si128((__m128i const *)src); |
| u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride)); |
| u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); |
| u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); |
| |
| u[0] = _mm_add_epi32(u[0], rnd); |
| u[1] = _mm_add_epi32(u[1], rnd); |
| u[2] = _mm_add_epi32(u[2], rnd); |
| u[3] = _mm_add_epi32(u[3], rnd); |
| |
| u[0] = _mm_srai_epi32(u[0], FILTER_BITS); |
| u[1] = _mm_srai_epi32(u[1], FILTER_BITS); |
| u[2] = _mm_srai_epi32(u[2], FILTER_BITS); |
| u[3] = _mm_srai_epi32(u[3], FILTER_BITS); |
| |
| u[0] = _mm_packus_epi32(u[0], u[1]); |
| u[1] = _mm_packus_epi32(u[2], u[3]); |
| |
| highbd_clip(u, 2, bd); |
| |
| v0 = _mm_unpacklo_epi16(u[0], u[1]); |
| v1 = _mm_unpackhi_epi16(u[0], u[1]); |
| |
| u[0] = _mm_unpacklo_epi16(v0, v1); |
| u[2] = _mm_unpackhi_epi16(v0, v1); |
| |
| u[1] = _mm_srli_si128(u[0], 8); |
| u[3] = _mm_srli_si128(u[2], 8); |
| } |
| |
// Rounds, clips and transposes a 4x4 block of 32-bit filter sums, then stores
// the result to dst without blending.
// pixelsNum = 0     : all 4 rows of pixels will be saved.
// pixelsNum = 1/2/3 : only the residual 1/2/3 rows of pixels will be saved.
void trans_save_4x4(const int width, int pixelsNum, uint32_t *src,
                    int src_stride, uint16_t *dst, int dst_stride, int bd) {
  __m128i rows[4];

  transClipPixel(src, src_stride, rows, bd);
  writePixel(rows, width, pixelsNum, dst, dst_stride);
}
| |
// Rounds, clips and transposes a 4x4 block of 32-bit filter sums, averages it
// with the pixels already in dst as (new + old + 1) >> 1, and stores the
// result back to dst.
// pixelsNum = 0     : all 4 rows of pixels are averaged and saved.
// pixelsNum = 1/2/3 : only the residual 1/2/3 rows are averaged and saved.
// Only the destination rows/pixels that will be written back are read, so no
// memory outside the block being updated is touched on the dst side.
void trans_accum_save_4x4(const int width, int pixelsNum, uint32_t *src,
                          int src_stride, uint16_t *dst, int dst_stride,
                          int bd) {
  __m128i u[4];
  const __m128i ones = _mm_set1_epi16(1);
  // Number of rows writePixel will store for this pixelsNum.
  const int rows = (0 == pixelsNum) ? 4 : pixelsNum;
  int r;

  transClipPixel(src, src_stride, u, bd);

  for (r = 0; r < 4 && r < rows; ++r) {
    const uint16_t *const row = dst + r * dst_stride;
    // width == 2 stores only two pixels per row, so load only 32 bits then.
    const __m128i prev = (2 == width)
                             ? _mm_cvtsi32_si128(*(const int *)row)
                             : _mm_loadl_epi64((__m128i const *)row);

    u[r] = _mm_add_epi16(u[r], prev);
    u[r] = _mm_add_epi16(u[r], ones);
    u[r] = _mm_srai_epi16(u[r], 1);
  }

  writePixel(u, width, pixelsNum, dst, dst_stride);
}
| |
| static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 }; |
| |
| static INLINE void transpose_pair(__m128i *in, __m128i *out) { |
| __m128i x0, x1; |
| |
| x0 = _mm_unpacklo_epi32(in[0], in[1]); |
| x1 = _mm_unpacklo_epi32(in[2], in[3]); |
| |
| out[0] = _mm_unpacklo_epi64(x0, x1); |
| out[1] = _mm_unpackhi_epi64(x0, x1); |
| |
| x0 = _mm_unpackhi_epi32(in[0], in[1]); |
| x1 = _mm_unpackhi_epi32(in[2], in[3]); |
| |
| out[2] = _mm_unpacklo_epi64(x0, x1); |
| out[3] = _mm_unpackhi_epi64(x0, x1); |
| |
| x0 = _mm_unpacklo_epi32(in[4], in[5]); |
| x1 = _mm_unpacklo_epi32(in[6], in[7]); |
| |
| out[4] = _mm_unpacklo_epi64(x0, x1); |
| out[5] = _mm_unpackhi_epi64(x0, x1); |
| } |
| |
// Computes one horizontally filtered sample for each of four consecutive
// rows and writes the four 32-bit sums (unrounded) to buf[0..3].
//   src / src_stride : top-left input pixel; 16 pixels are loaded per row.
//   f                : six coefficient vectors, one per pair of taps.
//   tapsNum          : filter length; for 10 taps the window starts one pixel
//                      earlier (presumably the coefficient table is padded to
//                      12 entries with a leading zero - confirm in filter.c).
//   buf              : receives four 32-bit sums, one per row.
static void highbd_filter_horiz(const uint16_t *src, int src_stride,
                                const __m128i *f, int tapsNum, uint32_t *buf) {
  __m128i u[8], v[6];
  __m128i sum;
  int row, k;

  if (tapsNum == 10) {
    src -= 1;
  }

  // u[0..3]: pixels 0..7 of each row; u[4..7]: pixels 8..15 of each row.
  for (row = 0; row < 4; ++row) {
    const uint16_t *const p = src + row * src_stride;
    u[row] = _mm_loadu_si128((__m128i const *)p);
    u[row + 4] = _mm_loadu_si128((__m128i const *)(p + 8));
  }

  // v[k] holds pixel pair (2k, 2k+1) of all four rows.
  transpose_pair(u, v);

  // Accumulate the six pair products. _mm_add_epi32 wraps instead of
  // saturating, so the summation order cannot change the result and no
  // min/max reordering of the partial sums is required.
  sum = _mm_madd_epi16(v[0], f[0]);
  for (k = 1; k < 6; ++k) {
    sum = _mm_add_epi32(sum, _mm_madd_epi16(v[k], f[k]));
  }

  _mm_storeu_si128((__m128i *)buf, sum);
}
| |
| void vp10_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride, |
| uint16_t *dst, int dst_stride, int w, |
| int h, |
| const InterpFilterParams filter_params, |
| const int subpel_x_q4, int x_step_q4, |
| int avg, int bd) { |
| DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]); |
| __m128i verf[6]; |
| HbdSubpelFilterCoeffs vCoeffs; |
| const uint16_t *srcPtr; |
| const int tapsNum = filter_params.taps; |
| int i, col, count, blkResidu, blkHeight; |
| TransposeSave transSave = transSaveTab[avg]; |
| (void)x_step_q4; |
| |
| if (0 == subpel_x_q4 || 16 != x_step_q4) { |
| vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, |
| filter_params, subpel_x_q4, x_step_q4, avg, |
| bd); |
| return; |
| } |
| |
| vCoeffs = |
| vp10_hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1); |
| if (!vCoeffs) { |
| vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, |
| filter_params, subpel_x_q4, x_step_q4, avg, |
| bd); |
| return; |
| } |
| |
| verf[0] = *((const __m128i *)(vCoeffs)); |
| verf[1] = *((const __m128i *)(vCoeffs + 1)); |
| verf[2] = *((const __m128i *)(vCoeffs + 2)); |
| verf[3] = *((const __m128i *)(vCoeffs + 3)); |
| verf[4] = *((const __m128i *)(vCoeffs + 4)); |
| verf[5] = *((const __m128i *)(vCoeffs + 5)); |
| |
| src -= (tapsNum >> 1) - 1; |
| srcPtr = src; |
| |
| count = 0; |
| blkHeight = h >> 2; |
| blkResidu = h & 3; |
| |
| while (blkHeight != 0) { |
| for (col = 0; col < w; col += 4) { |
| for (i = 0; i < 4; ++i) { |
| highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4)); |
| srcPtr += 1; |
| } |
| transSave(w, 0, temp, 4, dst + col, dst_stride, bd); |
| } |
| count++; |
| srcPtr = src + count * src_stride * 4; |
| dst += dst_stride * 4; |
| blkHeight--; |
| } |
| |
| if (blkResidu == 0) return; |
| |
| for (col = 0; col < w; col += 4) { |
| for (i = 0; i < 4; ++i) { |
| highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4)); |
| srcPtr += 1; |
| } |
| transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd); |
| } |
| } |
| |
| // Vertical convolutional filter |
| |
// Signature of the per-row store routines used by the vertical filter:
// u[0] carries the 32-bit filter sums, bd is the bit depth and dst is the
// output location.
typedef void (*WritePixels)(__m128i *u,
                            int bd,
                            uint16_t *dst);
| |
| static void highbdRndingPacks(__m128i *u) { |
| __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); |
| u[0] = _mm_add_epi32(u[0], rnd); |
| u[0] = _mm_srai_epi32(u[0], FILTER_BITS); |
| u[0] = _mm_packus_epi32(u[0], u[0]); |
| } |
| |
// Rounds, packs and clips the sums in u[0], then stores the first two pixels
// (32 bits) to dst, overwriting what was there.
static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
  uint32_t *const out = (uint32_t *)dst;

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  *out = _mm_cvtsi128_si32(u[0]);
}
| |
// Rounds, packs and clips the sums in u[0], averages the first two pixels
// with the two pixels already at dst as (new + old + 1) >> 1, and stores the
// two averaged pixels (32 bits) back to dst.
static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  // Only two pixels belong to this block, so read exactly 32 bits; a 64-bit
  // load would touch two pixels past the end of the row.
  __m128i v = _mm_cvtsi32_si128((int)*(const uint32_t *)dst);
  const __m128i ones = _mm_set1_epi16(1);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  v = _mm_add_epi16(v, u[0]);
  v = _mm_add_epi16(v, ones);
  v = _mm_srai_epi16(v, 1);
  *(uint32_t *)dst = _mm_cvtsi128_si32(v);
}
| |
| WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum }; |
| |
// Rounds, packs and clips the sums in u[0], then stores four pixels (64 bits)
// to dst, overwriting what was there.
static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
  __m128i *const out = (__m128i *)dst;

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  _mm_storel_epi64(out, u[0]);
}
| |
// Rounds, packs and clips the sums in u[0], averages the four pixels with the
// four pixels already at dst as (new + old + 1) >> 1, and stores the result
// (64 bits) back to dst.
static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  const __m128i prev = _mm_loadl_epi64((__m128i const *)dst);
  __m128i total;

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  total = _mm_add_epi16(_mm_add_epi16(prev, u[0]), _mm_set1_epi16(1));
  _mm_storel_epi64((__m128i *)dst, _mm_srai_epi16(total, 1));
}
| |
| WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum }; |
| |
| static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride, |
| const __m128i *f, int taps, |
| uint16_t *dst, WritePixels saveFunc, |
| int bd) { |
| __m128i s[12]; |
| __m128i zero = _mm_setzero_si128(); |
| int i = 0; |
| int r = 0; |
| |
| // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case |
| if (10 == taps) { |
| i += 1; |
| s[0] = zero; |
| } |
| while (i < 12) { |
| s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride)); |
| i += 1; |
| r += 1; |
| } |
| |
| s[0] = _mm_unpacklo_epi16(s[0], s[1]); |
| s[2] = _mm_unpacklo_epi16(s[2], s[3]); |
| s[4] = _mm_unpacklo_epi16(s[4], s[5]); |
| s[6] = _mm_unpacklo_epi16(s[6], s[7]); |
| s[8] = _mm_unpacklo_epi16(s[8], s[9]); |
| s[10] = _mm_unpacklo_epi16(s[10], s[11]); |
| |
| s[0] = _mm_madd_epi16(s[0], f[0]); |
| s[2] = _mm_madd_epi16(s[2], f[1]); |
| s[4] = _mm_madd_epi16(s[4], f[2]); |
| s[6] = _mm_madd_epi16(s[6], f[3]); |
| s[8] = _mm_madd_epi16(s[8], f[4]); |
| s[10] = _mm_madd_epi16(s[10], f[5]); |
| |
| s[1] = _mm_min_epi32(s[4], s[6]); |
| s[3] = _mm_max_epi32(s[4], s[6]); |
| |
| s[0] = _mm_add_epi32(s[0], s[2]); |
| s[0] = _mm_add_epi32(s[0], s[10]); |
| s[0] = _mm_add_epi32(s[0], s[8]); |
| s[0] = _mm_add_epi32(s[0], s[1]); |
| s[0] = _mm_add_epi32(s[0], s[3]); |
| |
| saveFunc(s, bd, dst); |
| } |
| |
| static void highbd_filter_vert_compute_large(const uint16_t *src, |
| int src_stride, const __m128i *f, |
| int taps, int w, int h, |
| uint16_t *dst, int dst_stride, |
| int avg, int bd) { |
| int col; |
| int rowIndex = 0; |
| const uint16_t *src_ptr = src; |
| uint16_t *dst_ptr = dst; |
| const int step = 4; |
| WritePixels write4pixels = write4pixelsTab[avg]; |
| |
| do { |
| for (col = 0; col < w; col += step) { |
| filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr, |
| write4pixels, bd); |
| src_ptr += step; |
| dst_ptr += step; |
| } |
| rowIndex++; |
| src_ptr = src + rowIndex * src_stride; |
| dst_ptr = dst + rowIndex * dst_stride; |
| } while (rowIndex < h); |
| } |
| |
| static void highbd_filter_vert_compute_small(const uint16_t *src, |
| int src_stride, const __m128i *f, |
| int taps, int w, int h, |
| uint16_t *dst, int dst_stride, |
| int avg, int bd) { |
| int rowIndex = 0; |
| WritePixels write2pixels = write2pixelsTab[avg]; |
| (void)w; |
| |
| do { |
| filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd); |
| rowIndex++; |
| src += src_stride; |
| dst += dst_stride; |
| } while (rowIndex < h); |
| } |
| |
| void vp10_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride, |
| uint16_t *dst, int dst_stride, int w, |
| int h, |
| const InterpFilterParams filter_params, |
| const int subpel_y_q4, int y_step_q4, |
| int avg, int bd) { |
| __m128i verf[6]; |
| HbdSubpelFilterCoeffs vCoeffs; |
| const int tapsNum = filter_params.taps; |
| |
| if (0 == subpel_y_q4 || 16 != y_step_q4) { |
| vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, |
| filter_params, subpel_y_q4, y_step_q4, avg, bd); |
| return; |
| } |
| |
| vCoeffs = |
| vp10_hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1); |
| if (!vCoeffs) { |
| vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, |
| filter_params, subpel_y_q4, y_step_q4, avg, bd); |
| return; |
| } |
| |
| verf[0] = *((const __m128i *)(vCoeffs)); |
| verf[1] = *((const __m128i *)(vCoeffs + 1)); |
| verf[2] = *((const __m128i *)(vCoeffs + 2)); |
| verf[3] = *((const __m128i *)(vCoeffs + 3)); |
| verf[4] = *((const __m128i *)(vCoeffs + 4)); |
| verf[5] = *((const __m128i *)(vCoeffs + 5)); |
| |
| src -= src_stride * ((tapsNum >> 1) - 1); |
| |
| if (w > 2) { |
| highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst, |
| dst_stride, avg, bd); |
| } else { |
| highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst, |
| dst_stride, avg, bd); |
| } |
| } |