/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_
#define AOM_DSP_X86_CONVOLVE_AVX2_H_

#include <assert.h>
#include <immintrin.h>

// Byte-shuffle masks that gather the input pixel pairs for each pair of
// filter taps of a 16-pixel-wide horizontal 8-tap filter (used with
// _mm256_shuffle_epi8 in convolve_lowbd_x).
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};

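// Loads the 8-tap subpel filter selected by subpel_q4, halves every
// coefficient (they are all even) and packs each adjacent coefficient pair,
// replicated across all lanes of coeffs[0..3], in the byte layout expected by
// _mm256_maddubs_epi16.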
static INLINE void prepare_coeffs_lowbd(
    const InterpFilterParams *const filter_params, const int subpel_q4,
    __m256i *const coeffs /* [4] */) {
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      *filter_params, subpel_q4 & SUBPEL_MASK);
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);

  // Right shift all filter coefficients by 1 to reduce the number of bits
  // required. This extra right shift is compensated for at the end when the
  // result is rounded. Since all filter coefficients are even, halving them
  // does not change the end result.
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
                            _mm_set1_epi16(0xffff)));

  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
}

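// Loads the 8-tap subpel filter selected by subpel_q4 at full 16-bit
// precision and replicates each adjacent coefficient pair across every 32-bit
// lane of coeffs[0..3], in the layout expected by _mm256_madd_epi16.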
static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                  const int subpel_q4,
                                  __m256i *const coeffs /* [4] */) {
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
      *filter_params, subpel_q4 & SUBPEL_MASK);

  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
}

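// 8-tap filtering of 8-bit pixels: s[0..3] hold the interleaved input byte
// pairs that match coeffs[0..3] (as produced by prepare_coeffs_lowbd).
// Returns the 16-bit filter sums.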
static INLINE __m256i convolve_lowbd(const __m256i *const s,
                                     const __m256i *const coeffs) {
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);

  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
                                       _mm256_add_epi16(res_23, res_67));

  return res;
}

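// 8-tap filtering of 16-bit samples: s[0..3] hold the interleaved sample
// pairs that match coeffs[0..3] (as produced by prepare_coeffs). Returns the
// 32-bit filter sums.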
static INLINE __m256i convolve(const __m256i *const s,
                               const __m256i *const coeffs) {
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);

  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
                                       _mm256_add_epi32(res_2, res_3));

  return res;
}

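// Horizontal 8-tap filtering of a row of 8-bit pixels: gathers the input byte
// pairs with the filt[0..3] shuffle masks (filt1..filt4_global_avx2 above)
// and accumulates them with convolve_lowbd.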
static INLINE __m256i convolve_lowbd_x(const __m256i data,
                                       const __m256i *const coeffs,
                                       const __m256i *const filt) {
  __m256i s[4];

  s[0] = _mm256_shuffle_epi8(data, filt[0]);
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
  s[3] = _mm256_shuffle_epi8(data, filt[3]);

  return convolve_lowbd(s, coeffs);
}

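// Stores *res to the 32-byte-aligned dst buffer; when do_average is set, the
// value already in dst is averaged with *res first.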
static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
                                         const __m256i *const res,
                                         const int do_average) {
  __m256i d;
  if (do_average) {
    d = _mm256_load_si256((__m256i *)dst);
    d = _mm256_add_epi32(d, *res);
    d = _mm256_srai_epi32(d, 1);
  } else {
    d = *res;
  }
  _mm256_store_si256((__m256i *)dst, d);
}

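// Stores *res to the 32-byte-aligned dst buffer; when do_average is set, the
// value already in dst is blended with *res using the distance weights
// wt0/wt1 and then shifted down by DIST_PRECISION_BITS.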
static INLINE void mult_add_store_aligned_256(CONV_BUF_TYPE *const dst,
                                              const __m256i *const res,
                                              const __m256i *const wt0,
                                              const __m256i *const wt1,
                                              const int do_average) {
  __m256i d;
  if (do_average) {
    d = _mm256_load_si256((__m256i *)dst);
    d = _mm256_add_epi32(_mm256_mullo_epi32(d, *wt0),
                         _mm256_mullo_epi32(*res, *wt1));
    d = _mm256_srai_epi32(d, DIST_PRECISION_BITS);
  } else {
    d = *res;
  }
  _mm256_store_si256((__m256i *)dst, d);
}

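// Illustrative only: a rough sketch, inside this comment, of how a caller
// might combine the helpers above for one 16-pixel-wide step of a horizontal
// low-bitdepth filter. The names filter_params_x, subpel_x_q4 and src are
// hypothetical and not part of this header.
//
//   __m256i coeffs[4], filt[4];
//   prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
//   filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
//   filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
//   filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
//   filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
//
//   // The two 128-bit lanes hold src[0..15] and src[8..23] so the shuffle
//   // masks can reach all 8 taps for 16 consecutive output pixels.
//   const __m256i data = _mm256_inserti128_si256(
//       _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)),
//       _mm_loadu_si128((const __m128i *)(src + 8)), 1);
//   const __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
//   // res_16b holds the 16-bit filter sums; the caller still applies the
//   // rounding that compensates for the halved coefficients before storing.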

#endif  // AOM_DSP_X86_CONVOLVE_AVX2_H_