Deepa K G | c8e0336 | 2018-01-22 18:12:17 +0530 | [diff] [blame] | 1 | /* |
Johann | e8c1138 | 2018-02-08 14:32:00 -0800 | [diff] [blame] | 2 | * Copyright (c) 2018, Alliance for Open Media. All rights reserved |
| 3 | * |
| 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| 10 | */ |
Deepa K G | c8e0336 | 2018-01-22 18:12:17 +0530 | [diff] [blame] | 11 | |
| 12 | #ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_ |
| 13 | #define AOM_DSP_X86_CONVOLVE_AVX2_H_ |
| 14 | |
| 15 | // filters for 16 |
| 16 | DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { |
| 17 | 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, |
| 18 | 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
| 19 | }; |
| 20 | |
| 21 | DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { |
| 22 | 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, |
| 23 | 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
| 24 | }; |
| 25 | |
| 26 | DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { |
| 27 | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, |
| 28 | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
| 29 | }; |
| 30 | |
| 31 | DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { |
| 32 | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, |
| 33 | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
| 34 | }; |
| 35 | |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 36 | static INLINE void prepare_coeffs_lowbd( |
| 37 | const InterpFilterParams *const filter_params, const int subpel_q4, |
| 38 | __m256i *const coeffs /* [4] */) { |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 39 | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
| 40 | *filter_params, subpel_q4 & SUBPEL_MASK); |
| 41 | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
| 42 | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
| 43 | |
| 44 | // right shift all filter co-efficients by 1 to reduce the bits required. |
| 45 | // This extra right shift will be taken care of at the end while rounding |
| 46 | // the result. |
| 47 | // Since all filter co-efficients are even, this change will not affect the |
| 48 | // end result |
| 49 | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
| 50 | _mm_set1_epi16(0xffff))); |
| 51 | |
| 52 | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
| 53 | |
| 54 | // coeffs 0 1 0 1 0 1 0 1 |
| 55 | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); |
| 56 | // coeffs 2 3 2 3 2 3 2 3 |
| 57 | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); |
| 58 | // coeffs 4 5 4 5 4 5 4 5 |
| 59 | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); |
| 60 | // coeffs 6 7 6 7 6 7 6 7 |
| 61 | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); |
| 62 | } |
| 63 | |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 64 | static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, |
| 65 | const int subpel_q4, |
| 66 | __m256i *const coeffs /* [4] */) { |
| 67 | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
| 68 | *filter_params, subpel_q4 & SUBPEL_MASK); |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 69 | |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 70 | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
| 71 | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 72 | |
| 73 | // coeffs 0 1 0 1 0 1 0 1 |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 74 | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 75 | // coeffs 2 3 2 3 2 3 2 3 |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 76 | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 77 | // coeffs 4 5 4 5 4 5 4 5 |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 78 | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 79 | // coeffs 6 7 6 7 6 7 6 7 |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 80 | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 81 | } |
| 82 | |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 83 | static INLINE __m256i convolve_lowbd(const __m256i *const s, |
| 84 | const __m256i *const coeffs) { |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 85 | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); |
| 86 | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); |
| 87 | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); |
| 88 | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); |
| 89 | |
| 90 | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| 91 | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), |
| 92 | _mm256_add_epi16(res_23, res_67)); |
| 93 | |
| 94 | return res; |
| 95 | } |
| 96 | |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 97 | static INLINE __m256i convolve(const __m256i *const s, |
| 98 | const __m256i *const coeffs) { |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 99 | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); |
| 100 | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); |
| 101 | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); |
| 102 | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); |
| 103 | |
| 104 | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), |
| 105 | _mm256_add_epi32(res_2, res_3)); |
| 106 | |
| 107 | return res; |
| 108 | } |
| 109 | |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 110 | static INLINE __m256i convolve_lowbd_x(const __m256i data, |
| 111 | const __m256i *const coeffs, |
| 112 | const __m256i *const filt) { |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 113 | __m256i s[4]; |
| 114 | |
| 115 | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
| 116 | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
| 117 | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
| 118 | s[3] = _mm256_shuffle_epi8(data, filt[3]); |
| 119 | |
Ravi Chaudhary | feb9c7a | 2018-02-23 19:23:47 +0530 | [diff] [blame] | 120 | return convolve_lowbd(s, coeffs); |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 121 | } |
| 122 | |
Deepa K G | f2f276c | 2018-02-27 19:01:55 +0530 | [diff] [blame] | 123 | static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, |
| 124 | const __m256i *const res, |
| 125 | const int do_average) { |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 126 | __m256i d; |
Deepa K G | f2f276c | 2018-02-27 19:01:55 +0530 | [diff] [blame] | 127 | if (do_average) { |
| 128 | d = _mm256_load_si256((__m256i *)dst); |
| 129 | d = _mm256_add_epi32(d, *res); |
| 130 | d = _mm256_srai_epi32(d, 1); |
| 131 | } else { |
| 132 | d = *res; |
| 133 | } |
Deepa K G | 0768d98 | 2018-02-05 13:41:42 +0530 | [diff] [blame] | 134 | _mm256_store_si256((__m256i *)dst, d); |
| 135 | } |
| 136 | |
Deepa K G | f2f276c | 2018-02-27 19:01:55 +0530 | [diff] [blame] | 137 | static INLINE void mult_add_store_aligned_256(CONV_BUF_TYPE *const dst, |
| 138 | const __m256i *const res, |
| 139 | const __m256i *const wt0, |
| 140 | const __m256i *const wt1, |
| 141 | const int do_average) { |
| 142 | __m256i d; |
| 143 | if (do_average) { |
| 144 | d = _mm256_load_si256((__m256i *)dst); |
| 145 | d = _mm256_add_epi32(_mm256_mullo_epi32(d, *wt0), |
| 146 | _mm256_mullo_epi32(*res, *wt1)); |
| 147 | d = _mm256_srai_epi32(d, DIST_PRECISION_BITS); |
| 148 | } else { |
| 149 | d = *res; |
| 150 | } |
| 151 | _mm256_store_si256((__m256i *)dst, d); |
| 152 | } |
Deepa K G | f2f276c | 2018-02-27 19:01:55 +0530 | [diff] [blame] | 153 | |
Deepa K G | c8e0336 | 2018-01-22 18:12:17 +0530 | [diff] [blame] | 154 | #endif |