/*
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */

#include <assert.h>
#include <immintrin.h>

#include "aom_dsp/x86/synonyms.h"

#include "aom/aom_integer.h"

#include "av1/common/reconinter.h"

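// Wedge mask samples lie in [0, MAX_MASK_VALUE]; WEDGE_WEIGHT_BITS is 6 (see
// av1/common/reconinter.h), so MAX_MASK_VALUE is 64.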
#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)

/**
 * See av1_wedge_sse_from_residuals_c
 */
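// What this kernel computes, as a scalar sketch:
//   for each i: t = clamp(MAX_MASK_VALUE * r1[i] + m[i] * d[i],
//                         INT16_MIN, INT16_MAX); csse += t * t;
//   return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);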
uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
                                           const uint8_t *m, int N) {
  int n = -N;
  int n8 = n + 8;

  uint64_t csse;

  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
  const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff);

  __m128i v_acc0_q = _mm_setzero_si128();

  assert(N % 64 == 0);

  r1 += N;
  d += N;
  m += N;

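  // The pointers were advanced to the ends of the buffers, so the negative
  // index n walks them from the start and the loop ends when n reaches zero.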
  do {
    const __m128i v_r0_w = xx_load_128(r1 + n);
    const __m128i v_r1_w = xx_load_128(r1 + n8);
    const __m128i v_d0_w = xx_load_128(d + n);
    const __m128i v_d1_w = xx_load_128(d + n8);
    const __m128i v_m01_b = xx_load_128(m + n);

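    // Interleave d with r1 so each 32-bit pair holds (d[i], r1[i]), and
    // zero-extend the mask bytes to 16-bit lanes.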
    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());

    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);

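    // With the masks paired against MAX_MASK_VALUE above, each 32-bit madd
    // lane computes m[i] * d[i] + MAX_MASK_VALUE * r1[i] for one pixel.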
    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);

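    // The saturating pack back to int16 plays the role of the clamp in the
    // scalar version.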
    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);

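    // Square and sum adjacent pairs; two squared int16 values still fit in a
    // 32-bit lane.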
    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);

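    // Zero-extend the even and odd 32-bit lanes to 64 bits before
    // accumulating so the running sum cannot wrap.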
    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
                                           _mm_srli_epi64(v_sq0_d, 32));
    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
                                           _mm_srli_epi64(v_sq1_d, 32));

    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);

    n8 += 16;
    n += 16;
  } while (n);

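  // Fold the two 64-bit halves of the accumulator together.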
  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));

#if ARCH_X86_64
  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
  xx_storel_64(&csse, v_acc0_q);
#endif

  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}

/**
 * See av1_wedge_sign_from_residuals_c
 */
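// Scalar sketch of the computation: acc = sum over i of m[i] * ds[i];
// return acc > limit.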
int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
                                          int N, int64_t limit) {
  int64_t acc;

  __m128i v_sign_d;
  __m128i v_acc0_d = _mm_setzero_si128();
  __m128i v_acc1_d = _mm_setzero_si128();
  __m128i v_acc_q;

  // Input size is limited to 8192 by the use of 32-bit accumulators and m
  // being in [0, 64]. Overflow might happen at larger sizes, though it is
  // practically impossible on real video input.
  assert(N < 8192);
  assert(N % 64 == 0);

  do {
    const __m128i v_m01_b = xx_load_128(m);
    const __m128i v_m23_b = xx_load_128(m + 16);
    const __m128i v_m45_b = xx_load_128(m + 32);
    const __m128i v_m67_b = xx_load_128(m + 48);

    const __m128i v_d0_w = xx_load_128(ds);
    const __m128i v_d1_w = xx_load_128(ds + 8);
    const __m128i v_d2_w = xx_load_128(ds + 16);
    const __m128i v_d3_w = xx_load_128(ds + 24);
    const __m128i v_d4_w = xx_load_128(ds + 32);
    const __m128i v_d5_w = xx_load_128(ds + 40);
    const __m128i v_d6_w = xx_load_128(ds + 48);
    const __m128i v_d7_w = xx_load_128(ds + 56);

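    // Zero-extend the 64 mask bytes to 16-bit lanes.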
    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
    const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
    const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
    const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
    const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
    const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());

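    // Each 32-bit madd lane sums m[i] * ds[i] over a pair of pixels.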
    const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
    const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
    const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
    const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
    const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
    const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
    const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
    const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);

    const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
    const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
    const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
    const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);

    const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
    const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);

    v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
    v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);

    ds += 64;
    m += 64;

    N -= 64;
  } while (N);

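  // Sign-extend each 32-bit lane to 64 bits (cmplt against zero yields the
  // sign mask) and reduce both accumulators to a single 64-bit sum.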
  v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
  v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
                           _mm_unpackhi_epi32(v_acc0_d, v_sign_d));

  v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
  v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
                           _mm_unpackhi_epi32(v_acc1_d, v_sign_d));

  v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);

  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if ARCH_X86_64
  acc = _mm_cvtsi128_si64(v_acc_q);
#else
  xx_storel_64(&acc, v_acc_q);
#endif

  return acc > limit;
}

// Negate under mask: returns (v ^ mask) - mask, which is -v in lanes where
// the mask is 0xffff and v unchanged in lanes where it is zero.
static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
  return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
}

/**
 * See av1_wedge_compute_delta_squares_c
 */
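// Scalar sketch of the computation:
//   d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX).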
void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
                                          const int16_t *b, int N) {
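  // 0xffff in the odd 16-bit lanes: negm_epi16 uses this to negate the b
  // value of each interleaved (a, b) pair.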
  const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0,
                                        (short)0xffff, 0, (short)0xffff, 0);

  assert(N % 64 == 0);

  do {
    const __m128i v_a0_w = xx_load_128(a);
    const __m128i v_b0_w = xx_load_128(b);
    const __m128i v_a1_w = xx_load_128(a + 8);
    const __m128i v_b1_w = xx_load_128(b + 8);
    const __m128i v_a2_w = xx_load_128(a + 16);
    const __m128i v_b2_w = xx_load_128(b + 16);
    const __m128i v_a3_w = xx_load_128(a + 24);
    const __m128i v_b3_w = xx_load_128(b + 24);

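    // Interleave so each 32-bit pair holds (a[i], b[i]).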
    const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
    const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
    const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
    const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
    const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
    const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
    const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
    const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);

    // Negate the top (odd) word of each 32-bit pair, i.e. the b values,
    // giving (a[i], -b[i]).
    const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
    const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
    const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
    const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
    const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
    const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
    const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
    const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);

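    // madd of (a[i], b[i]) against (a[i], -b[i]) yields a[i]^2 - b[i]^2 in
    // each 32-bit lane.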
    const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
    const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
    const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
    const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
    const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
    const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
    const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
    const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);

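    // Saturating pack back to int16, as in the scalar clamp.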
    const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
    const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
    const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
    const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);

    xx_store_128(d, v_r0_w);
    xx_store_128(d + 8, v_r1_w);
    xx_store_128(d + 16, v_r2_w);
    xx_store_128(d + 24, v_r3_w);

    a += 32;
    b += 32;
    d += 32;
    N -= 32;
  } while (N);
}