/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
#include "aom_ports/mem.h"

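// One pass of the 8x8 Hadamard transform over eight __m128i registers, each
// holding one row of eight int16 values; the three butterfly stages therefore
// operate along the columns. Both passes end with a permuted butterfly. For
// iter == 0 the block is additionally transposed in place so that a second
// call (iter == 1) transforms the other dimension and leaves the final,
// permuted result in `in`.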
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

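// Full 8x8 Hadamard transform of an int16 residual block. The rows are read
// with aligned loads, so src_diff and src_stride (counted in int16 elements)
// must keep every row 16-byte aligned. Two passes of hadamard_col8_sse2()
// transform both dimensions (the first pass transposes). With is_final != 0
// the result is written as tran_low_t via store_tran_low(); otherwise it is
// written as raw int16 so the 16x16/32x32 functions below can reload it
// cheaply.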
static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
                                     ptrdiff_t src_stride, tran_low_t *coeff,
                                     int is_final) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  if (is_final) {
    store_tran_low(src[0], coeff);
    coeff += 8;
    store_tran_low(src[1], coeff);
    coeff += 8;
    store_tran_low(src[2], coeff);
    coeff += 8;
    store_tran_low(src[3], coeff);
    coeff += 8;
    store_tran_low(src[4], coeff);
    coeff += 8;
    store_tran_low(src[5], coeff);
    coeff += 8;
    store_tran_low(src[6], coeff);
    coeff += 8;
    store_tran_low(src[7], coeff);
  } else {
    int16_t *coeff16 = (int16_t *)coeff;
    _mm_store_si128((__m128i *)coeff16, src[0]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[1]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[2]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[3]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[4]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[5]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[6]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[7]);
  }
}

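// Public entry point: 8x8 Hadamard transform with tran_low_t output.
// Illustrative usage only; the buffer names below are hypothetical:
//   DECLARE_ALIGNED(16, int16_t, diff[8 * 8]);      // source minus prediction
//   DECLARE_ALIGNED(32, tran_low_t, hcoeff[8 * 8]);
//   aom_hadamard_8x8_sse2(diff, 8, hcoeff);  // stride 8 keeps rows aligned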
void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
}

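// 16x16 Hadamard transform built from four 8x8 transforms: each 8x8 quadrant
// of the input is transformed into its own 64-coefficient slice of
// temp_coeff, and the second loop applies a 4-point Hadamard butterfly across
// the four quadrant results, shifting right by 1 to compensate for the growth
// in dynamic range. is_final selects tran_low_t output versus a raw int16
// buffer, as in hadamard_8x8_sse2().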
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

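  // Cross-quadrant butterfly: for each group of eight coefficients, combine
  // the corresponding values from the four 8x8 results (t_coeff, +64, +128,
  // +192) and write the four outputs to the matching quarters of the
  // destination.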
  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low(coeff0, coeff);
      store_tran_low(coeff1, coeff + 64);
      store_tran_low(coeff2, coeff + 128);
      store_tran_low(coeff3, coeff + 192);
      coeff += 8;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }

    t_coeff += 8;
  }
}

void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
}

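// 32x32 Hadamard transform, assembled from four 16x16 transforms in the same
// way the 16x16 version is assembled from 8x8 transforms, except that the
// butterfly results are shifted right by 2 before the final combine.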
void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

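  // Combine the four 16x16 results (t_coeff, +256, +512, +768) with a 4-point
  // butterfly and write each output to the matching quarter of coeff.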
  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 2);
    b1 = _mm_srai_epi16(b1, 2);
    b2 = _mm_srai_epi16(b2, 2);
    b3 = _mm_srai_epi16(b3, 2);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 512);
    store_tran_low(coeff3, coeff + 768);

    coeff += 8;
    t_coeff += 8;
  }
}

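// Sum of absolute values of the transform coefficients, e.g.
// aom_satd_sse2(coeff, 64) for an 8x8 Hadamard block. The loop has no tail
// handling, so length must be a multiple of 8.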
int aom_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

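  // Each iteration loads eight coefficients (packed to int16 by
  // load_tran_low), takes abs() as max(x, -x), widens the non-negative
  // results to 32 bit by unpacking with zero, and adds them to the four
  // 32-bit lanes of accum. max(x, -x) is exact for every int16 value except
  // INT16_MIN, which is not expected from the Hadamard paths above.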
  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}