Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2024, Alliance for Open Media. All rights reserved |
| 3 | * |
| 4 | * This source code is subject to the terms of the BSD 3-Clause Clear License |
| 5 | * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear |
| 6 | * License was not distributed with this source code in the LICENSE file, you |
| 7 | * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the |
| 8 | * Alliance for Open Media Patent License 1.0 was not distributed with this |
| 9 | * source code in the PATENTS file, you can obtain it at |
| 10 | * aomedia.org/license/patent-license/. |
| 11 | */ |
| 12 | |
| 13 | #include <assert.h> |
| 14 | #include <immintrin.h> /* AVX2 */ |
| 15 | |
| 16 | #include "aom/aom_integer.h" |
| 17 | #include "aom_dsp/x86/mem_sse2.h" |
| 18 | #include "av1/common/av1_common_int.h" |
| 19 | #include "av1/common/quant_common.h" |
| 20 | #include "av1/encoder/trellis_quant.h" |
| 21 | #include "aom_dsp/x86/synonyms.h" |
| 22 | #include "aom_dsp/x86/synonyms_avx2.h" |
| 23 | |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 24 | // av1_decide_states_*() constants. |
| 25 | static const int32_t kShuffle[8] = { 0, 2, 1, 3, 5, 7, 4, 6 }; |
| 26 | static const int32_t kPrevId[TCQ_MAX_STATES / 4][8] = { |
| 27 | { 0, 0 << 24, 0, 1 << 24, 0, 2 << 24, 0, 3 << 24 }, |
| 28 | { 0, 4 << 24, 0, 5 << 24, 0, 6 << 24, 0, 7 << 24 }, |
| 29 | }; |
| 30 | |
| 31 | // av1_calc_lf_ctx_*() constants. |
| 32 | // Neighbor mask for calculating context sum (base/mid). |
| 33 | #define M MAX_VAL_BR_CTX |
| 34 | static const int8_t kNbrMask[4][32] = { |
| 35 | { 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // diag 0 |
| 36 | M, M, 0, M, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
| 37 | { 0, 5, 5, 0, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, // diag 1 |
| 38 | 0, M, M, 0, 0, M, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, |
| 39 | { 0, 0, 5, 5, 0, 0, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, // diag 2 |
| 40 | 0, 0, M, M, 0, 0, 0, M, 0, 0, 0, 0, 0, 0, 0, 0 }, |
| 41 | { 0, 0, 0, 5, 5, 0, 0, 0, 5, 5, 5, 0, 0, 0, 0, 0, // diag 3 |
| 42 | 0, 0, 0, M, M, 0, 0, 0, 0, M, 0, 0, 0, 0, 0, 0 }, |
| 43 | }; |
| 44 | static const int8_t kMaxCtx[16] = { 8, 6, 6, 4, 4, 4, 4, 4, |
| 45 | 4, 4, 4, 4, 4, 4, 4, 4 }; |
| 46 | static const int8_t kScanDiag[MAX_LF_SCAN] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; |
| 47 | |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 48 | void av1_decide_states_avx2(const struct tcq_node_t *prev, |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 49 | const struct tcq_rate_t *rd, |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 50 | const struct prequant_t *pq, int n_states, |
| 51 | int limits, int try_eob, int64_t rdmult, |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 52 | struct tcq_node_t *decision) { |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 53 | (void)limits; |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 54 | assert((rdmult >> 32) == 0); |
| 55 | assert(sizeof(tcq_node_t) == 16); |
| 56 | |
| 57 | __m256i c_rdmult = _mm256_set1_epi64x(rdmult); |
| 58 | __m256i c_round = _mm256_set1_epi64x(1 << (AV1_PROB_COST_SHIFT - 1)); |
| 59 | __m256i c_zero = _mm256_setzero_si256(); |
| 60 | |
| 61 | // Gather absolute coeff level for 4 possible quant options. |
| 62 | __m128i abslev0123 = _mm_lddqu_si128((__m128i *)pq->absLevel); |
| 63 | __m256i abslev0231 = |
| 64 | _mm256_castsi128_si256(_mm_shuffle_epi32(abslev0123, 0x78)); |
| 65 | __m256i abslev02023131 = _mm256_permute4x64_epi64(abslev0231, 0x50); |
| 66 | __m256i abslev00223311 = _mm256_shuffle_epi32(abslev02023131, 0x50); |
| 67 | __m256i abslev0033 = _mm256_unpacklo_epi32(c_zero, abslev00223311); |
| 68 | __m256i abslev2211 = _mm256_unpackhi_epi32(c_zero, abslev00223311); |
| 69 | |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 70 | __m256i *out_a = (__m256i *)&decision[0]; |
| 71 | __m256i *out_b = (__m256i *)&decision[n_states >> 1]; |
| 72 | |
| 73 | for (int i = 0; i < n_states >> 2; i++) { |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 74 | // Load distortion. |
| 75 | __m256i dist = _mm256_lddqu_si256((__m256i *)&pq->deltaDist[0]); |
| 76 | dist = _mm256_slli_epi64(dist, RDDIV_BITS); |
| 77 | __m256i dist0033 = _mm256_permute4x64_epi64(dist, 0xF0); |
| 78 | __m256i dist2211 = _mm256_permute4x64_epi64(dist, 0x5A); |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 79 | |
| 80 | // Calc rate-distortion costs for each pair of even/odd quant. |
| 81 | // Separate candidates into even and odd quant decisions |
| 82 | // Even indexes: { 0, 2, 5, 7 }. Odd: { 1, 3, 4, 6 }. |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 83 | __m256i rates = _mm256_lddqu_si256((__m256i *)&rd->rate[8 * i]); |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 84 | __m256i permute_mask = _mm256_lddqu_si256((__m256i *)kShuffle); |
| 85 | __m256i rate02135746 = _mm256_permutevar8x32_epi32(rates, permute_mask); |
| 86 | __m256i rate0257 = _mm256_unpacklo_epi32(rate02135746, c_zero); |
| 87 | __m256i rate1346 = _mm256_unpackhi_epi32(rate02135746, c_zero); |
| 88 | __m256i rdcost0257 = _mm256_mul_epu32(c_rdmult, rate0257); |
| 89 | __m256i rdcost1346 = _mm256_mul_epu32(c_rdmult, rate1346); |
| 90 | rdcost0257 = _mm256_add_epi64(rdcost0257, c_round); |
| 91 | rdcost1346 = _mm256_add_epi64(rdcost1346, c_round); |
| 92 | rdcost0257 = _mm256_srli_epi64(rdcost0257, AV1_PROB_COST_SHIFT); |
| 93 | rdcost1346 = _mm256_srli_epi64(rdcost1346, AV1_PROB_COST_SHIFT); |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 94 | rdcost0257 = _mm256_add_epi64(rdcost0257, dist0033); |
| 95 | rdcost1346 = _mm256_add_epi64(rdcost1346, dist2211); |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 96 | |
| 97 | // Calc rd-cost for zero quant. |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 98 | __m256i ratezero = _mm256_castsi128_si256( |
| 99 | _mm_lddqu_si128((__m128i *)&rd->rate_zero[4 * i])); |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 100 | ratezero = _mm256_permute4x64_epi64(ratezero, 0x50); |
| 101 | ratezero = _mm256_unpacklo_epi32(ratezero, c_zero); |
| 102 | __m256i rdcostzero = _mm256_mul_epu32(c_rdmult, ratezero); |
| 103 | rdcostzero = _mm256_add_epi64(rdcostzero, c_round); |
| 104 | rdcostzero = _mm256_srli_epi64(rdcostzero, AV1_PROB_COST_SHIFT); |
| 105 | |
| 106 | // Add previous state rdCost to rdcostzero |
| 107 | __m256i state01 = _mm256_lddqu_si256((__m256i *)&prev[4 * i]); |
| 108 | __m256i state23 = _mm256_lddqu_si256((__m256i *)&prev[4 * i + 2]); |
| 109 | __m256i state02 = _mm256_permute2x128_si256(state01, state23, 0x20); |
| 110 | __m256i state13 = _mm256_permute2x128_si256(state01, state23, 0x31); |
| 111 | __m256i prevrd0123 = _mm256_unpacklo_epi64(state02, state13); |
| 112 | __m256i prevrate0123 = _mm256_unpackhi_epi64(state02, state13); |
| 113 | prevrate0123 = _mm256_slli_epi64(prevrate0123, 32); |
| 114 | prevrate0123 = _mm256_srli_epi64(prevrate0123, 32); |
| 115 | |
| 116 | // Compare rd costs (Zero vs Even). |
| 117 | __m256i use_zero = _mm256_cmpgt_epi64(rdcost0257, rdcostzero); |
| 118 | rdcost0257 = _mm256_blendv_epi8(rdcost0257, rdcostzero, use_zero); |
| 119 | rate0257 = _mm256_blendv_epi8(rate0257, ratezero, use_zero); |
| 120 | __m256i abslev_even = _mm256_andnot_si256(use_zero, abslev0033); |
| 121 | |
| 122 | // Add previous state rdCost to current rdcost |
| 123 | rdcost0257 = _mm256_add_epi64(rdcost0257, prevrd0123); |
| 124 | rdcost1346 = _mm256_add_epi64(rdcost1346, prevrd0123); |
| 125 | rate0257 = _mm256_add_epi64(rate0257, prevrate0123); |
| 126 | rate1346 = _mm256_add_epi64(rate1346, prevrate0123); |
| 127 | |
| 128 | // Compare rd costs (Even vs Odd). |
| 129 | __m256i rdcost3164 = _mm256_shuffle_epi32(rdcost1346, 0x4E); |
| 130 | __m256i rate3164 = _mm256_shuffle_epi32(rate1346, 0x4E); |
| 131 | __m256i use_odd = _mm256_cmpgt_epi64(rdcost0257, rdcost3164); |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 132 | __m256i use_odd_1 = _mm256_slli_epi64(_mm256_srli_epi64(use_odd, 63), 56); |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 133 | __m256i prev_id = _mm256_lddqu_si256((__m256i *)kPrevId[i]); |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 134 | prev_id = _mm256_xor_si256(prev_id, use_odd_1); |
| 135 | __m256i rdcost_best = _mm256_blendv_epi8(rdcost0257, rdcost3164, use_odd); |
| 136 | __m256i rate_best = _mm256_blendv_epi8(rate0257, rate3164, use_odd); |
| 137 | __m256i abslev_best = _mm256_blendv_epi8(abslev_even, abslev2211, use_odd); |
| 138 | |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 139 | // Compare rd costs (best vs new eob). |
| 140 | __m256i rate_eob = _mm256_castsi128_si256(_mm_loadu_si64(rd->rate_eob)); |
| 141 | rate_eob = _mm256_unpacklo_epi32(rate_eob, c_zero); |
| 142 | __m256i rdcost_eob = _mm256_mul_epu32(c_rdmult, rate_eob); |
| 143 | rdcost_eob = _mm256_add_epi64(rdcost_eob, c_round); |
| 144 | rdcost_eob = _mm256_srli_epi64(rdcost_eob, AV1_PROB_COST_SHIFT); |
| 145 | __m256i dist_eob = _mm256_unpacklo_epi64(dist0033, dist2211); |
| 146 | rdcost_eob = _mm256_add_epi64(rdcost_eob, dist_eob); |
| 147 | __m128i mask_eob0 = _mm_set1_epi64x((int64_t)-try_eob); |
| 148 | __m256i mask_eob = _mm256_inserti128_si256(c_zero, mask_eob0, 0); |
| 149 | __m256i use_eob = _mm256_cmpgt_epi64(rdcost_best, rdcost_eob); |
| 150 | use_eob = _mm256_and_si256(use_eob, mask_eob); |
| 151 | __m256i use_eob_1 = _mm256_slli_epi64(use_eob, 56); |
| 152 | prev_id = _mm256_or_si256(prev_id, use_eob_1); |
| 153 | rdcost_best = _mm256_blendv_epi8(rdcost_best, rdcost_eob, use_eob); |
| 154 | rate_best = _mm256_blendv_epi8(rate_best, rate_eob, use_eob); |
| 155 | __m256i abslev_eob = _mm256_unpacklo_epi64(abslev0033, abslev2211); |
| 156 | abslev_best = _mm256_blendv_epi8(abslev_best, abslev_eob, use_eob); |
| 157 | try_eob = 0; |
| 158 | |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 159 | // Pack and store state info. |
| 160 | __m256i info_best = _mm256_or_si256(rate_best, abslev_best); |
| 161 | info_best = _mm256_or_si256(info_best, prev_id); |
| 162 | __m256i info01 = _mm256_unpacklo_epi64(rdcost_best, info_best); |
| 163 | __m256i info23 = _mm256_unpackhi_epi64(rdcost_best, info_best); |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 164 | _mm256_storeu_si256(out_a, info01); |
| 165 | _mm256_storeu_si256(out_b, info23); |
| 166 | out_a = (__m256i *)&decision[6]; |
| 167 | out_b = (__m256i *)&decision[2]; |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 168 | } |
| 169 | } |
| 170 | |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 171 | void av1_decide_states_st4_avx2(const struct tcq_node_t *prev, |
| 172 | const struct tcq_rate_t *rd, |
| 173 | const struct prequant_t *pq, int n_states, |
| 174 | int limits, int try_eob, int64_t rdmult, |
| 175 | struct tcq_node_t *decision) { |
| 176 | (void)limits; |
| 177 | (void)n_states; |
| 178 | assert(n_states == 4); |
| 179 | assert((rdmult >> 32) == 0); |
| 180 | assert(sizeof(tcq_node_t) == 16); |
| 181 | |
| 182 | int i = 0; |
| 183 | |
| 184 | __m256i c_rdmult = _mm256_set1_epi64x(rdmult); |
| 185 | __m256i c_round = _mm256_set1_epi64x(1 << (AV1_PROB_COST_SHIFT - 1)); |
| 186 | __m256i c_zero = _mm256_setzero_si256(); |
| 187 | |
| 188 | // Gather absolute coeff level for 4 possible quant options. |
| 189 | __m128i abslev0123 = _mm_lddqu_si128((__m128i *)pq->absLevel); |
| 190 | __m256i abslev0231 = |
| 191 | _mm256_castsi128_si256(_mm_shuffle_epi32(abslev0123, 0x78)); |
| 192 | __m256i abslev02023131 = _mm256_permute4x64_epi64(abslev0231, 0x50); |
| 193 | __m256i abslev00223311 = _mm256_shuffle_epi32(abslev02023131, 0x50); |
| 194 | __m256i abslev0033 = _mm256_unpacklo_epi32(c_zero, abslev00223311); |
| 195 | __m256i abslev2211 = _mm256_unpackhi_epi32(c_zero, abslev00223311); |
| 196 | |
| 197 | // Load distortion. |
| 198 | __m256i dist = _mm256_lddqu_si256((__m256i *)&pq->deltaDist[0]); |
| 199 | dist = _mm256_slli_epi64(dist, RDDIV_BITS); |
| 200 | __m256i dist0033 = _mm256_permute4x64_epi64(dist, 0xF0); |
| 201 | __m256i dist2211 = _mm256_permute4x64_epi64(dist, 0x5A); |
| 202 | |
| 203 | // Calc rate-distortion costs for each pair of even/odd quant. |
| 204 | // Separate candidates into even and odd quant decisions |
| 205 | // Even indexes: { 0, 2, 5, 7 }. Odd: { 1, 3, 4, 6 }. |
| 206 | __m256i rates = _mm256_lddqu_si256((__m256i *)&rd->rate[8 * i]); |
| 207 | __m256i permute_mask = _mm256_lddqu_si256((__m256i *)kShuffle); |
| 208 | __m256i rate02135746 = _mm256_permutevar8x32_epi32(rates, permute_mask); |
| 209 | __m256i rate0257 = _mm256_unpacklo_epi32(rate02135746, c_zero); |
| 210 | __m256i rate1346 = _mm256_unpackhi_epi32(rate02135746, c_zero); |
| 211 | __m256i rdcost0257 = _mm256_mul_epu32(c_rdmult, rate0257); |
| 212 | __m256i rdcost1346 = _mm256_mul_epu32(c_rdmult, rate1346); |
| 213 | rdcost0257 = _mm256_add_epi64(rdcost0257, c_round); |
| 214 | rdcost1346 = _mm256_add_epi64(rdcost1346, c_round); |
| 215 | rdcost0257 = _mm256_srli_epi64(rdcost0257, AV1_PROB_COST_SHIFT); |
| 216 | rdcost1346 = _mm256_srli_epi64(rdcost1346, AV1_PROB_COST_SHIFT); |
| 217 | rdcost0257 = _mm256_add_epi64(rdcost0257, dist0033); |
| 218 | rdcost1346 = _mm256_add_epi64(rdcost1346, dist2211); |
| 219 | |
| 220 | // Calc rd-cost for zero quant. |
| 221 | __m256i ratezero = |
| 222 | _mm256_castsi128_si256(_mm_lddqu_si128((__m128i *)&rd->rate_zero[4 * i])); |
| 223 | ratezero = _mm256_permute4x64_epi64(ratezero, 0x50); |
| 224 | ratezero = _mm256_unpacklo_epi32(ratezero, c_zero); |
| 225 | __m256i rdcostzero = _mm256_mul_epu32(c_rdmult, ratezero); |
| 226 | rdcostzero = _mm256_add_epi64(rdcostzero, c_round); |
| 227 | rdcostzero = _mm256_srli_epi64(rdcostzero, AV1_PROB_COST_SHIFT); |
| 228 | |
| 229 | // Add previous state rdCost to rdcostzero |
| 230 | __m256i state01 = _mm256_lddqu_si256((__m256i *)&prev[4 * i]); |
| 231 | __m256i state23 = _mm256_lddqu_si256((__m256i *)&prev[4 * i + 2]); |
| 232 | __m256i state02 = _mm256_permute2x128_si256(state01, state23, 0x20); |
| 233 | __m256i state13 = _mm256_permute2x128_si256(state01, state23, 0x31); |
| 234 | __m256i prevrd0123 = _mm256_unpacklo_epi64(state02, state13); |
| 235 | __m256i prevrate0123 = _mm256_unpackhi_epi64(state02, state13); |
| 236 | prevrate0123 = _mm256_slli_epi64(prevrate0123, 32); |
| 237 | prevrate0123 = _mm256_srli_epi64(prevrate0123, 32); |
| 238 | |
| 239 | // Compare rd costs (Zero vs Even). |
| 240 | __m256i use_zero = _mm256_cmpgt_epi64(rdcost0257, rdcostzero); |
| 241 | rdcost0257 = _mm256_blendv_epi8(rdcost0257, rdcostzero, use_zero); |
| 242 | rate0257 = _mm256_blendv_epi8(rate0257, ratezero, use_zero); |
| 243 | __m256i abslev_even = _mm256_andnot_si256(use_zero, abslev0033); |
| 244 | |
| 245 | // Add previous state rdCost to current rdcost |
| 246 | rdcost0257 = _mm256_add_epi64(rdcost0257, prevrd0123); |
| 247 | rdcost1346 = _mm256_add_epi64(rdcost1346, prevrd0123); |
| 248 | rate0257 = _mm256_add_epi64(rate0257, prevrate0123); |
| 249 | rate1346 = _mm256_add_epi64(rate1346, prevrate0123); |
| 250 | |
| 251 | // Compare rd costs (Even vs Odd). |
| 252 | __m256i rdcost3164 = _mm256_shuffle_epi32(rdcost1346, 0x4E); |
| 253 | __m256i rate3164 = _mm256_shuffle_epi32(rate1346, 0x4E); |
| 254 | __m256i use_odd = _mm256_cmpgt_epi64(rdcost0257, rdcost3164); |
| 255 | __m256i use_odd_1 = _mm256_slli_epi64(_mm256_srli_epi64(use_odd, 63), 56); |
| 256 | __m256i prev_id = _mm256_lddqu_si256((__m256i *)kPrevId[i]); |
| 257 | prev_id = _mm256_xor_si256(prev_id, use_odd_1); |
| 258 | __m256i rdcost_best = _mm256_blendv_epi8(rdcost0257, rdcost3164, use_odd); |
| 259 | __m256i rate_best = _mm256_blendv_epi8(rate0257, rate3164, use_odd); |
| 260 | __m256i abslev_best = _mm256_blendv_epi8(abslev_even, abslev2211, use_odd); |
| 261 | |
| 262 | // Compare rd costs (best vs new eob). |
| 263 | __m256i rate_eob = _mm256_castsi128_si256(_mm_loadu_si64(rd->rate_eob)); |
| 264 | rate_eob = _mm256_unpacklo_epi32(rate_eob, c_zero); |
| 265 | __m256i rdcost_eob = _mm256_mul_epu32(c_rdmult, rate_eob); |
| 266 | rdcost_eob = _mm256_add_epi64(rdcost_eob, c_round); |
| 267 | rdcost_eob = _mm256_srli_epi64(rdcost_eob, AV1_PROB_COST_SHIFT); |
| 268 | __m256i dist_eob = _mm256_unpacklo_epi64(dist0033, dist2211); |
| 269 | rdcost_eob = _mm256_add_epi64(rdcost_eob, dist_eob); |
| 270 | __m128i mask_eob0 = _mm_set1_epi64x((int64_t)-try_eob); |
| 271 | __m256i mask_eob = _mm256_inserti128_si256(c_zero, mask_eob0, 0); |
| 272 | __m256i use_eob = _mm256_cmpgt_epi64(rdcost_best, rdcost_eob); |
| 273 | use_eob = _mm256_and_si256(use_eob, mask_eob); |
| 274 | __m256i use_eob_1 = _mm256_slli_epi64(use_eob, 56); |
| 275 | prev_id = _mm256_or_si256(prev_id, use_eob_1); |
| 276 | rdcost_best = _mm256_blendv_epi8(rdcost_best, rdcost_eob, use_eob); |
| 277 | rate_best = _mm256_blendv_epi8(rate_best, rate_eob, use_eob); |
| 278 | __m256i abslev_eob = _mm256_unpacklo_epi64(abslev0033, abslev2211); |
| 279 | abslev_best = _mm256_blendv_epi8(abslev_best, abslev_eob, use_eob); |
| 280 | |
| 281 | // Pack and store state info. |
| 282 | __m256i info_best = _mm256_or_si256(rate_best, abslev_best); |
| 283 | info_best = _mm256_or_si256(info_best, prev_id); |
| 284 | __m256i info01 = _mm256_unpacklo_epi64(rdcost_best, info_best); |
| 285 | __m256i info23 = _mm256_unpackhi_epi64(rdcost_best, info_best); |
| 286 | __m256i *out_a = (__m256i *)&decision[0]; |
| 287 | __m256i *out_b = (__m256i *)&decision[2]; |
| 288 | _mm256_storeu_si256(out_a, info01); |
| 289 | _mm256_storeu_si256(out_b, info23); |
| 290 | } |
| 291 | |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 292 | void av1_pre_quant_avx2(tran_low_t tqc, struct prequant_t *pqData, |
| 293 | const int32_t *quant_ptr, int dqv, int log_scale, |
| 294 | int scan_pos) { |
| 295 | static const int32_t kInc[4][4] = { |
| 296 | { 0, 1, 2, 3 }, { 3, 0, 1, 2 }, { 2, 3, 0, 1 }, { 1, 2, 3, 0 } |
| 297 | }; |
| 298 | |
| 299 | // calculate qIdx |
| 300 | int shift = 16 - log_scale + QUANT_FP_BITS; |
| 301 | int32_t add = -((2 << shift) >> 1); |
| 302 | int32_t abs_tqc = abs(tqc); |
| 303 | |
| 304 | int32_t qIdx = (int)AOMMAX( |
| 305 | 1, AOMMIN(((1 << 16) - 1), |
| 306 | ((int64_t)abs_tqc * quant_ptr[scan_pos != 0] + add) >> shift)); |
| 307 | pqData->qIdx = qIdx; |
| 308 | |
| 309 | __m256i c_zero = _mm256_setzero_si256(); |
| 310 | __m128i base_qc = _mm_set1_epi32(qIdx); |
| 311 | __m128i qc_inc = _mm_lddqu_si128((__m128i *)kInc[qIdx & 3]); |
| 312 | __m128i qc_idx = _mm_add_epi32(base_qc, qc_inc); |
| 313 | __m128i one = _mm_set1_epi32(1); |
| 314 | __m128i abslev = _mm_add_epi32(qc_idx, one); |
| 315 | abslev = _mm_srli_epi32(abslev, 1); |
| 316 | _mm_storeu_si128((__m128i *)pqData->absLevel, abslev); |
| 317 | |
| 318 | __m256i qc_idx1 = _mm256_castsi128_si256(qc_idx); |
| 319 | __m256i qc_idx_01012323 = _mm256_permute4x64_epi64(qc_idx1, 0x50); |
| 320 | __m256i qc_idx_0123 = _mm256_unpacklo_epi32(qc_idx_01012323, c_zero); |
| 321 | __m256i c_dqv = _mm256_set1_epi64x(dqv); |
| 322 | __m256i qc_mul_dqv = _mm256_mul_epu32(qc_idx_0123, c_dqv); |
| 323 | __m256i dq_round = _mm256_set1_epi64x(1 << (QUANT_TABLE_BITS - 1)); |
| 324 | __m256i qc_mul_dqv_rnd = _mm256_add_epi64(qc_mul_dqv, dq_round); |
| 325 | __m256i dq_shift = _mm256_set1_epi64x(log_scale + QUANT_TABLE_BITS); |
| 326 | __m256i dqc = _mm256_srlv_epi64(qc_mul_dqv_rnd, dq_shift); |
| 327 | |
| 328 | __m256i abs_tqc_sh = _mm256_set1_epi64x(abs_tqc << (log_scale - 1)); |
| 329 | __m256i dist0 = _mm256_mul_epi32(abs_tqc_sh, abs_tqc_sh); |
| 330 | __m256i scale_shift = _mm256_set1_epi64x(log_scale - 1); |
| 331 | __m256i dqc_sh = _mm256_sllv_epi32(dqc, scale_shift); |
| 332 | __m256i diff = _mm256_sub_epi32(dqc_sh, abs_tqc_sh); |
| 333 | __m256i dist = _mm256_mul_epi32(diff, diff); |
| 334 | dist = _mm256_sub_epi64(dist, dist0); |
| 335 | _mm256_storeu_si256((__m256i *)pqData->deltaDist, dist); |
| 336 | } |
| 337 | |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 338 | void av1_update_states_avx2(tcq_node_t *decision, int scan_idx, int n_states, |
Joe Young | 089e2f8 | 2024-08-23 13:51:27 -0700 | [diff] [blame] | 339 | const struct tcq_ctx_t *cur_ctx, |
| 340 | struct tcq_ctx_t *nxt_ctx) { |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 341 | for (int i = 0; i < n_states; i++) { |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 342 | int prevId = decision[i].prevId; |
| 343 | int absLevel = decision[i].absLevel; |
Joe Young | 089e2f8 | 2024-08-23 13:51:27 -0700 | [diff] [blame] | 344 | if (prevId >= 0) { |
| 345 | memcpy(&nxt_ctx[i], &cur_ctx[prevId], sizeof(tcq_ctx_t)); |
| 346 | } else { |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 347 | // New EOB; reset contexts |
Joe Young | 089e2f8 | 2024-08-23 13:51:27 -0700 | [diff] [blame] | 348 | memset(&nxt_ctx[i], 0, sizeof(tcq_ctx_t)); |
| 349 | nxt_ctx[i].orig_id = -1; |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 350 | } |
Joe Young | 089e2f8 | 2024-08-23 13:51:27 -0700 | [diff] [blame] | 351 | nxt_ctx[i].lev[scan_idx] = AOMMIN(absLevel, INT8_MAX); |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 352 | } |
| 353 | } |
| 354 | |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 355 | void av1_calc_diag_ctx_avx2(int scan_hi, int scan_lo, int bwl, |
| 356 | const uint8_t *prev_levels, const int16_t *scan, |
| 357 | uint8_t *ctx) { |
| 358 | #define M MAX_VAL_BR_CTX |
| 359 | static const int8_t kClip[2][16] = { |
| 360 | { 0, 0, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 3, 0, 0 }, |
| 361 | { 0, 0, M, 0, M, M, 0, 0, M, 0, M, M, 0, 0, 0, 0 }, |
| 362 | }; |
| 363 | #undef M |
| 364 | int n_ctx = scan_hi - scan_lo + 1; |
| 365 | __m128i zero = _mm_setzero_si128(); |
| 366 | __m128i one = _mm_set1_epi8(1); |
| 367 | __m128i four = _mm_set1_epi8(4); |
| 368 | __m128i six = _mm_set1_epi8(6); |
| 369 | __m128i clip = _mm_lddqu_si128((__m128i *)&kClip[0][0]); |
| 370 | __m128i clip_mid = _mm_lddqu_si128((__m128i *)&kClip[1][0]); |
| 371 | |
| 372 | int blk_pos = scan[scan_lo]; |
| 373 | int row_inc = (1 << bwl) + (1 << TX_PAD_HOR_LOG2) - 1; |
| 374 | const uint8_t *row_ptr = prev_levels + get_padded_idx(blk_pos, bwl) + 1; |
| 375 | const uint8_t *min_row_ptr = prev_levels; |
| 376 | __m128i nbr2 = _mm_loadu_si64(&row_ptr[row_inc]); |
| 377 | __m128i nbr3 = _mm_loadu_si64(&row_ptr[2 * row_inc]); |
| 378 | __m128i nbr23 = _mm_unpacklo_epi16(nbr2, nbr3); |
| 379 | |
| 380 | for (int i = 0; i < n_ctx; i += 2) { |
| 381 | const uint8_t *p1 = AOMMAX(min_row_ptr, &row_ptr[-row_inc]); |
| 382 | __m128i nbr0 = _mm_loadu_si64(p1); |
| 383 | __m128i nbr1 = _mm_loadu_si64(&row_ptr[0]); |
| 384 | __m128i nbr01 = _mm_unpacklo_epi16(nbr0, nbr1); |
| 385 | __m128i nbr0123 = _mm_unpacklo_epi32(nbr01, nbr23); |
| 386 | __m128i nbr = _mm_unpacklo_epi64(nbr0123, nbr0123); |
| 387 | __m128i nbr_max = _mm_min_epu8(nbr, clip); |
| 388 | __m128i sum = _mm_maddubs_epi16(nbr_max, one); |
| 389 | sum = _mm_hadd_epi16(sum, zero); |
| 390 | sum = _mm_hadd_epi16(sum, zero); |
| 391 | __m128i coeff_ctx = _mm_packs_epi16(sum, sum); |
| 392 | coeff_ctx = _mm_avg_epu8(coeff_ctx, zero); |
| 393 | coeff_ctx = _mm_min_epi8(coeff_ctx, four); |
| 394 | __m128i nbr_max_mid = _mm_min_epu8(nbr, clip_mid); |
| 395 | __m128i sum_mid = _mm_maddubs_epi16(nbr_max_mid, one); |
| 396 | sum_mid = _mm_hadd_epi16(sum_mid, zero); |
| 397 | sum_mid = _mm_hadd_epi16(sum_mid, zero); |
| 398 | __m128i coeff_mid_ctx = _mm_packs_epi16(sum_mid, sum_mid); |
| 399 | coeff_mid_ctx = _mm_avg_epu8(coeff_mid_ctx, zero); |
| 400 | coeff_mid_ctx = _mm_min_epi8(coeff_mid_ctx, six); |
| 401 | coeff_mid_ctx = _mm_slli_epi16(coeff_mid_ctx, 4); |
| 402 | coeff_ctx = _mm_add_epi8(coeff_ctx, coeff_mid_ctx); |
| 403 | uint16_t ctx01 = _mm_extract_epi16(coeff_ctx, 0); |
| 404 | uint16_t ctx1 = ctx01 >> 8; |
| 405 | ctx[i] = (uint8_t)ctx01; |
| 406 | ctx[i + 1] = ctx1; |
| 407 | row_ptr -= 2 * row_inc; |
| 408 | nbr23 = nbr01; |
| 409 | } |
| 410 | } |
| 411 | |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 412 | static INLINE int get_mid_cost_def(tran_low_t abs_qc, int coeff_ctx, |
| 413 | const LV_MAP_COEFF_COST *txb_costs, |
| 414 | int plane, int t_sign, int sign) { |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 415 | int cost = 0; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 416 | if (plane == AOM_PLANE_V) { |
| 417 | cost += txb_costs->v_ac_sign_cost[t_sign][sign] - av1_cost_literal(1); |
| 418 | } |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 419 | if (abs_qc > NUM_BASE_LEVELS) { |
| 420 | int mid_ctx = coeff_ctx >> 4; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 421 | if (plane == 0) { |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 422 | cost += get_br_cost_tcq(abs_qc, txb_costs->lps_cost[mid_ctx]); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 423 | } else { |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 424 | cost += get_br_cost_tcq(abs_qc, txb_costs->lps_cost_uv[mid_ctx]); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 425 | } |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 426 | } |
| 427 | return cost; |
| 428 | } |
| 429 | |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 430 | static INLINE int get_mid_cost_eob(int ci, int limits, int is_dc, |
| 431 | tran_low_t abs_qc, int sign, int dc_sign_ctx, |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 432 | const LV_MAP_COEFF_COST *txb_costs, |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 433 | TX_CLASS tx_class, int32_t t_sign, |
| 434 | int plane) { |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 435 | int cost = 0; |
| 436 | const int dc_ph_group = 0; // PH disabled |
| 437 | |
| 438 | if (limits) { |
| 439 | if (is_dc) { |
| 440 | cost -= av1_cost_literal(1); |
| 441 | if (plane == AOM_PLANE_V) { |
| 442 | cost += txb_costs->v_dc_sign_cost[t_sign][dc_sign_ctx][sign]; |
| 443 | } else { |
| 444 | cost += txb_costs->dc_sign_cost[dc_ph_group][dc_sign_ctx][sign]; |
| 445 | } |
| 446 | } else { |
| 447 | if (plane == AOM_PLANE_V) { |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 448 | cost += txb_costs->v_ac_sign_cost[t_sign][sign] - av1_cost_literal(1); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 449 | } |
| 450 | } |
| 451 | if (plane > 0) { |
| 452 | if (abs_qc > LF_NUM_BASE_LEVELS) { |
| 453 | int br_ctx = get_br_ctx_lf_eob_chroma(ci, tx_class); |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 454 | cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost_uv[br_ctx]); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 455 | } |
| 456 | } else { |
| 457 | if (abs_qc > LF_NUM_BASE_LEVELS) { |
| 458 | int br_ctx = get_br_ctx_lf_eob(ci, tx_class); |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 459 | cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost[br_ctx]); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 460 | } |
| 461 | } |
| 462 | } else { |
| 463 | if (plane == AOM_PLANE_V) { |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 464 | cost += txb_costs->v_ac_sign_cost[t_sign][sign] - av1_cost_literal(1); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 465 | } |
| 466 | if (plane > 0) { |
| 467 | if (abs_qc > NUM_BASE_LEVELS) { |
| 468 | int br_ctx = 0; /* get_br_ctx_eob_chroma */ |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 469 | cost += get_br_cost_tcq(abs_qc, txb_costs->lps_cost_uv[br_ctx]); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 470 | } |
| 471 | } else { |
| 472 | if (abs_qc > NUM_BASE_LEVELS) { |
| 473 | int br_ctx = 0; /* get_br_ctx_eob */ |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 474 | cost += get_br_cost_tcq(abs_qc, txb_costs->lps_cost[br_ctx]); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 475 | } |
| 476 | } |
| 477 | } |
| 478 | return cost; |
| 479 | } |
| 480 | |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 481 | static int get_mid_cost_lf_dc(int ci, tran_low_t abs_qc, int sign, |
| 482 | int coeff_ctx, int dc_sign_ctx, |
| 483 | const LV_MAP_COEFF_COST *txb_costs, |
| 484 | const int32_t *tmp_sign, int plane) { |
| 485 | int cost = 0; |
| 486 | int mid_ctx = coeff_ctx >> 4; |
| 487 | const int dc_ph_group = 0; // PH disabled |
| 488 | cost -= av1_cost_literal(1); // Remove previously added sign cost. |
| 489 | if (plane == AOM_PLANE_V) |
| 490 | cost += txb_costs->v_dc_sign_cost[tmp_sign[ci]][dc_sign_ctx][sign]; |
| 491 | else |
| 492 | cost += txb_costs->dc_sign_cost[dc_ph_group][dc_sign_ctx][sign]; |
| 493 | if (plane > 0) { |
| 494 | if (abs_qc > LF_NUM_BASE_LEVELS) { |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 495 | cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost_uv[mid_ctx]); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 496 | } |
| 497 | } else { |
| 498 | if (abs_qc > LF_NUM_BASE_LEVELS) { |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 499 | cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost[mid_ctx]); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 500 | } |
| 501 | } |
| 502 | return cost; |
| 503 | } |
| 504 | |
| 505 | static int get_mid_cost_lf(tran_low_t abs_qc, int coeff_ctx, |
| 506 | const LV_MAP_COEFF_COST *txb_costs, int plane) { |
| 507 | int cost = 0; |
| 508 | int mid_ctx = coeff_ctx >> 4; |
| 509 | #if 1 |
| 510 | assert(plane == 0); |
| 511 | (void)plane; |
| 512 | if (abs_qc > LF_NUM_BASE_LEVELS) { |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 513 | cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost[mid_ctx]); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 514 | } |
| 515 | #else |
| 516 | if (plane > 0) { |
| 517 | if (abs_qc > LF_NUM_BASE_LEVELS) { |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 518 | cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost_uv[mid_ctx]); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 519 | } |
| 520 | } else { |
| 521 | if (abs_qc > LF_NUM_BASE_LEVELS) { |
Joe Young | e051e84 | 2024-08-28 14:27:48 -0700 | [diff] [blame] | 522 | cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost[mid_ctx]); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 523 | } |
| 524 | } |
| 525 | #endif |
| 526 | return cost; |
| 527 | } |
| 528 | |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 529 | void av1_get_rate_dist_def_luma_avx2(const struct LV_MAP_COEFF_COST *txb_costs, |
| 530 | const struct prequant_t *pq, |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 531 | const tcq_coeff_ctx_t *coeff_ctx, |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 532 | int blk_pos, int bwl, TX_CLASS tx_class, |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 533 | int diag_ctx, int eob_rate, int n_states, |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 534 | struct tcq_rate_t *rd) { |
| 535 | (void)bwl; |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 536 | const int32_t(*cost_zero)[SIG_COEF_CONTEXTS] = txb_costs->base_cost_zero; |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 537 | const uint16_t(*cost_low_tbl)[SIG_COEF_CONTEXTS][DQ_CTXS][2] = |
| 538 | txb_costs->base_cost_low_tbl; |
| 539 | const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] = |
| 540 | txb_costs->base_eob_cost_tbl; |
| 541 | const tran_low_t *absLevel = pq->absLevel; |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 542 | |
| 543 | // Calc zero coeff costs. |
| 544 | __m256i zero = _mm256_setzero_si256(); |
| 545 | __m256i cost_zero_dq0 = |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 546 | _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 547 | __m256i cost_zero_dq1 = |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 548 | _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 549 | |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 550 | __m256i coef_ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef)); |
| 551 | __m256i ctx16 = _mm256_unpacklo_epi8(coef_ctx, zero); |
| 552 | __m256i ctx = _mm256_shuffle_epi32(ctx16, 0xD8); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 553 | __m256i ctx_dq0 = _mm256_unpacklo_epi16(ctx, zero); |
| 554 | __m256i ctx_dq1 = _mm256_unpackhi_epi16(ctx, zero); |
| 555 | __m256i ratez_dq0 = _mm256_permutevar8x32_epi32(cost_zero_dq0, ctx_dq0); |
| 556 | __m256i ratez_dq1 = _mm256_permutevar8x32_epi32(cost_zero_dq1, ctx_dq1); |
| 557 | __m256i ratez_0123 = _mm256_unpacklo_epi64(ratez_dq0, ratez_dq1); |
| 558 | _mm_storeu_si128((__m128i *)&rd->rate_zero[0], |
| 559 | _mm256_castsi256_si128(ratez_0123)); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 560 | __m256i ratez_4567 = _mm256_unpackhi_epi64(ratez_dq0, ratez_dq1); |
| 561 | _mm_storeu_si128((__m128i *)&rd->rate_zero[4], |
| 562 | _mm256_castsi256_si128(ratez_4567)); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 563 | |
| 564 | // Calc coeff_base rate. |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 565 | int idx = AOMMIN(pq->qIdx - 1, 4); |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 566 | __m128i c_zero = _mm_setzero_si128(); |
| 567 | __m256i diag = _mm256_set1_epi16(diag_ctx); |
| 568 | __m256i base_ctx = _mm256_slli_epi16(ctx16, 12); |
| 569 | base_ctx = _mm256_srli_epi16(base_ctx, 12); |
| 570 | base_ctx = _mm256_add_epi16(base_ctx, diag); |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 571 | for (int i = 0; i < (n_states >> 2); i++) { |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 572 | int ctx0 = _mm256_extract_epi16(base_ctx, 0); |
| 573 | int ctx1 = _mm256_extract_epi16(base_ctx, 1); |
| 574 | int ctx2 = _mm256_extract_epi16(base_ctx, 2); |
| 575 | int ctx3 = _mm256_extract_epi16(base_ctx, 3); |
| 576 | base_ctx = _mm256_bsrli_epi128(base_ctx, 8); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 577 | __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]); |
| 578 | __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]); |
| 579 | __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]); |
| 580 | __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]); |
| 581 | __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23); |
| 582 | __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67); |
| 583 | rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero); |
| 584 | rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero); |
| 585 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123); |
| 586 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567); |
| 587 | } |
| 588 | |
| 589 | // Calc coeff/eob cost. |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 590 | int eob_ctx = coeff_ctx->coef_eob; |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 591 | __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]); |
| 592 | rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero); |
| 593 | __m128i rate_eob_position = _mm_set1_epi32(eob_rate); |
| 594 | __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position); |
| 595 | _mm_storeu_si64(&rd->rate_eob[0], rate_eob); |
| 596 | |
| 597 | // Calc coeff mid and high range cost. |
| 598 | if (idx > 0) { |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 599 | for (int i = 0; i < n_states; i++) { |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 600 | int a0 = i & 2 ? 1 : 0; |
| 601 | int a1 = a0 + 2; |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 602 | int mid_cost0 = get_mid_cost_def(absLevel[a0], coeff_ctx->coef[i], |
| 603 | txb_costs, 0, 0, 0); |
| 604 | int mid_cost1 = get_mid_cost_def(absLevel[a1], coeff_ctx->coef[i], |
| 605 | txb_costs, 0, 0, 0); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 606 | rd->rate[2 * i] += mid_cost0; |
| 607 | rd->rate[2 * i + 1] += mid_cost1; |
| 608 | } |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 609 | int eob_mid_cost0 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[0], 0, 0, |
| 610 | txb_costs, tx_class, 0, 0); |
| 611 | int eob_mid_cost1 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[2], 0, 0, |
| 612 | txb_costs, tx_class, 0, 0); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 613 | rd->rate_eob[0] += eob_mid_cost0; |
| 614 | rd->rate_eob[1] += eob_mid_cost1; |
| 615 | } |
| 616 | } |
| 617 | |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 618 | void av1_get_rate_dist_def_luma_st4_avx2( |
| 619 | const struct LV_MAP_COEFF_COST *txb_costs, const struct prequant_t *pq, |
| 620 | const tcq_coeff_ctx_t *coeff_ctx, int blk_pos, int bwl, TX_CLASS tx_class, |
| 621 | int diag_ctx, int eob_rate, int n_states, struct tcq_rate_t *rd) { |
| 622 | (void)bwl; |
| 623 | assert(n_states == 4); |
| 624 | n_states = 4; |
| 625 | |
| 626 | const int32_t(*cost_zero)[SIG_COEF_CONTEXTS] = txb_costs->base_cost_zero; |
| 627 | const uint16_t(*cost_low_tbl)[SIG_COEF_CONTEXTS][DQ_CTXS][2] = |
| 628 | txb_costs->base_cost_low_tbl; |
| 629 | const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] = |
| 630 | txb_costs->base_eob_cost_tbl; |
| 631 | const tran_low_t *absLevel = pq->absLevel; |
| 632 | |
| 633 | // Calc zero coeff costs. |
| 634 | __m256i zero = _mm256_setzero_si256(); |
| 635 | __m256i cost_zero_dq0 = |
| 636 | _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]); |
| 637 | __m256i cost_zero_dq1 = |
| 638 | _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]); |
| 639 | |
| 640 | __m256i coef_ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef)); |
| 641 | __m256i ctx16 = _mm256_unpacklo_epi8(coef_ctx, zero); |
| 642 | __m256i ctx = _mm256_shuffle_epi32(ctx16, 0xD8); |
| 643 | __m256i ctx_dq0 = _mm256_unpacklo_epi16(ctx, zero); |
| 644 | __m256i ctx_dq1 = _mm256_unpackhi_epi16(ctx, zero); |
| 645 | __m256i ratez_dq0 = _mm256_permutevar8x32_epi32(cost_zero_dq0, ctx_dq0); |
| 646 | __m256i ratez_dq1 = _mm256_permutevar8x32_epi32(cost_zero_dq1, ctx_dq1); |
| 647 | __m256i ratez_0123 = _mm256_unpacklo_epi64(ratez_dq0, ratez_dq1); |
| 648 | _mm_storeu_si128((__m128i *)&rd->rate_zero[0], |
| 649 | _mm256_castsi256_si128(ratez_0123)); |
| 650 | |
| 651 | // Calc coeff_base rate. |
| 652 | int idx = AOMMIN(pq->qIdx - 1, 4); |
| 653 | __m128i c_zero = _mm_setzero_si128(); |
| 654 | __m256i diag = _mm256_set1_epi16(diag_ctx); |
| 655 | __m256i base_ctx = _mm256_slli_epi16(ctx16, 12); |
| 656 | base_ctx = _mm256_srli_epi16(base_ctx, 12); |
| 657 | base_ctx = _mm256_add_epi16(base_ctx, diag); |
| 658 | for (int i = 0; i < (n_states >> 2); i++) { |
| 659 | int ctx0 = _mm256_extract_epi16(base_ctx, 0); |
| 660 | int ctx1 = _mm256_extract_epi16(base_ctx, 1); |
| 661 | int ctx2 = _mm256_extract_epi16(base_ctx, 2); |
| 662 | int ctx3 = _mm256_extract_epi16(base_ctx, 3); |
| 663 | base_ctx = _mm256_bsrli_epi128(base_ctx, 8); |
| 664 | __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]); |
| 665 | __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]); |
| 666 | __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]); |
| 667 | __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]); |
| 668 | __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23); |
| 669 | __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67); |
| 670 | rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero); |
| 671 | rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero); |
| 672 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123); |
| 673 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567); |
| 674 | } |
| 675 | |
| 676 | // Calc coeff/eob cost. |
| 677 | int eob_ctx = coeff_ctx->coef_eob; |
| 678 | __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]); |
| 679 | rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero); |
| 680 | __m128i rate_eob_position = _mm_set1_epi32(eob_rate); |
| 681 | __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position); |
| 682 | _mm_storeu_si64(&rd->rate_eob[0], rate_eob); |
| 683 | |
| 684 | // Calc coeff mid and high range cost. |
| 685 | if (idx > 0) { |
| 686 | for (int i = 0; i < n_states; i++) { |
| 687 | int a0 = i & 2 ? 1 : 0; |
| 688 | int a1 = a0 + 2; |
| 689 | int mid_cost0 = get_mid_cost_def(absLevel[a0], coeff_ctx->coef[i], |
| 690 | txb_costs, 0, 0, 0); |
| 691 | int mid_cost1 = get_mid_cost_def(absLevel[a1], coeff_ctx->coef[i], |
| 692 | txb_costs, 0, 0, 0); |
| 693 | rd->rate[2 * i] += mid_cost0; |
| 694 | rd->rate[2 * i + 1] += mid_cost1; |
| 695 | } |
| 696 | int eob_mid_cost0 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[0], 0, 0, |
| 697 | txb_costs, tx_class, 0, 0); |
| 698 | int eob_mid_cost1 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[2], 0, 0, |
| 699 | txb_costs, tx_class, 0, 0); |
| 700 | rd->rate_eob[0] += eob_mid_cost0; |
| 701 | rd->rate_eob[1] += eob_mid_cost1; |
| 702 | } |
| 703 | } |
| 704 | |
| 705 | void av1_calc_lf_ctx_st4_avx2(const struct tcq_lf_ctx_t *lf_ctx, int scan_pos, |
| 706 | struct tcq_coeff_ctx_t *coeff_ctx) { |
| 707 | int n_states = 4; |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 708 | |
| 709 | int diag = kScanDiag[scan_pos]; |
| 710 | __m256i zero = _mm256_setzero_si256(); |
| 711 | __m256i nbr_mask = _mm256_lddqu_si256((__m256i *)kNbrMask[diag]); |
| 712 | __m256i base_mask = _mm256_permute2x128_si256(nbr_mask, nbr_mask, 0); |
| 713 | __m256i mid_mask = _mm256_permute2x128_si256(nbr_mask, nbr_mask, 0x11); |
| 714 | |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 715 | for (int st = 0; st < n_states; st += 4) { |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 716 | // Load previously decoded LF context values. |
| 717 | __m256i last01 = _mm256_lddqu_si256((__m256i *)&lf_ctx[st]); |
| 718 | __m256i last23 = _mm256_lddqu_si256((__m256i *)&lf_ctx[st + 2]); |
| 719 | |
| 720 | // Calc base ctx neighbor sum. |
| 721 | __m256i base01 = _mm256_min_epu8(last01, base_mask); |
| 722 | __m256i base23 = _mm256_min_epu8(last23, base_mask); |
| 723 | __m256i base01_sum = _mm256_sad_epu8(base01, zero); |
| 724 | __m256i base23_sum = _mm256_sad_epu8(base23, zero); |
| 725 | __m256i base_sum = |
| 726 | _mm256_hadd_epi32(base01_sum, base23_sum); // B0 B0 B2 B2 B1 B1 B3 B3 |
| 727 | |
| 728 | // Calc mid ctx neighbor sum. |
| 729 | __m256i mid01 = _mm256_min_epu8(last01, mid_mask); |
| 730 | __m256i mid23 = _mm256_min_epu8(last23, mid_mask); |
| 731 | __m256i mid01_sum = _mm256_sad_epu8(mid01, zero); |
| 732 | __m256i mid23_sum = _mm256_sad_epu8(mid23, zero); |
| 733 | __m256i mid_sum = |
| 734 | _mm256_hadd_epi32(mid01_sum, mid23_sum); // M0 M0 M2 M2 M1 M1 M3 M3 |
| 735 | |
| 736 | // Context calc; combine and reduce to 8 bits. |
| 737 | __m256i base_mid = |
| 738 | _mm256_hadd_epi32(base_sum, mid_sum); // B0B2 M0M2 B1B3 M1M3 |
| 739 | base_mid = _mm256_hadd_epi16( |
| 740 | base_mid, zero); // reduce to 16 bits B0B2 M0M2 - - B1B3 M1M3 - - |
| 741 | base_mid = _mm256_avg_epu16(base_mid, zero); // x = (x + 1) >> 1 |
| 742 | base_mid = _mm256_shufflelo_epi16( |
| 743 | base_mid, 0xD8); // shuffle B0M0 B2M2 - - B1M1 B3M3 - - |
| 744 | base_mid = _mm256_permute4x64_epi64( |
| 745 | base_mid, 0xD8); // pack into lower half: B0M0 B2M2 B1M1 B3M3 |
| 746 | base_mid = _mm256_shuffle_epi32(base_mid, 0xD8); // B0M0 B1M1 B2M2 B3M3 |
| 747 | __m256i six = _mm256_set1_epi16(6); |
| 748 | __m256i mid = _mm256_min_epi16(base_mid, six); |
| 749 | __m256i mid_sh4 = _mm256_slli_epi16(mid, 4); |
| 750 | __m256i base_max = _mm256_set1_epi16(kMaxCtx[scan_pos]); |
| 751 | __m256i base = _mm256_min_epi16(base_mid, base_max); |
| 752 | base_mid = _mm256_blend_epi16(base, mid_sh4, 0xAA); |
| 753 | __m256i ctx16 = _mm256_hadd_epi16(base_mid, base_mid); |
| 754 | __m256i mid_ctx_offset = _mm256_set1_epi16((scan_pos == 0) ? 0 : (7 << 4)); |
| 755 | ctx16 = _mm256_add_epi16(ctx16, mid_ctx_offset); |
| 756 | __m128i ctx8 = _mm256_castsi256_si128(ctx16); |
| 757 | ctx8 = _mm_packus_epi16(ctx8, ctx8); |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 758 | #if 1 |
| 759 | // Older compilers don't implement _mm_storeu_si32() |
| 760 | _mm_store_ss((float *)&coeff_ctx->coef[st], _mm_castsi128_ps(ctx8)); |
| 761 | #else |
| 762 | _mm_storeu_si32(&coeff_ctx->coef[st], ctx8); |
| 763 | #endif |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 764 | } |
| 765 | } |
| 766 | |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 767 | void av1_calc_lf_ctx_st8_avx2(const struct tcq_lf_ctx_t *lf_ctx, int scan_pos, |
| 768 | struct tcq_coeff_ctx_t *coeff_ctx) { |
| 769 | int n_states = 8; |
| 770 | |
| 771 | int diag = kScanDiag[scan_pos]; |
| 772 | __m256i zero = _mm256_setzero_si256(); |
| 773 | __m256i nbr_mask = _mm256_lddqu_si256((__m256i *)kNbrMask[diag]); |
| 774 | __m256i base_mask = _mm256_permute2x128_si256(nbr_mask, nbr_mask, 0); |
| 775 | __m256i mid_mask = _mm256_permute2x128_si256(nbr_mask, nbr_mask, 0x11); |
| 776 | |
| 777 | for (int st = 0; st < n_states; st += 4) { |
| 778 | // Load previously decoded LF context values. |
| 779 | __m256i last01 = _mm256_lddqu_si256((__m256i *)&lf_ctx[st]); |
| 780 | __m256i last23 = _mm256_lddqu_si256((__m256i *)&lf_ctx[st + 2]); |
| 781 | |
| 782 | // Calc base ctx neighbor sum. |
| 783 | __m256i base01 = _mm256_min_epu8(last01, base_mask); |
| 784 | __m256i base23 = _mm256_min_epu8(last23, base_mask); |
| 785 | __m256i base01_sum = _mm256_sad_epu8(base01, zero); |
| 786 | __m256i base23_sum = _mm256_sad_epu8(base23, zero); |
| 787 | __m256i base_sum = |
| 788 | _mm256_hadd_epi32(base01_sum, base23_sum); // B0 B0 B2 B2 B1 B1 B3 B3 |
| 789 | |
| 790 | // Calc mid ctx neighbor sum. |
| 791 | __m256i mid01 = _mm256_min_epu8(last01, mid_mask); |
| 792 | __m256i mid23 = _mm256_min_epu8(last23, mid_mask); |
| 793 | __m256i mid01_sum = _mm256_sad_epu8(mid01, zero); |
| 794 | __m256i mid23_sum = _mm256_sad_epu8(mid23, zero); |
| 795 | __m256i mid_sum = |
| 796 | _mm256_hadd_epi32(mid01_sum, mid23_sum); // M0 M0 M2 M2 M1 M1 M3 M3 |
| 797 | |
| 798 | // Context calc; combine and reduce to 8 bits. |
| 799 | __m256i base_mid = |
| 800 | _mm256_hadd_epi32(base_sum, mid_sum); // B0B2 M0M2 B1B3 M1M3 |
| 801 | base_mid = _mm256_hadd_epi16( |
| 802 | base_mid, zero); // reduce to 16 bits B0B2 M0M2 - - B1B3 M1M3 - - |
| 803 | base_mid = _mm256_avg_epu16(base_mid, zero); // x = (x + 1) >> 1 |
| 804 | base_mid = _mm256_shufflelo_epi16( |
| 805 | base_mid, 0xD8); // shuffle B0M0 B2M2 - - B1M1 B3M3 - - |
| 806 | base_mid = _mm256_permute4x64_epi64( |
| 807 | base_mid, 0xD8); // pack into lower half: B0M0 B2M2 B1M1 B3M3 |
| 808 | base_mid = _mm256_shuffle_epi32(base_mid, 0xD8); // B0M0 B1M1 B2M2 B3M3 |
| 809 | __m256i six = _mm256_set1_epi16(6); |
| 810 | __m256i mid = _mm256_min_epi16(base_mid, six); |
| 811 | __m256i mid_sh4 = _mm256_slli_epi16(mid, 4); |
| 812 | __m256i base_max = _mm256_set1_epi16(kMaxCtx[scan_pos]); |
| 813 | __m256i base = _mm256_min_epi16(base_mid, base_max); |
| 814 | base_mid = _mm256_blend_epi16(base, mid_sh4, 0xAA); |
| 815 | __m256i ctx16 = _mm256_hadd_epi16(base_mid, base_mid); |
| 816 | __m256i mid_ctx_offset = _mm256_set1_epi16((scan_pos == 0) ? 0 : (7 << 4)); |
| 817 | ctx16 = _mm256_add_epi16(ctx16, mid_ctx_offset); |
| 818 | __m128i ctx8 = _mm256_castsi256_si128(ctx16); |
| 819 | ctx8 = _mm_packus_epi16(ctx8, ctx8); |
| 820 | #if 1 |
| 821 | // Older compilers don't implement _mm_storeu_si32() |
| 822 | _mm_store_ss((float *)&coeff_ctx->coef[st], _mm_castsi128_ps(ctx8)); |
| 823 | #else |
| 824 | _mm_storeu_si32(&coeff_ctx->coef[st], ctx8); |
| 825 | #endif |
| 826 | } |
| 827 | } |
| 828 | |
| 829 | void av1_update_lf_ctx_avx2(const struct tcq_node_t *decision, int n_states, |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 830 | struct tcq_lf_ctx_t *lf_ctx) { |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 831 | __m256i c_zero = _mm256_setzero_si256(); |
| 832 | __m256i upd_last_a = c_zero; |
| 833 | __m256i upd_last_b = c_zero; |
| 834 | __m256i upd_last_c = c_zero; |
| 835 | __m256i upd_last_d = c_zero; |
| 836 | |
| 837 | for (int st = 0; st < n_states; st += 2) { |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 838 | int absLevel0 = decision[st].absLevel; |
| 839 | int prevId0 = decision[st].prevId; |
| 840 | int absLevel1 = decision[st + 1].absLevel; |
| 841 | int prevId1 = decision[st + 1].prevId; |
| 842 | __m128i upd0 = _mm_setzero_si128(); |
| 843 | __m128i upd1 = _mm_setzero_si128(); |
| 844 | if (prevId0 >= 0) { |
| 845 | upd0 = _mm_lddqu_si128((__m128i *)lf_ctx[prevId0].last); |
| 846 | } |
| 847 | if (prevId1 >= 0) { |
| 848 | upd1 = _mm_lddqu_si128((__m128i *)lf_ctx[prevId1].last); |
| 849 | } |
| 850 | upd0 = _mm_slli_si128(upd0, 1); |
| 851 | upd1 = _mm_slli_si128(upd1, 1); |
| 852 | upd0 = _mm_insert_epi8(upd0, AOMMIN(absLevel0, INT8_MAX), 0); |
| 853 | upd1 = _mm_insert_epi8(upd1, AOMMIN(absLevel1, INT8_MAX), 0); |
| 854 | __m256i upd01 = _mm256_castsi128_si256(upd0); |
| 855 | upd01 = _mm256_inserti128_si256(upd01, upd1, 1); |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 856 | upd_last_d = upd_last_c; |
| 857 | upd_last_c = upd_last_b; |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 858 | upd_last_b = upd_last_a; |
| 859 | upd_last_a = upd01; |
| 860 | } |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 861 | if (n_states == 4) { |
| 862 | (void)upd_last_d; |
| 863 | (void)upd_last_c; |
| 864 | _mm256_storeu_si256((__m256i *)lf_ctx[0].last, upd_last_b); |
| 865 | _mm256_storeu_si256((__m256i *)lf_ctx[2].last, upd_last_a); |
| 866 | } else { |
| 867 | _mm256_storeu_si256((__m256i *)lf_ctx[0].last, upd_last_d); |
| 868 | _mm256_storeu_si256((__m256i *)lf_ctx[2].last, upd_last_c); |
| 869 | _mm256_storeu_si256((__m256i *)lf_ctx[4].last, upd_last_b); |
| 870 | _mm256_storeu_si256((__m256i *)lf_ctx[6].last, upd_last_a); |
| 871 | } |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 872 | } |
| 873 | |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 874 | void av1_get_rate_dist_lf_luma_avx2(const struct LV_MAP_COEFF_COST *txb_costs, |
| 875 | const struct prequant_t *pq, |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 876 | const struct tcq_coeff_ctx_t *coeff_ctx, |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 877 | int blk_pos, int diag_ctx, int eob_rate, |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 878 | int dc_sign_ctx, const int32_t *tmp_sign, |
| 879 | int bwl, TX_CLASS tx_class, int coeff_sign, |
| 880 | int n_states, struct tcq_rate_t *rd) { |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 881 | #define Z -1 |
| 882 | static const int8_t kShuf[2][32] = { |
| 883 | { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, |
| 884 | 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, |
| 885 | { 0, 8, Z, Z, 1, 9, Z, Z, 2, 10, Z, Z, 3, 11, Z, Z, |
| 886 | 4, 12, Z, Z, 5, 13, Z, Z, 6, 14, Z, Z, 7, 15, Z, Z } |
| 887 | }; |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 888 | const uint16_t(*cost_zero)[LF_SIG_COEF_CONTEXTS] = |
| 889 | txb_costs->base_lf_cost_zero; |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 890 | const uint16_t(*cost_low_tbl)[LF_SIG_COEF_CONTEXTS][DQ_CTXS][2] = |
| 891 | txb_costs->base_lf_cost_low_tbl; |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 892 | const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] = |
| 893 | txb_costs->base_lf_eob_cost_tbl; |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 894 | const tran_low_t *absLevel = pq->absLevel; |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 895 | const int plane = 0; |
| 896 | |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 897 | // Calc zero coeff costs. |
| 898 | __m256i cost_zero_dq0 = |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 899 | _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]); |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 900 | __m256i cost_zero_dq1 = |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 901 | _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]); |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 902 | __m256i shuf = _mm256_lddqu_si256((__m256i *)kShuf[0]); |
| 903 | cost_zero_dq0 = _mm256_shuffle_epi8(cost_zero_dq0, shuf); |
| 904 | cost_zero_dq1 = _mm256_shuffle_epi8(cost_zero_dq1, shuf); |
| 905 | __m256i cost_dq0 = _mm256_permute4x64_epi64(cost_zero_dq0, 0xD8); |
| 906 | __m256i cost_dq1 = _mm256_permute4x64_epi64(cost_zero_dq1, 0xD8); |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 907 | __m256i ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef)); |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 908 | __m256i fifteen = _mm256_set1_epi8(15); |
| 909 | __m256i base_ctx = _mm256_and_si256(ctx, fifteen); |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 910 | __m256i base_ctx1 = _mm256_permute4x64_epi64(base_ctx, 0); |
| 911 | __m256i ratez_dq0 = _mm256_shuffle_epi8(cost_dq0, base_ctx1); |
| 912 | __m256i ratez_dq1 = _mm256_shuffle_epi8(cost_dq1, base_ctx1); |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 913 | __m256i ratez = _mm256_blend_epi16(ratez_dq0, ratez_dq1, 0xAA); |
| 914 | ratez = _mm256_permute4x64_epi64(ratez, 0x88); |
| 915 | __m256i shuf1 = _mm256_lddqu_si256((__m256i *)kShuf[1]); |
| 916 | ratez = _mm256_shuffle_epi8(ratez, shuf1); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 917 | _mm256_storeu_si256((__m256i *)&rd->rate_zero[0], ratez); |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 918 | |
| 919 | // Calc coeff_base rate. |
| 920 | int idx = AOMMIN(pq->qIdx - 1, 8); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 921 | __m128i c_zero = _mm_setzero_si128(); |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 922 | __m256i diag = _mm256_set1_epi8(diag_ctx); |
| 923 | base_ctx = _mm256_add_epi8(base_ctx, diag); |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 924 | for (int i = 0; i < (n_states >> 2); i++) { |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 925 | int ctx0 = _mm256_extract_epi8(base_ctx, 0); |
| 926 | int ctx1 = _mm256_extract_epi8(base_ctx, 1); |
| 927 | int ctx2 = _mm256_extract_epi8(base_ctx, 2); |
| 928 | int ctx3 = _mm256_extract_epi8(base_ctx, 3); |
| 929 | base_ctx = _mm256_bsrli_epi128(base_ctx, 4); |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 930 | __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]); |
| 931 | __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]); |
| 932 | __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]); |
| 933 | __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]); |
| 934 | __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23); |
| 935 | __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67); |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 936 | rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero); |
| 937 | rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 938 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123); |
| 939 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567); |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 940 | } |
| 941 | |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 942 | // Calc coeff/eob cost. |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 943 | int eob_ctx = coeff_ctx->coef_eob; |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 944 | __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]); |
| 945 | rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero); |
| 946 | __m128i rate_eob_position = _mm_set1_epi32(eob_rate); |
| 947 | __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position); |
| 948 | _mm_storeu_si64(&rd->rate_eob[0], rate_eob); |
| 949 | |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 950 | const int row = blk_pos >> bwl; |
| 951 | const int col = blk_pos - (row << bwl); |
| 952 | const bool dc_2dtx = (blk_pos == 0); |
| 953 | const bool dc_hor = (col == 0) && tx_class == TX_CLASS_HORIZ; |
| 954 | const bool dc_ver = (row == 0) && tx_class == TX_CLASS_VERT; |
| 955 | const bool is_dc_coeff = dc_2dtx || dc_hor || dc_ver; |
| 956 | if (is_dc_coeff) { |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 957 | for (int i = 0; i < n_states; i++) { |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 958 | int a0 = i & 2 ? 1 : 0; |
| 959 | int a1 = a0 + 2; |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 960 | int mid_cost0 = get_mid_cost_lf_dc(blk_pos, absLevel[a0], coeff_sign, |
| 961 | coeff_ctx->coef[i], dc_sign_ctx, |
| 962 | txb_costs, tmp_sign, plane); |
| 963 | int mid_cost1 = get_mid_cost_lf_dc(blk_pos, absLevel[a1], coeff_sign, |
| 964 | coeff_ctx->coef[i], dc_sign_ctx, |
| 965 | txb_costs, tmp_sign, plane); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 966 | rd->rate[2 * i] += mid_cost0; |
| 967 | rd->rate[2 * i + 1] += mid_cost1; |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 968 | } |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 969 | int t_sign = tmp_sign[blk_pos]; |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 970 | int eob_mid_cost0 = |
| 971 | get_mid_cost_eob(blk_pos, 1, 1, absLevel[0], coeff_sign, dc_sign_ctx, |
| 972 | txb_costs, tx_class, t_sign, 0); |
| 973 | int eob_mid_cost1 = |
| 974 | get_mid_cost_eob(blk_pos, 1, 1, absLevel[2], coeff_sign, dc_sign_ctx, |
| 975 | txb_costs, tx_class, t_sign, 0); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 976 | rd->rate_eob[0] += eob_mid_cost0; |
| 977 | rd->rate_eob[1] += eob_mid_cost1; |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 978 | } else if (idx > 4) { |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 979 | for (int i = 0; i < n_states; i++) { |
| 980 | int a0 = i & 2 ? 1 : 0; |
| 981 | int a1 = a0 + 2; |
| 982 | int mid_cost0 = |
| 983 | get_mid_cost_lf(absLevel[a0], coeff_ctx->coef[i], txb_costs, plane); |
| 984 | int mid_cost1 = |
| 985 | get_mid_cost_lf(absLevel[a1], coeff_ctx->coef[i], txb_costs, plane); |
| 986 | rd->rate[2 * i] += mid_cost0; |
| 987 | rd->rate[2 * i + 1] += mid_cost1; |
| 988 | } |
| 989 | int t_sign = tmp_sign[blk_pos]; |
| 990 | int eob_mid_cost0 = |
| 991 | get_mid_cost_eob(blk_pos, 1, 0, absLevel[0], coeff_sign, dc_sign_ctx, |
| 992 | txb_costs, tx_class, t_sign, 0); |
| 993 | int eob_mid_cost1 = |
| 994 | get_mid_cost_eob(blk_pos, 1, 0, absLevel[2], coeff_sign, dc_sign_ctx, |
| 995 | txb_costs, tx_class, t_sign, 0); |
| 996 | rd->rate_eob[0] += eob_mid_cost0; |
| 997 | rd->rate_eob[1] += eob_mid_cost1; |
| 998 | } |
| 999 | } |
| 1000 | |
| 1001 | void av1_get_rate_dist_lf_luma_st4_avx2( |
| 1002 | const struct LV_MAP_COEFF_COST *txb_costs, const struct prequant_t *pq, |
| 1003 | const struct tcq_coeff_ctx_t *coeff_ctx, int blk_pos, int diag_ctx, |
| 1004 | int eob_rate, int dc_sign_ctx, const int32_t *tmp_sign, int bwl, |
| 1005 | TX_CLASS tx_class, int coeff_sign, int n_states, struct tcq_rate_t *rd) { |
| 1006 | assert(n_states == 4); |
| 1007 | n_states = 4; |
| 1008 | #define Z -1 |
| 1009 | static const int8_t kShuf[2][32] = { |
| 1010 | { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, |
| 1011 | 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, |
| 1012 | { 0, 8, Z, Z, 1, 9, Z, Z, 2, 10, Z, Z, 3, 11, Z, Z, |
| 1013 | 4, 12, Z, Z, 5, 13, Z, Z, 6, 14, Z, Z, 7, 15, Z, Z } |
| 1014 | }; |
| 1015 | const uint16_t(*cost_zero)[LF_SIG_COEF_CONTEXTS] = |
| 1016 | txb_costs->base_lf_cost_zero; |
| 1017 | const uint16_t(*cost_low_tbl)[LF_SIG_COEF_CONTEXTS][DQ_CTXS][2] = |
| 1018 | txb_costs->base_lf_cost_low_tbl; |
| 1019 | const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] = |
| 1020 | txb_costs->base_lf_eob_cost_tbl; |
| 1021 | const tran_low_t *absLevel = pq->absLevel; |
| 1022 | const int plane = 0; |
| 1023 | |
| 1024 | // Calc zero coeff costs. |
| 1025 | __m256i cost_zero_dq0 = |
| 1026 | _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]); |
| 1027 | __m256i cost_zero_dq1 = |
| 1028 | _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]); |
| 1029 | __m256i shuf = _mm256_lddqu_si256((__m256i *)kShuf[0]); |
| 1030 | cost_zero_dq0 = _mm256_shuffle_epi8(cost_zero_dq0, shuf); |
| 1031 | cost_zero_dq1 = _mm256_shuffle_epi8(cost_zero_dq1, shuf); |
| 1032 | __m256i cost_dq0 = _mm256_permute4x64_epi64(cost_zero_dq0, 0xD8); |
| 1033 | __m256i cost_dq1 = _mm256_permute4x64_epi64(cost_zero_dq1, 0xD8); |
| 1034 | __m256i ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef)); |
| 1035 | __m256i fifteen = _mm256_set1_epi8(15); |
| 1036 | __m256i base_ctx = _mm256_and_si256(ctx, fifteen); |
| 1037 | __m256i base_ctx1 = _mm256_permute4x64_epi64(base_ctx, 0); |
| 1038 | __m256i ratez_dq0 = _mm256_shuffle_epi8(cost_dq0, base_ctx1); |
| 1039 | __m256i ratez_dq1 = _mm256_shuffle_epi8(cost_dq1, base_ctx1); |
| 1040 | __m256i ratez = _mm256_blend_epi16(ratez_dq0, ratez_dq1, 0xAA); |
| 1041 | ratez = _mm256_permute4x64_epi64(ratez, 0x88); |
| 1042 | __m256i shuf1 = _mm256_lddqu_si256((__m256i *)kShuf[1]); |
| 1043 | ratez = _mm256_shuffle_epi8(ratez, shuf1); |
| 1044 | _mm256_storeu_si256((__m256i *)&rd->rate_zero[0], ratez); |
| 1045 | |
| 1046 | // Calc coeff_base rate. |
| 1047 | int idx = AOMMIN(pq->qIdx - 1, 8); |
| 1048 | __m128i c_zero = _mm_setzero_si128(); |
| 1049 | __m256i diag = _mm256_set1_epi8(diag_ctx); |
| 1050 | base_ctx = _mm256_add_epi8(base_ctx, diag); |
| 1051 | for (int i = 0; i < (n_states >> 2); i++) { |
| 1052 | int ctx0 = _mm256_extract_epi8(base_ctx, 0); |
| 1053 | int ctx1 = _mm256_extract_epi8(base_ctx, 1); |
| 1054 | int ctx2 = _mm256_extract_epi8(base_ctx, 2); |
| 1055 | int ctx3 = _mm256_extract_epi8(base_ctx, 3); |
| 1056 | base_ctx = _mm256_bsrli_epi128(base_ctx, 4); |
| 1057 | __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]); |
| 1058 | __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]); |
| 1059 | __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]); |
| 1060 | __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]); |
| 1061 | __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23); |
| 1062 | __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67); |
| 1063 | rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero); |
| 1064 | rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero); |
| 1065 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123); |
| 1066 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567); |
| 1067 | } |
| 1068 | |
| 1069 | // Calc coeff/eob cost. |
| 1070 | int eob_ctx = coeff_ctx->coef_eob; |
| 1071 | __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]); |
| 1072 | rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero); |
| 1073 | __m128i rate_eob_position = _mm_set1_epi32(eob_rate); |
| 1074 | __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position); |
| 1075 | _mm_storeu_si64(&rd->rate_eob[0], rate_eob); |
| 1076 | |
| 1077 | const int row = blk_pos >> bwl; |
| 1078 | const int col = blk_pos - (row << bwl); |
| 1079 | const bool dc_2dtx = (blk_pos == 0); |
| 1080 | const bool dc_hor = (col == 0) && tx_class == TX_CLASS_HORIZ; |
| 1081 | const bool dc_ver = (row == 0) && tx_class == TX_CLASS_VERT; |
| 1082 | const bool is_dc_coeff = dc_2dtx || dc_hor || dc_ver; |
| 1083 | if (is_dc_coeff) { |
| 1084 | for (int i = 0; i < n_states; i++) { |
| 1085 | int a0 = i & 2 ? 1 : 0; |
| 1086 | int a1 = a0 + 2; |
| 1087 | int mid_cost0 = get_mid_cost_lf_dc(blk_pos, absLevel[a0], coeff_sign, |
| 1088 | coeff_ctx->coef[i], dc_sign_ctx, |
| 1089 | txb_costs, tmp_sign, plane); |
| 1090 | int mid_cost1 = get_mid_cost_lf_dc(blk_pos, absLevel[a1], coeff_sign, |
| 1091 | coeff_ctx->coef[i], dc_sign_ctx, |
| 1092 | txb_costs, tmp_sign, plane); |
| 1093 | rd->rate[2 * i] += mid_cost0; |
| 1094 | rd->rate[2 * i + 1] += mid_cost1; |
| 1095 | } |
| 1096 | int t_sign = tmp_sign[blk_pos]; |
| 1097 | int eob_mid_cost0 = |
| 1098 | get_mid_cost_eob(blk_pos, 1, 1, absLevel[0], coeff_sign, dc_sign_ctx, |
| 1099 | txb_costs, tx_class, t_sign, 0); |
| 1100 | int eob_mid_cost1 = |
| 1101 | get_mid_cost_eob(blk_pos, 1, 1, absLevel[2], coeff_sign, dc_sign_ctx, |
| 1102 | txb_costs, tx_class, t_sign, 0); |
| 1103 | rd->rate_eob[0] += eob_mid_cost0; |
| 1104 | rd->rate_eob[1] += eob_mid_cost1; |
| 1105 | } else if (idx > 4) { |
| 1106 | for (int i = 0; i < n_states; i++) { |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 1107 | int a0 = i & 2 ? 1 : 0; |
| 1108 | int a1 = a0 + 2; |
| 1109 | int mid_cost0 = |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1110 | get_mid_cost_lf(absLevel[a0], coeff_ctx->coef[i], txb_costs, plane); |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 1111 | int mid_cost1 = |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1112 | get_mid_cost_lf(absLevel[a1], coeff_ctx->coef[i], txb_costs, plane); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1113 | rd->rate[2 * i] += mid_cost0; |
| 1114 | rd->rate[2 * i + 1] += mid_cost1; |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 1115 | } |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1116 | int t_sign = tmp_sign[blk_pos]; |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 1117 | int eob_mid_cost0 = |
| 1118 | get_mid_cost_eob(blk_pos, 1, 0, absLevel[0], coeff_sign, dc_sign_ctx, |
| 1119 | txb_costs, tx_class, t_sign, 0); |
| 1120 | int eob_mid_cost1 = |
| 1121 | get_mid_cost_eob(blk_pos, 1, 0, absLevel[2], coeff_sign, dc_sign_ctx, |
| 1122 | txb_costs, tx_class, t_sign, 0); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1123 | rd->rate_eob[0] += eob_mid_cost0; |
| 1124 | rd->rate_eob[1] += eob_mid_cost1; |
Joe Young | e5046fe | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 1125 | } |
| 1126 | } |
| 1127 | |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1128 | void av1_get_rate_dist_lf_chroma_avx2(const struct LV_MAP_COEFF_COST *txb_costs, |
| 1129 | const struct prequant_t *pq, |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1130 | const struct tcq_coeff_ctx_t *coeff_ctx, |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 1131 | int blk_pos, int diag_ctx, int eob_rate, |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1132 | int dc_sign_ctx, const int32_t *tmp_sign, |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1133 | int bwl, TX_CLASS tx_class, int plane, |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1134 | int coeff_sign, int n_states, |
| 1135 | struct tcq_rate_t *rd) { |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 1136 | (void)bwl; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1137 | #define Z -1 |
| 1138 | static const int8_t kShuf[2][32] = { |
| 1139 | { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, |
| 1140 | 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, |
| 1141 | { 0, 8, Z, Z, 1, 9, Z, Z, 2, 10, Z, Z, 3, 11, Z, Z, |
| 1142 | 4, 12, Z, Z, 5, 13, Z, Z, 6, 14, Z, Z, 7, 15, Z, Z } |
| 1143 | }; |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 1144 | const uint16_t(*cost_zero)[LF_SIG_COEF_CONTEXTS] = |
| 1145 | plane ? txb_costs->base_lf_cost_uv_zero : txb_costs->base_lf_cost_zero; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1146 | const uint16_t(*cost_low_tbl)[LF_SIG_COEF_CONTEXTS][DQ_CTXS][2] = |
| 1147 | plane ? txb_costs->base_lf_cost_uv_low_tbl |
| 1148 | : txb_costs->base_lf_cost_low_tbl; |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1149 | const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] = |
| 1150 | txb_costs->base_lf_eob_cost_uv_tbl; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1151 | const tran_low_t *absLevel = pq->absLevel; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1152 | |
| 1153 | // Calc zero coeff costs. |
| 1154 | __m256i cost_zero_dq0 = |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 1155 | _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1156 | __m256i cost_zero_dq1 = |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 1157 | _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1158 | __m256i shuf = _mm256_lddqu_si256((__m256i *)kShuf[0]); |
| 1159 | cost_zero_dq0 = _mm256_shuffle_epi8(cost_zero_dq0, shuf); |
| 1160 | cost_zero_dq1 = _mm256_shuffle_epi8(cost_zero_dq1, shuf); |
| 1161 | __m256i cost_dq0 = _mm256_permute4x64_epi64(cost_zero_dq0, 0xD8); |
| 1162 | __m256i cost_dq1 = _mm256_permute4x64_epi64(cost_zero_dq1, 0xD8); |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1163 | __m256i ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef)); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1164 | __m256i fifteen = _mm256_set1_epi8(15); |
| 1165 | __m256i base_ctx = _mm256_and_si256(ctx, fifteen); |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1166 | __m256i base_ctx1 = _mm256_permute4x64_epi64(base_ctx, 0); |
| 1167 | __m256i ratez_dq0 = _mm256_shuffle_epi8(cost_dq0, base_ctx1); |
| 1168 | __m256i ratez_dq1 = _mm256_shuffle_epi8(cost_dq1, base_ctx1); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1169 | __m256i ratez = _mm256_blend_epi16(ratez_dq0, ratez_dq1, 0xAA); |
| 1170 | ratez = _mm256_permute4x64_epi64(ratez, 0x88); |
| 1171 | __m256i shuf1 = _mm256_lddqu_si256((__m256i *)kShuf[1]); |
| 1172 | ratez = _mm256_shuffle_epi8(ratez, shuf1); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1173 | _mm256_storeu_si256((__m256i *)&rd->rate_zero[0], ratez); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1174 | |
| 1175 | // Calc coeff_base rate. |
| 1176 | int idx = AOMMIN(pq->qIdx - 1, 8); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1177 | __m128i c_zero = _mm_setzero_si128(); |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1178 | __m256i diag = _mm256_set1_epi8(diag_ctx); |
| 1179 | base_ctx = _mm256_add_epi8(base_ctx, diag); |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1180 | for (int i = 0; i < (n_states >> 2); i++) { |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1181 | int ctx0 = _mm256_extract_epi8(base_ctx, 0); |
| 1182 | int ctx1 = _mm256_extract_epi8(base_ctx, 1); |
| 1183 | int ctx2 = _mm256_extract_epi8(base_ctx, 2); |
| 1184 | int ctx3 = _mm256_extract_epi8(base_ctx, 3); |
| 1185 | base_ctx = _mm256_bsrli_epi128(base_ctx, 4); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1186 | __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]); |
| 1187 | __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]); |
| 1188 | __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]); |
| 1189 | __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]); |
| 1190 | __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23); |
| 1191 | __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1192 | rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero); |
| 1193 | rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1194 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123); |
| 1195 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1196 | } |
| 1197 | |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1198 | // Calc coeff/eob cost. |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1199 | int eob_ctx = coeff_ctx->coef_eob; |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1200 | __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]); |
| 1201 | rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero); |
| 1202 | __m128i rate_eob_position = _mm_set1_epi32(eob_rate); |
| 1203 | __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position); |
| 1204 | _mm_storeu_si64(&rd->rate_eob[0], rate_eob); |
| 1205 | |
| 1206 | // Chroma LF region consists of only DC coeffs. |
| 1207 | #if 1 |
| 1208 | const int is_dc_coeff = 1; |
| 1209 | #else |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1210 | const int row = blk_pos >> bwl; |
| 1211 | const int col = blk_pos - (row << bwl); |
| 1212 | const bool dc_2dtx = (blk_pos == 0); |
| 1213 | const bool dc_hor = (col == 0) && tx_class == TX_CLASS_HORIZ; |
| 1214 | const bool dc_ver = (row == 0) && tx_class == TX_CLASS_VERT; |
| 1215 | const bool is_dc_coeff = dc_2dtx || dc_hor || dc_ver; |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1216 | #endif |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1217 | if (is_dc_coeff) { |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1218 | for (int i = 0; i < n_states; i++) { |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1219 | int a0 = i & 2 ? 1 : 0; |
| 1220 | int a1 = a0 + 2; |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1221 | int mid_cost0 = get_mid_cost_lf_dc(blk_pos, absLevel[a0], coeff_sign, |
| 1222 | coeff_ctx->coef[i], dc_sign_ctx, |
| 1223 | txb_costs, tmp_sign, plane); |
| 1224 | int mid_cost1 = get_mid_cost_lf_dc(blk_pos, absLevel[a1], coeff_sign, |
| 1225 | coeff_ctx->coef[i], dc_sign_ctx, |
| 1226 | txb_costs, tmp_sign, plane); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1227 | rd->rate[2 * i] += mid_cost0; |
| 1228 | rd->rate[2 * i + 1] += mid_cost1; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1229 | } |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1230 | int t_sign = tmp_sign[blk_pos]; |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 1231 | int eob_mid_cost0 = |
| 1232 | get_mid_cost_eob(blk_pos, 1, 1, absLevel[0], coeff_sign, dc_sign_ctx, |
| 1233 | txb_costs, tx_class, t_sign, plane); |
| 1234 | int eob_mid_cost1 = |
| 1235 | get_mid_cost_eob(blk_pos, 1, 1, absLevel[2], coeff_sign, dc_sign_ctx, |
| 1236 | txb_costs, tx_class, t_sign, plane); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1237 | rd->rate_eob[0] += eob_mid_cost0; |
| 1238 | rd->rate_eob[1] += eob_mid_cost1; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1239 | } else if (idx > 4) { |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1240 | for (int i = 0; i < n_states; i++) { |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1241 | int a0 = i & 2 ? 1 : 0; |
| 1242 | int a1 = a0 + 2; |
| 1243 | int mid_cost0 = |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1244 | get_mid_cost_lf(absLevel[a0], coeff_ctx->coef[i], txb_costs, plane); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1245 | int mid_cost1 = |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1246 | get_mid_cost_lf(absLevel[a1], coeff_ctx->coef[i], txb_costs, plane); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1247 | rd->rate[2 * i] += mid_cost0; |
| 1248 | rd->rate[2 * i + 1] += mid_cost1; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1249 | } |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1250 | int t_sign = tmp_sign[blk_pos]; |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 1251 | int eob_mid_cost0 = |
| 1252 | get_mid_cost_eob(blk_pos, 1, 0, absLevel[0], coeff_sign, dc_sign_ctx, |
| 1253 | txb_costs, tx_class, t_sign, plane); |
| 1254 | int eob_mid_cost1 = |
| 1255 | get_mid_cost_eob(blk_pos, 1, 0, absLevel[2], coeff_sign, dc_sign_ctx, |
| 1256 | txb_costs, tx_class, t_sign, plane); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1257 | rd->rate_eob[0] += eob_mid_cost0; |
| 1258 | rd->rate_eob[1] += eob_mid_cost1; |
Joe Young | 463ba7f | 2024-06-25 14:27:02 -0700 | [diff] [blame] | 1259 | } |
| 1260 | } |
| 1261 | |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 1262 | void av1_get_rate_dist_def_chroma_avx2( |
| 1263 | const struct LV_MAP_COEFF_COST *txb_costs, const struct prequant_t *pq, |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1264 | const struct tcq_coeff_ctx_t *coeff_ctx, int blk_pos, int bwl, |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 1265 | TX_CLASS tx_class, int diag_ctx, int eob_rate, int plane, int t_sign, |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1266 | int sign, int n_states, struct tcq_rate_t *rd) { |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 1267 | (void)bwl; |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 1268 | const int32_t(*cost_zero)[SIG_COEF_CONTEXTS] = txb_costs->base_cost_uv_zero; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1269 | const uint16_t(*cost_low_tbl)[SIG_COEF_CONTEXTS][DQ_CTXS][2] = |
| 1270 | txb_costs->base_cost_uv_low_tbl; |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1271 | const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] = |
| 1272 | txb_costs->base_eob_cost_uv_tbl; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1273 | const tran_low_t *absLevel = pq->absLevel; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1274 | |
| 1275 | // Calc zero coeff costs. |
| 1276 | __m256i zero = _mm256_setzero_si256(); |
| 1277 | __m256i cost_zero_dq0 = |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 1278 | _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1279 | __m256i cost_zero_dq1 = |
Joe Young | fb9653f | 2024-08-28 08:35:13 -0700 | [diff] [blame] | 1280 | _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]); |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1281 | __m256i ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef)); |
| 1282 | __m256i ctx16 = _mm256_unpacklo_epi8(ctx, zero); |
| 1283 | __m256i ctx16sh = _mm256_shuffle_epi32(ctx16, 0xD8); |
| 1284 | __m256i ctx_dq0 = _mm256_unpacklo_epi16(ctx16sh, zero); |
| 1285 | __m256i ctx_dq1 = _mm256_unpackhi_epi16(ctx16sh, zero); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1286 | __m256i ratez_dq0 = _mm256_permutevar8x32_epi32(cost_zero_dq0, ctx_dq0); |
| 1287 | __m256i ratez_dq1 = _mm256_permutevar8x32_epi32(cost_zero_dq1, ctx_dq1); |
| 1288 | __m256i ratez_0123 = _mm256_unpacklo_epi64(ratez_dq0, ratez_dq1); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1289 | _mm_storeu_si128((__m128i *)&rd->rate_zero[0], |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1290 | _mm256_castsi256_si128(ratez_0123)); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1291 | __m256i ratez_4567 = _mm256_unpackhi_epi64(ratez_dq0, ratez_dq1); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1292 | _mm_storeu_si128((__m128i *)&rd->rate_zero[4], |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1293 | _mm256_castsi256_si128(ratez_4567)); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1294 | |
| 1295 | // Calc coeff_base rate. |
| 1296 | int idx = AOMMIN(pq->qIdx - 1, 4); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1297 | __m128i c_zero = _mm_setzero_si128(); |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1298 | __m256i diag = _mm256_set1_epi16(diag_ctx); |
| 1299 | __m256i base_ctx = _mm256_slli_epi16(ctx16, 12); |
| 1300 | base_ctx = _mm256_srli_epi16(base_ctx, 12); |
| 1301 | base_ctx = _mm256_add_epi16(base_ctx, diag); |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1302 | for (int i = 0; i < (n_states >> 2); i++) { |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1303 | int ctx0 = _mm256_extract_epi16(base_ctx, 0); |
| 1304 | int ctx1 = _mm256_extract_epi16(base_ctx, 1); |
| 1305 | int ctx2 = _mm256_extract_epi16(base_ctx, 2); |
| 1306 | int ctx3 = _mm256_extract_epi16(base_ctx, 3); |
| 1307 | base_ctx = _mm256_bsrli_epi128(base_ctx, 8); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1308 | __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]); |
| 1309 | __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]); |
| 1310 | __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]); |
| 1311 | __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]); |
| 1312 | __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23); |
| 1313 | __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1314 | rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero); |
| 1315 | rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1316 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123); |
| 1317 | _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567); |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1318 | } |
| 1319 | |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1320 | // Calc coeff/eob cost. |
Joe Young | d3ef83a | 2024-08-23 11:49:06 -0700 | [diff] [blame] | 1321 | int eob_ctx = coeff_ctx->coef_eob; |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1322 | __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]); |
| 1323 | rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero); |
| 1324 | __m128i rate_eob_position = _mm_set1_epi32(eob_rate); |
| 1325 | __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position); |
| 1326 | _mm_storeu_si64(&rd->rate_eob[0], rate_eob); |
| 1327 | |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1328 | // Calc coeff mid and high range cost. |
| 1329 | if (idx > 0 || plane) { |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1330 | for (int i = 0; i < n_states; i++) { |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1331 | int a0 = i & 2 ? 1 : 0; |
| 1332 | int a1 = a0 + 2; |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1333 | int mid_cost0 = get_mid_cost_def(absLevel[a0], coeff_ctx->coef[i], |
| 1334 | txb_costs, plane, t_sign, sign); |
| 1335 | int mid_cost1 = get_mid_cost_def(absLevel[a1], coeff_ctx->coef[i], |
| 1336 | txb_costs, plane, t_sign, sign); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1337 | rd->rate[2 * i] += mid_cost0; |
| 1338 | rd->rate[2 * i + 1] += mid_cost1; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1339 | } |
Joe Young | ee165d2 | 2024-08-19 10:57:07 -0700 | [diff] [blame] | 1340 | int eob_mid_cost0 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[0], sign, 0, |
| 1341 | txb_costs, tx_class, t_sign, plane); |
| 1342 | int eob_mid_cost1 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[2], sign, 0, |
| 1343 | txb_costs, tx_class, t_sign, plane); |
Joe Young | 19695f3 | 2024-08-16 15:12:02 -0700 | [diff] [blame] | 1344 | rd->rate_eob[0] += eob_mid_cost0; |
| 1345 | rd->rate_eob[1] += eob_mid_cost1; |
Joe Young | bdde868 | 2024-08-07 09:33:48 -0700 | [diff] [blame] | 1346 | } |
| 1347 | } |
Joe Young | af03d88 | 2024-08-13 11:11:36 -0700 | [diff] [blame] | 1348 | |
| 1349 | void av1_init_lf_ctx_avx2(const uint8_t *lev, int scan_hi, int bwl, |
| 1350 | struct tcq_lf_ctx_t *lf_ctx) { |
| 1351 | // Sample offsets (row/col) in and around the LF region used for ctx calc. |
| 1352 | const uint8_t diag_scan[21] = { 0x00, 0x10, 0x01, 0x20, 0x11, 0x02, 0x30, |
| 1353 | 0x21, 0x12, 0x03, 0x40, 0x31, 0x22, 0x13, |
| 1354 | 0x04, 0x50, 0x41, 0x32, 0x23, 0x14, 0x05 }; |
| 1355 | const int8_t kShuf[16] = { 8, 6, 4, 2, 0, 11, 9, 7, |
| 1356 | 5, 3, 1, -1, -1, -1, -1, -1 }; |
| 1357 | __m128i zero = _mm_setzero_si128(); |
| 1358 | |
| 1359 | int eob_inside_lf_region = scan_hi < MAX_LF_SCAN - 1; |
| 1360 | if (eob_inside_lf_region) { |
| 1361 | // Retrive the EOB value and store in LF ctx. |
| 1362 | int row_col = diag_scan[scan_hi + 1]; |
| 1363 | int row = row_col >> 4; |
| 1364 | int col = row_col & 15; |
| 1365 | int blk_pos = (row << bwl) + col; |
| 1366 | uint8_t lev0 = lev[get_padded_idx(blk_pos, bwl)]; |
| 1367 | __m128i last = _mm_insert_epi8(zero, lev0, 0); |
| 1368 | _mm_storeu_si128((__m128i *)lf_ctx->last, last); |
| 1369 | } else { |
| 1370 | // Retrieve samples in the two diagonals bordering LF region. |
| 1371 | int offset = (1 << bwl) + TX_PAD_HOR - 1; |
| 1372 | const uint8_t *p = lev + 4; |
| 1373 | __m128i row0 = _mm_loadu_si64(p); |
| 1374 | __m128i row1 = _mm_loadu_si64(p + offset); |
| 1375 | __m128i row2 = _mm_loadu_si64(p + 2 * offset); |
| 1376 | __m128i row3 = _mm_loadu_si64(p + 3 * offset); |
| 1377 | __m128i row4 = _mm_loadu_si64(p + 4 * offset); |
| 1378 | __m128i row5 = _mm_loadu_si64(p + 5 * offset); |
| 1379 | __m128i row01 = _mm_unpacklo_epi16(row0, row1); |
| 1380 | __m128i row23 = _mm_unpacklo_epi16(row2, row3); |
| 1381 | __m128i row45 = _mm_unpacklo_epi16(row4, row5); |
| 1382 | __m128i row0123 = _mm_unpacklo_epi32(row01, row23); |
| 1383 | __m128i row012345 = _mm_unpacklo_epi64(row0123, row45); |
| 1384 | __m128i shuf = _mm_lddqu_si128((__m128i *)kShuf); |
| 1385 | __m128i last = _mm_shuffle_epi8(row012345, shuf); |
| 1386 | _mm_storeu_si128((__m128i *)lf_ctx->last, last); |
| 1387 | } |
| 1388 | } |
Joe Young | a91384f | 2024-08-20 13:07:18 -0700 | [diff] [blame] | 1389 | |
| 1390 | // Pre-calculate eob bits (rate) for each EOB candidate position from 1 |
| 1391 | // to the initial eob location. Store rate in array block_eob_rate[], |
| 1392 | // starting with index. |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1393 | void av1_calc_block_eob_rate_avx2(struct macroblock *x, int plane, |
| 1394 | TX_SIZE tx_size, int eob, |
Joe Young | a91384f | 2024-08-20 13:07:18 -0700 | [diff] [blame] | 1395 | uint16_t *block_eob_rate) { |
| 1396 | const MACROBLOCKD *xd = &x->e_mbd; |
| 1397 | const MB_MODE_INFO *mbmi = xd->mi[0]; |
| 1398 | const int is_inter = is_inter_block(mbmi, xd->tree_type); |
| 1399 | const PLANE_TYPE plane_type = get_plane_type(plane); |
| 1400 | const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); |
| 1401 | const CoeffCosts *coeff_costs = &x->coeff_costs; |
| 1402 | const LV_MAP_COEFF_COST *txb_costs = |
| 1403 | &coeff_costs->coeff_costs[txs_ctx][plane_type]; |
| 1404 | const int eob_multi_size = txsize_log2_minus4[tx_size]; |
| 1405 | const LV_MAP_EOB_COST *txb_eob_costs = |
| 1406 | &coeff_costs->eob_costs[eob_multi_size][plane_type]; |
| 1407 | |
| 1408 | #if CONFIG_EOB_POS_LUMA |
| 1409 | const int *tbl_eob_cost = txb_eob_costs->eob_cost[is_inter]; |
| 1410 | #else |
| 1411 | const int *tbl_eob_cost = txb_eob_costs->eob_cost; |
| 1412 | #endif |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1413 | const int(*tbl_eob_extra)[2] = txb_costs->eob_extra_cost; |
Joe Young | a91384f | 2024-08-20 13:07:18 -0700 | [diff] [blame] | 1414 | |
| 1415 | static const int8_t kShuf[4][32] = { |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1416 | { -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13, |
| 1417 | 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, 5, 4, 5 }, |
| 1418 | { 0, 1, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13, 12, 13, 12, 13, |
| 1419 | 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, |
| 1420 | { 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, |
| 1421 | 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13 }, |
Joe Young | a91384f | 2024-08-20 13:07:18 -0700 | [diff] [blame] | 1422 | }; |
| 1423 | #define BC1 (1 << AV1_PROB_COST_SHIFT) |
| 1424 | #define BC2 (2 * BC1) |
| 1425 | static const uint16_t kBitCost[16] = { |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1426 | 0, 0, 0, 0, BC1, BC1, BC1, BC1, BC2, BC2, BC2, BC2, BC2, BC2, BC2, BC2 |
Joe Young | a91384f | 2024-08-20 13:07:18 -0700 | [diff] [blame] | 1427 | }; |
| 1428 | |
| 1429 | // Write first 16 costs, block_eob_rate[0:15] |
| 1430 | // Convert 32-bit eob_pt costs { 0 1 2 3 4 5 6 7 } + eob_extra_cost |
| 1431 | // to expanded 16-bit costs { 0 1 2 2 3 3 3 3 4 4 4 4 4 4 4 4 }. |
| 1432 | __m256i eob_cost0_7 = _mm256_lddqu_si256((__m256i *)tbl_eob_cost); |
| 1433 | __m256i eob_extra0_7 = _mm256_lddqu_si256((__m256i *)tbl_eob_extra); |
| 1434 | __m256i shuf0 = _mm256_lddqu_si256((__m256i *)kShuf[0]); |
| 1435 | __m256i shuf1 = _mm256_lddqu_si256((__m256i *)kShuf[1]); |
| 1436 | __m256i eob_extra = _mm256_shuffle_epi8(eob_extra0_7, shuf0); |
| 1437 | __m256i eob_rate0_15 = _mm256_shuffle_epi8(eob_cost0_7, shuf1); |
| 1438 | eob_rate0_15 = _mm256_add_epi16(eob_rate0_15, eob_extra); |
| 1439 | __m256i bit_cost = _mm256_lddqu_si256((__m256i *)kBitCost); |
| 1440 | eob_rate0_15 = _mm256_add_epi16(eob_rate0_15, bit_cost); |
| 1441 | _mm256_storeu_si256((__m256i *)&block_eob_rate[0], eob_rate0_15); |
| 1442 | |
| 1443 | // Write second 16 costs, block_eob_rate[16:31] |
| 1444 | __m256i eob_cost4_7 = _mm256_permute4x64_epi64(eob_cost0_7, 0xEE); |
| 1445 | __m256i eob_extra4_7 = _mm256_permute4x64_epi64(eob_extra0_7, 0xEE); |
| 1446 | __m256i shuf2 = _mm256_lddqu_si256((__m256i *)kShuf[2]); |
| 1447 | __m256i shuf3 = _mm256_set1_epi16(0x0504); |
| 1448 | __m256i eob_extra16_31 = _mm256_shuffle_epi8(eob_extra4_7, shuf2); |
| 1449 | __m256i eob_rate16_31 = _mm256_shuffle_epi8(eob_cost4_7, shuf3); |
| 1450 | eob_rate16_31 = _mm256_add_epi16(eob_rate16_31, eob_extra16_31); |
| 1451 | __m256i bit_cost16_31 = _mm256_set1_epi16(3 * BC1); |
| 1452 | eob_rate16_31 = _mm256_add_epi16(eob_rate16_31, bit_cost16_31); |
| 1453 | _mm256_storeu_si256((__m256i *)&block_eob_rate[16], eob_rate16_31); |
| 1454 | |
| 1455 | // Write costs beyond position 32, block_eob_rate[32+] |
| 1456 | int scan_pos = 32; |
| 1457 | int n_offset_bits = 4; |
| 1458 | while (scan_pos < eob) { |
| 1459 | int eob_pt_rate = tbl_eob_cost[2 + n_offset_bits]; |
| 1460 | for (int bit = 0; bit < 2; bit++) { |
| 1461 | int eob_ctx = n_offset_bits; |
| 1462 | int extra_bit_rate = tbl_eob_extra[eob_ctx][bit]; |
| 1463 | int eob_rate = |
| 1464 | eob_pt_rate + extra_bit_rate + av1_cost_literal(n_offset_bits); |
| 1465 | for (int i = 0; i < (1 << n_offset_bits); i += 16) { |
| 1466 | __m256i rate = _mm256_set1_epi16(eob_rate); |
| 1467 | _mm256_storeu_si256((__m256i *)&block_eob_rate[scan_pos], rate); |
| 1468 | scan_pos += 16; |
| 1469 | } |
| 1470 | } |
| 1471 | n_offset_bits++; |
| 1472 | } |
| 1473 | } |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1474 | |
| 1475 | static AOM_FORCE_INLINE int get_dqv(const int32_t *dequant, int coeff_idx, |
| 1476 | const qm_val_t *iqmatrix) { |
| 1477 | int dqv = dequant[!!coeff_idx]; |
| 1478 | if (iqmatrix != NULL) |
| 1479 | dqv = |
| 1480 | ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; |
| 1481 | return dqv; |
| 1482 | } |
| 1483 | |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1484 | int av1_find_best_path_avx2(const struct tcq_node_t *trellis, int n_states_log2, |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1485 | const int16_t *scan, const int32_t *dequant, |
| 1486 | const qm_val_t *iqmatrix, const tran_low_t *tcoeff, |
| 1487 | int first_scan_pos, int log_scale, |
| 1488 | tran_low_t *qcoeff, tran_low_t *dqcoeff, |
| 1489 | int *min_rate, int64_t *min_cost) { |
| 1490 | // Select best trellis state. |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1491 | int n_states = 1 << n_states_log2; |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1492 | int64_t min_path_cost = INT64_MAX; |
| 1493 | int trel_min_rate = INT32_MAX; |
| 1494 | int prev_id = -2; |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1495 | for (int state = 0; state < n_states; state++) { |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1496 | const tcq_node_t *decision = &trellis[state]; |
| 1497 | if (decision->rdCost < min_path_cost) { |
| 1498 | prev_id = state; |
| 1499 | min_path_cost = decision->rdCost; |
| 1500 | trel_min_rate = decision->rate; |
| 1501 | } |
| 1502 | } |
| 1503 | |
| 1504 | // Backtrack to reconstruct qcoeff / dqcoeff blocks. |
| 1505 | int scan_pos = 0; |
| 1506 | if (!iqmatrix) { |
| 1507 | __m128i dqv = _mm_loadu_si64(dequant); |
| 1508 | __m128i dqv_ac = _mm_srli_si128(dqv, 4); |
| 1509 | __m128i zero = _mm_setzero_si128(); |
| 1510 | __m128i round = _mm_set1_epi64x(1 << (QUANT_TABLE_BITS - 1)); |
| 1511 | int shift = QUANT_TABLE_BITS + log_scale; |
| 1512 | for (; prev_id >= 0; scan_pos++) { |
| 1513 | const int32_t *decision = |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1514 | (int32_t *)&trellis[(scan_pos << n_states_log2) + prev_id]; |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1515 | __m128i info = _mm_loadu_si64(&decision[3]); |
| 1516 | int blk_pos = scan[scan_pos]; |
| 1517 | __m128i sign = _mm_loadu_si64(&tcoeff[blk_pos]); |
| 1518 | sign = _mm_srai_epi32(sign, 31); |
| 1519 | __m128i abs_lev = _mm_slli_epi32(info, 8); |
| 1520 | __m128i abs_lev2 = _mm_srli_epi32(abs_lev, 7); |
| 1521 | abs_lev = _mm_srli_epi32(abs_lev, 8); |
Joe Young | bd3d157 | 2024-09-12 12:57:14 -0700 | [diff] [blame] | 1522 | __m128i dq = _mm_slli_epi32(info, 6); |
| 1523 | dq = _mm_srli_epi32(dq, 31); |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1524 | __m128i dq_mask = _mm_srai_epi32(info, 31); |
| 1525 | dq = _mm_andnot_si128(dq_mask, dq); |
| 1526 | abs_lev2 = _mm_sub_epi32(abs_lev2, dq); |
| 1527 | abs_lev2 = _mm_max_epi32(abs_lev2, zero); |
| 1528 | __m128i dqc = _mm_mul_epi32(abs_lev2, dqv); |
| 1529 | dqc = _mm_add_epi64(dqc, round); |
| 1530 | dqc = _mm_srli_epi64(dqc, shift); |
| 1531 | dqc = _mm_xor_si128(dqc, sign); |
| 1532 | dqc = _mm_sub_epi32(dqc, sign); |
| 1533 | __m128i lev = _mm_xor_si128(abs_lev, sign); |
| 1534 | lev = _mm_sub_epi32(lev, sign); |
| 1535 | #if 1 |
| 1536 | // Older compilers don't implement _mm_storeu_si32() |
| 1537 | _mm_store_ss((float *)&qcoeff[blk_pos], _mm_castsi128_ps(lev)); |
| 1538 | _mm_store_ss((float *)&dqcoeff[blk_pos], _mm_castsi128_ps(dqc)); |
| 1539 | #else |
| 1540 | _mm_storeu_si32(&qcoeff[blk_pos], lev); |
| 1541 | _mm_storeu_si32(&dqcoeff[blk_pos], dqc); |
| 1542 | #endif |
| 1543 | dqv = dqv_ac; |
| 1544 | __m128i prevId = _mm_srai_epi32(info, 24); |
| 1545 | prev_id = _mm_extract_epi32(prevId, 0); |
| 1546 | } |
| 1547 | } else { |
| 1548 | for (; prev_id >= 0; scan_pos++) { |
Joe Young | 3db806d | 2024-09-17 09:47:11 -0700 | [diff] [blame^] | 1549 | const tcq_node_t *decision = |
| 1550 | &trellis[(scan_pos << n_states_log2) + prev_id]; |
Joe Young | 3356443 | 2024-08-23 15:21:54 -0700 | [diff] [blame] | 1551 | prev_id = decision->prevId; |
| 1552 | int abs_level = decision->absLevel; |
| 1553 | int blk_pos = scan[scan_pos]; |
| 1554 | int sign = tcoeff[blk_pos] < 0; |
| 1555 | qcoeff[blk_pos] = sign ? -abs_level : abs_level; |
| 1556 | int dqv = get_dqv(dequant, blk_pos, iqmatrix); |
| 1557 | int dq = prev_id >= 0 ? tcq_quant(prev_id) : 0; |
| 1558 | int qc = (abs_level == 0) ? 0 : (2 * abs_level - dq); |
| 1559 | int dqc = (tran_low_t)ROUND_POWER_OF_TWO_64((tran_high_t)qc * dqv, |
| 1560 | QUANT_TABLE_BITS) >> |
| 1561 | log_scale; |
| 1562 | dqcoeff[blk_pos] = sign ? -dqc : dqc; |
| 1563 | } |
| 1564 | } |
| 1565 | int eob = scan_pos; |
| 1566 | |
| 1567 | for (; scan_pos <= first_scan_pos; scan_pos++) { |
| 1568 | int blk_pos = scan[scan_pos]; |
| 1569 | qcoeff[blk_pos] = 0; |
| 1570 | dqcoeff[blk_pos] = 0; |
| 1571 | } |
| 1572 | |
| 1573 | *min_rate = trel_min_rate; |
| 1574 | *min_cost = min_path_cost; |
| 1575 | return eob; |
| 1576 | } |