/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */
12
13#include <assert.h>
14#include <immintrin.h> /* AVX2 */
15
16#include "aom/aom_integer.h"
17#include "aom_dsp/x86/mem_sse2.h"
18#include "av1/common/av1_common_int.h"
19#include "av1/common/quant_common.h"
20#include "av1/encoder/trellis_quant.h"
21#include "aom_dsp/x86/synonyms.h"
22#include "aom_dsp/x86/synonyms_avx2.h"
23
Joe Young3db806d2024-09-17 09:47:11 -070024// av1_decide_states_*() constants.
25static const int32_t kShuffle[8] = { 0, 2, 1, 3, 5, 7, 4, 6 };
26static const int32_t kPrevId[TCQ_MAX_STATES / 4][8] = {
27 { 0, 0 << 24, 0, 1 << 24, 0, 2 << 24, 0, 3 << 24 },
28 { 0, 4 << 24, 0, 5 << 24, 0, 6 << 24, 0, 7 << 24 },
29};
30
31// av1_calc_lf_ctx_*() constants.
32// Neighbor mask for calculating context sum (base/mid).
33#define M MAX_VAL_BR_CTX
34static const int8_t kNbrMask[4][32] = {
35 { 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // diag 0
36 M, M, 0, M, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
37 { 0, 5, 5, 0, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, // diag 1
38 0, M, M, 0, 0, M, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
39 { 0, 0, 5, 5, 0, 0, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, // diag 2
40 0, 0, M, M, 0, 0, 0, M, 0, 0, 0, 0, 0, 0, 0, 0 },
41 { 0, 0, 0, 5, 5, 0, 0, 0, 5, 5, 5, 0, 0, 0, 0, 0, // diag 3
42 0, 0, 0, M, M, 0, 0, 0, 0, M, 0, 0, 0, 0, 0, 0 },
43};
44static const int8_t kMaxCtx[16] = { 8, 6, 6, 4, 4, 4, 4, 4,
45 4, 4, 4, 4, 4, 4, 4, 4 };
46static const int8_t kScanDiag[MAX_LF_SCAN] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
47
Joe Young463ba7f2024-06-25 14:27:02 -070048void av1_decide_states_avx2(const struct tcq_node_t *prev,
Joe Youngee165d22024-08-19 10:57:07 -070049 const struct tcq_rate_t *rd,
Joe Young3db806d2024-09-17 09:47:11 -070050 const struct prequant_t *pq, int n_states,
51 int limits, int try_eob, int64_t rdmult,
Joe Youngee165d22024-08-19 10:57:07 -070052 struct tcq_node_t *decision) {
Joe Young463ba7f2024-06-25 14:27:02 -070053 (void)limits;
Joe Young463ba7f2024-06-25 14:27:02 -070054 assert((rdmult >> 32) == 0);
55 assert(sizeof(tcq_node_t) == 16);
56
57 __m256i c_rdmult = _mm256_set1_epi64x(rdmult);
58 __m256i c_round = _mm256_set1_epi64x(1 << (AV1_PROB_COST_SHIFT - 1));
59 __m256i c_zero = _mm256_setzero_si256();
60
61 // Gather absolute coeff level for 4 possible quant options.
62 __m128i abslev0123 = _mm_lddqu_si128((__m128i *)pq->absLevel);
63 __m256i abslev0231 =
64 _mm256_castsi128_si256(_mm_shuffle_epi32(abslev0123, 0x78));
65 __m256i abslev02023131 = _mm256_permute4x64_epi64(abslev0231, 0x50);
66 __m256i abslev00223311 = _mm256_shuffle_epi32(abslev02023131, 0x50);
67 __m256i abslev0033 = _mm256_unpacklo_epi32(c_zero, abslev00223311);
68 __m256i abslev2211 = _mm256_unpackhi_epi32(c_zero, abslev00223311);
69
Joe Young3db806d2024-09-17 09:47:11 -070070 __m256i *out_a = (__m256i *)&decision[0];
71 __m256i *out_b = (__m256i *)&decision[n_states >> 1];
72
73 for (int i = 0; i < n_states >> 2; i++) {
Joe Youngee165d22024-08-19 10:57:07 -070074 // Load distortion.
75 __m256i dist = _mm256_lddqu_si256((__m256i *)&pq->deltaDist[0]);
76 dist = _mm256_slli_epi64(dist, RDDIV_BITS);
77 __m256i dist0033 = _mm256_permute4x64_epi64(dist, 0xF0);
78 __m256i dist2211 = _mm256_permute4x64_epi64(dist, 0x5A);
Joe Young463ba7f2024-06-25 14:27:02 -070079
80 // Calc rate-distortion costs for each pair of even/odd quant.
81 // Separate candidates into even and odd quant decisions
82 // Even indexes: { 0, 2, 5, 7 }. Odd: { 1, 3, 4, 6 }.
Joe Youngee165d22024-08-19 10:57:07 -070083 __m256i rates = _mm256_lddqu_si256((__m256i *)&rd->rate[8 * i]);
Joe Young463ba7f2024-06-25 14:27:02 -070084 __m256i permute_mask = _mm256_lddqu_si256((__m256i *)kShuffle);
85 __m256i rate02135746 = _mm256_permutevar8x32_epi32(rates, permute_mask);
86 __m256i rate0257 = _mm256_unpacklo_epi32(rate02135746, c_zero);
87 __m256i rate1346 = _mm256_unpackhi_epi32(rate02135746, c_zero);
88 __m256i rdcost0257 = _mm256_mul_epu32(c_rdmult, rate0257);
89 __m256i rdcost1346 = _mm256_mul_epu32(c_rdmult, rate1346);
90 rdcost0257 = _mm256_add_epi64(rdcost0257, c_round);
91 rdcost1346 = _mm256_add_epi64(rdcost1346, c_round);
92 rdcost0257 = _mm256_srli_epi64(rdcost0257, AV1_PROB_COST_SHIFT);
93 rdcost1346 = _mm256_srli_epi64(rdcost1346, AV1_PROB_COST_SHIFT);
Joe Youngee165d22024-08-19 10:57:07 -070094 rdcost0257 = _mm256_add_epi64(rdcost0257, dist0033);
95 rdcost1346 = _mm256_add_epi64(rdcost1346, dist2211);
Joe Young463ba7f2024-06-25 14:27:02 -070096
97 // Calc rd-cost for zero quant.
Joe Youngee165d22024-08-19 10:57:07 -070098 __m256i ratezero = _mm256_castsi128_si256(
99 _mm_lddqu_si128((__m128i *)&rd->rate_zero[4 * i]));
Joe Young463ba7f2024-06-25 14:27:02 -0700100 ratezero = _mm256_permute4x64_epi64(ratezero, 0x50);
101 ratezero = _mm256_unpacklo_epi32(ratezero, c_zero);
102 __m256i rdcostzero = _mm256_mul_epu32(c_rdmult, ratezero);
103 rdcostzero = _mm256_add_epi64(rdcostzero, c_round);
104 rdcostzero = _mm256_srli_epi64(rdcostzero, AV1_PROB_COST_SHIFT);
105
106 // Add previous state rdCost to rdcostzero
107 __m256i state01 = _mm256_lddqu_si256((__m256i *)&prev[4 * i]);
108 __m256i state23 = _mm256_lddqu_si256((__m256i *)&prev[4 * i + 2]);
109 __m256i state02 = _mm256_permute2x128_si256(state01, state23, 0x20);
110 __m256i state13 = _mm256_permute2x128_si256(state01, state23, 0x31);
111 __m256i prevrd0123 = _mm256_unpacklo_epi64(state02, state13);
112 __m256i prevrate0123 = _mm256_unpackhi_epi64(state02, state13);
113 prevrate0123 = _mm256_slli_epi64(prevrate0123, 32);
114 prevrate0123 = _mm256_srli_epi64(prevrate0123, 32);
115
116 // Compare rd costs (Zero vs Even).
117 __m256i use_zero = _mm256_cmpgt_epi64(rdcost0257, rdcostzero);
118 rdcost0257 = _mm256_blendv_epi8(rdcost0257, rdcostzero, use_zero);
119 rate0257 = _mm256_blendv_epi8(rate0257, ratezero, use_zero);
120 __m256i abslev_even = _mm256_andnot_si256(use_zero, abslev0033);
121
122 // Add previous state rdCost to current rdcost
123 rdcost0257 = _mm256_add_epi64(rdcost0257, prevrd0123);
124 rdcost1346 = _mm256_add_epi64(rdcost1346, prevrd0123);
125 rate0257 = _mm256_add_epi64(rate0257, prevrate0123);
126 rate1346 = _mm256_add_epi64(rate1346, prevrate0123);
127
128 // Compare rd costs (Even vs Odd).
129 __m256i rdcost3164 = _mm256_shuffle_epi32(rdcost1346, 0x4E);
130 __m256i rate3164 = _mm256_shuffle_epi32(rate1346, 0x4E);
131 __m256i use_odd = _mm256_cmpgt_epi64(rdcost0257, rdcost3164);
Joe Young463ba7f2024-06-25 14:27:02 -0700132 __m256i use_odd_1 = _mm256_slli_epi64(_mm256_srli_epi64(use_odd, 63), 56);
Joe Young3db806d2024-09-17 09:47:11 -0700133 __m256i prev_id = _mm256_lddqu_si256((__m256i *)kPrevId[i]);
Joe Young463ba7f2024-06-25 14:27:02 -0700134 prev_id = _mm256_xor_si256(prev_id, use_odd_1);
135 __m256i rdcost_best = _mm256_blendv_epi8(rdcost0257, rdcost3164, use_odd);
136 __m256i rate_best = _mm256_blendv_epi8(rate0257, rate3164, use_odd);
137 __m256i abslev_best = _mm256_blendv_epi8(abslev_even, abslev2211, use_odd);
138
Joe Youngee165d22024-08-19 10:57:07 -0700139 // Compare rd costs (best vs new eob).
140 __m256i rate_eob = _mm256_castsi128_si256(_mm_loadu_si64(rd->rate_eob));
141 rate_eob = _mm256_unpacklo_epi32(rate_eob, c_zero);
142 __m256i rdcost_eob = _mm256_mul_epu32(c_rdmult, rate_eob);
143 rdcost_eob = _mm256_add_epi64(rdcost_eob, c_round);
144 rdcost_eob = _mm256_srli_epi64(rdcost_eob, AV1_PROB_COST_SHIFT);
145 __m256i dist_eob = _mm256_unpacklo_epi64(dist0033, dist2211);
146 rdcost_eob = _mm256_add_epi64(rdcost_eob, dist_eob);
147 __m128i mask_eob0 = _mm_set1_epi64x((int64_t)-try_eob);
148 __m256i mask_eob = _mm256_inserti128_si256(c_zero, mask_eob0, 0);
149 __m256i use_eob = _mm256_cmpgt_epi64(rdcost_best, rdcost_eob);
150 use_eob = _mm256_and_si256(use_eob, mask_eob);
151 __m256i use_eob_1 = _mm256_slli_epi64(use_eob, 56);
152 prev_id = _mm256_or_si256(prev_id, use_eob_1);
153 rdcost_best = _mm256_blendv_epi8(rdcost_best, rdcost_eob, use_eob);
154 rate_best = _mm256_blendv_epi8(rate_best, rate_eob, use_eob);
155 __m256i abslev_eob = _mm256_unpacklo_epi64(abslev0033, abslev2211);
156 abslev_best = _mm256_blendv_epi8(abslev_best, abslev_eob, use_eob);
157 try_eob = 0;
158
Joe Young463ba7f2024-06-25 14:27:02 -0700159 // Pack and store state info.
160 __m256i info_best = _mm256_or_si256(rate_best, abslev_best);
161 info_best = _mm256_or_si256(info_best, prev_id);
162 __m256i info01 = _mm256_unpacklo_epi64(rdcost_best, info_best);
163 __m256i info23 = _mm256_unpackhi_epi64(rdcost_best, info_best);
Joe Young3db806d2024-09-17 09:47:11 -0700164 _mm256_storeu_si256(out_a, info01);
165 _mm256_storeu_si256(out_b, info23);
166 out_a = (__m256i *)&decision[6];
167 out_b = (__m256i *)&decision[2];
Joe Young463ba7f2024-06-25 14:27:02 -0700168 }
169}
170
Joe Young3db806d2024-09-17 09:47:11 -0700171void av1_decide_states_st4_avx2(const struct tcq_node_t *prev,
172 const struct tcq_rate_t *rd,
173 const struct prequant_t *pq, int n_states,
174 int limits, int try_eob, int64_t rdmult,
175 struct tcq_node_t *decision) {
176 (void)limits;
177 (void)n_states;
178 assert(n_states == 4);
179 assert((rdmult >> 32) == 0);
180 assert(sizeof(tcq_node_t) == 16);
181
182 int i = 0;
183
184 __m256i c_rdmult = _mm256_set1_epi64x(rdmult);
185 __m256i c_round = _mm256_set1_epi64x(1 << (AV1_PROB_COST_SHIFT - 1));
186 __m256i c_zero = _mm256_setzero_si256();
187
188 // Gather absolute coeff level for 4 possible quant options.
189 __m128i abslev0123 = _mm_lddqu_si128((__m128i *)pq->absLevel);
190 __m256i abslev0231 =
191 _mm256_castsi128_si256(_mm_shuffle_epi32(abslev0123, 0x78));
192 __m256i abslev02023131 = _mm256_permute4x64_epi64(abslev0231, 0x50);
193 __m256i abslev00223311 = _mm256_shuffle_epi32(abslev02023131, 0x50);
194 __m256i abslev0033 = _mm256_unpacklo_epi32(c_zero, abslev00223311);
195 __m256i abslev2211 = _mm256_unpackhi_epi32(c_zero, abslev00223311);
196
197 // Load distortion.
198 __m256i dist = _mm256_lddqu_si256((__m256i *)&pq->deltaDist[0]);
199 dist = _mm256_slli_epi64(dist, RDDIV_BITS);
200 __m256i dist0033 = _mm256_permute4x64_epi64(dist, 0xF0);
201 __m256i dist2211 = _mm256_permute4x64_epi64(dist, 0x5A);
202
203 // Calc rate-distortion costs for each pair of even/odd quant.
204 // Separate candidates into even and odd quant decisions
205 // Even indexes: { 0, 2, 5, 7 }. Odd: { 1, 3, 4, 6 }.
206 __m256i rates = _mm256_lddqu_si256((__m256i *)&rd->rate[8 * i]);
207 __m256i permute_mask = _mm256_lddqu_si256((__m256i *)kShuffle);
208 __m256i rate02135746 = _mm256_permutevar8x32_epi32(rates, permute_mask);
209 __m256i rate0257 = _mm256_unpacklo_epi32(rate02135746, c_zero);
210 __m256i rate1346 = _mm256_unpackhi_epi32(rate02135746, c_zero);
211 __m256i rdcost0257 = _mm256_mul_epu32(c_rdmult, rate0257);
212 __m256i rdcost1346 = _mm256_mul_epu32(c_rdmult, rate1346);
213 rdcost0257 = _mm256_add_epi64(rdcost0257, c_round);
214 rdcost1346 = _mm256_add_epi64(rdcost1346, c_round);
215 rdcost0257 = _mm256_srli_epi64(rdcost0257, AV1_PROB_COST_SHIFT);
216 rdcost1346 = _mm256_srli_epi64(rdcost1346, AV1_PROB_COST_SHIFT);
217 rdcost0257 = _mm256_add_epi64(rdcost0257, dist0033);
218 rdcost1346 = _mm256_add_epi64(rdcost1346, dist2211);
219
220 // Calc rd-cost for zero quant.
221 __m256i ratezero =
222 _mm256_castsi128_si256(_mm_lddqu_si128((__m128i *)&rd->rate_zero[4 * i]));
223 ratezero = _mm256_permute4x64_epi64(ratezero, 0x50);
224 ratezero = _mm256_unpacklo_epi32(ratezero, c_zero);
225 __m256i rdcostzero = _mm256_mul_epu32(c_rdmult, ratezero);
226 rdcostzero = _mm256_add_epi64(rdcostzero, c_round);
227 rdcostzero = _mm256_srli_epi64(rdcostzero, AV1_PROB_COST_SHIFT);
228
229 // Add previous state rdCost to rdcostzero
230 __m256i state01 = _mm256_lddqu_si256((__m256i *)&prev[4 * i]);
231 __m256i state23 = _mm256_lddqu_si256((__m256i *)&prev[4 * i + 2]);
232 __m256i state02 = _mm256_permute2x128_si256(state01, state23, 0x20);
233 __m256i state13 = _mm256_permute2x128_si256(state01, state23, 0x31);
234 __m256i prevrd0123 = _mm256_unpacklo_epi64(state02, state13);
235 __m256i prevrate0123 = _mm256_unpackhi_epi64(state02, state13);
236 prevrate0123 = _mm256_slli_epi64(prevrate0123, 32);
237 prevrate0123 = _mm256_srli_epi64(prevrate0123, 32);
238
239 // Compare rd costs (Zero vs Even).
240 __m256i use_zero = _mm256_cmpgt_epi64(rdcost0257, rdcostzero);
241 rdcost0257 = _mm256_blendv_epi8(rdcost0257, rdcostzero, use_zero);
242 rate0257 = _mm256_blendv_epi8(rate0257, ratezero, use_zero);
243 __m256i abslev_even = _mm256_andnot_si256(use_zero, abslev0033);
244
245 // Add previous state rdCost to current rdcost
246 rdcost0257 = _mm256_add_epi64(rdcost0257, prevrd0123);
247 rdcost1346 = _mm256_add_epi64(rdcost1346, prevrd0123);
248 rate0257 = _mm256_add_epi64(rate0257, prevrate0123);
249 rate1346 = _mm256_add_epi64(rate1346, prevrate0123);
250
251 // Compare rd costs (Even vs Odd).
252 __m256i rdcost3164 = _mm256_shuffle_epi32(rdcost1346, 0x4E);
253 __m256i rate3164 = _mm256_shuffle_epi32(rate1346, 0x4E);
254 __m256i use_odd = _mm256_cmpgt_epi64(rdcost0257, rdcost3164);
255 __m256i use_odd_1 = _mm256_slli_epi64(_mm256_srli_epi64(use_odd, 63), 56);
256 __m256i prev_id = _mm256_lddqu_si256((__m256i *)kPrevId[i]);
257 prev_id = _mm256_xor_si256(prev_id, use_odd_1);
258 __m256i rdcost_best = _mm256_blendv_epi8(rdcost0257, rdcost3164, use_odd);
259 __m256i rate_best = _mm256_blendv_epi8(rate0257, rate3164, use_odd);
260 __m256i abslev_best = _mm256_blendv_epi8(abslev_even, abslev2211, use_odd);
261
262 // Compare rd costs (best vs new eob).
263 __m256i rate_eob = _mm256_castsi128_si256(_mm_loadu_si64(rd->rate_eob));
264 rate_eob = _mm256_unpacklo_epi32(rate_eob, c_zero);
265 __m256i rdcost_eob = _mm256_mul_epu32(c_rdmult, rate_eob);
266 rdcost_eob = _mm256_add_epi64(rdcost_eob, c_round);
267 rdcost_eob = _mm256_srli_epi64(rdcost_eob, AV1_PROB_COST_SHIFT);
268 __m256i dist_eob = _mm256_unpacklo_epi64(dist0033, dist2211);
269 rdcost_eob = _mm256_add_epi64(rdcost_eob, dist_eob);
270 __m128i mask_eob0 = _mm_set1_epi64x((int64_t)-try_eob);
271 __m256i mask_eob = _mm256_inserti128_si256(c_zero, mask_eob0, 0);
272 __m256i use_eob = _mm256_cmpgt_epi64(rdcost_best, rdcost_eob);
273 use_eob = _mm256_and_si256(use_eob, mask_eob);
274 __m256i use_eob_1 = _mm256_slli_epi64(use_eob, 56);
275 prev_id = _mm256_or_si256(prev_id, use_eob_1);
276 rdcost_best = _mm256_blendv_epi8(rdcost_best, rdcost_eob, use_eob);
277 rate_best = _mm256_blendv_epi8(rate_best, rate_eob, use_eob);
278 __m256i abslev_eob = _mm256_unpacklo_epi64(abslev0033, abslev2211);
279 abslev_best = _mm256_blendv_epi8(abslev_best, abslev_eob, use_eob);
280
281 // Pack and store state info.
282 __m256i info_best = _mm256_or_si256(rate_best, abslev_best);
283 info_best = _mm256_or_si256(info_best, prev_id);
284 __m256i info01 = _mm256_unpacklo_epi64(rdcost_best, info_best);
285 __m256i info23 = _mm256_unpackhi_epi64(rdcost_best, info_best);
286 __m256i *out_a = (__m256i *)&decision[0];
287 __m256i *out_b = (__m256i *)&decision[2];
288 _mm256_storeu_si256(out_a, info01);
289 _mm256_storeu_si256(out_b, info23);
290}
291
Joe Young463ba7f2024-06-25 14:27:02 -0700292void av1_pre_quant_avx2(tran_low_t tqc, struct prequant_t *pqData,
293 const int32_t *quant_ptr, int dqv, int log_scale,
294 int scan_pos) {
295 static const int32_t kInc[4][4] = {
296 { 0, 1, 2, 3 }, { 3, 0, 1, 2 }, { 2, 3, 0, 1 }, { 1, 2, 3, 0 }
297 };
298
299 // calculate qIdx
300 int shift = 16 - log_scale + QUANT_FP_BITS;
301 int32_t add = -((2 << shift) >> 1);
302 int32_t abs_tqc = abs(tqc);
303
304 int32_t qIdx = (int)AOMMAX(
305 1, AOMMIN(((1 << 16) - 1),
306 ((int64_t)abs_tqc * quant_ptr[scan_pos != 0] + add) >> shift));
307 pqData->qIdx = qIdx;
308
309 __m256i c_zero = _mm256_setzero_si256();
310 __m128i base_qc = _mm_set1_epi32(qIdx);
311 __m128i qc_inc = _mm_lddqu_si128((__m128i *)kInc[qIdx & 3]);
312 __m128i qc_idx = _mm_add_epi32(base_qc, qc_inc);
313 __m128i one = _mm_set1_epi32(1);
314 __m128i abslev = _mm_add_epi32(qc_idx, one);
315 abslev = _mm_srli_epi32(abslev, 1);
316 _mm_storeu_si128((__m128i *)pqData->absLevel, abslev);
317
318 __m256i qc_idx1 = _mm256_castsi128_si256(qc_idx);
319 __m256i qc_idx_01012323 = _mm256_permute4x64_epi64(qc_idx1, 0x50);
320 __m256i qc_idx_0123 = _mm256_unpacklo_epi32(qc_idx_01012323, c_zero);
321 __m256i c_dqv = _mm256_set1_epi64x(dqv);
322 __m256i qc_mul_dqv = _mm256_mul_epu32(qc_idx_0123, c_dqv);
323 __m256i dq_round = _mm256_set1_epi64x(1 << (QUANT_TABLE_BITS - 1));
324 __m256i qc_mul_dqv_rnd = _mm256_add_epi64(qc_mul_dqv, dq_round);
325 __m256i dq_shift = _mm256_set1_epi64x(log_scale + QUANT_TABLE_BITS);
326 __m256i dqc = _mm256_srlv_epi64(qc_mul_dqv_rnd, dq_shift);
327
328 __m256i abs_tqc_sh = _mm256_set1_epi64x(abs_tqc << (log_scale - 1));
329 __m256i dist0 = _mm256_mul_epi32(abs_tqc_sh, abs_tqc_sh);
330 __m256i scale_shift = _mm256_set1_epi64x(log_scale - 1);
331 __m256i dqc_sh = _mm256_sllv_epi32(dqc, scale_shift);
332 __m256i diff = _mm256_sub_epi32(dqc_sh, abs_tqc_sh);
333 __m256i dist = _mm256_mul_epi32(diff, diff);
334 dist = _mm256_sub_epi64(dist, dist0);
335 _mm256_storeu_si256((__m256i *)pqData->deltaDist, dist);
336}
337
Joe Young3db806d2024-09-17 09:47:11 -0700338void av1_update_states_avx2(tcq_node_t *decision, int scan_idx, int n_states,
Joe Young089e2f82024-08-23 13:51:27 -0700339 const struct tcq_ctx_t *cur_ctx,
340 struct tcq_ctx_t *nxt_ctx) {
Joe Young3db806d2024-09-17 09:47:11 -0700341 for (int i = 0; i < n_states; i++) {
Joe Youngaf03d882024-08-13 11:11:36 -0700342 int prevId = decision[i].prevId;
343 int absLevel = decision[i].absLevel;
Joe Young089e2f82024-08-23 13:51:27 -0700344 if (prevId >= 0) {
345 memcpy(&nxt_ctx[i], &cur_ctx[prevId], sizeof(tcq_ctx_t));
346 } else {
Joe Youngaf03d882024-08-13 11:11:36 -0700347 // New EOB; reset contexts
Joe Young089e2f82024-08-23 13:51:27 -0700348 memset(&nxt_ctx[i], 0, sizeof(tcq_ctx_t));
349 nxt_ctx[i].orig_id = -1;
Joe Youngaf03d882024-08-13 11:11:36 -0700350 }
Joe Young089e2f82024-08-23 13:51:27 -0700351 nxt_ctx[i].lev[scan_idx] = AOMMIN(absLevel, INT8_MAX);
Joe Youngaf03d882024-08-13 11:11:36 -0700352 }
353}
354
Joe Young463ba7f2024-06-25 14:27:02 -0700355void av1_calc_diag_ctx_avx2(int scan_hi, int scan_lo, int bwl,
356 const uint8_t *prev_levels, const int16_t *scan,
357 uint8_t *ctx) {
358#define M MAX_VAL_BR_CTX
359 static const int8_t kClip[2][16] = {
360 { 0, 0, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 3, 0, 0 },
361 { 0, 0, M, 0, M, M, 0, 0, M, 0, M, M, 0, 0, 0, 0 },
362 };
363#undef M
364 int n_ctx = scan_hi - scan_lo + 1;
365 __m128i zero = _mm_setzero_si128();
366 __m128i one = _mm_set1_epi8(1);
367 __m128i four = _mm_set1_epi8(4);
368 __m128i six = _mm_set1_epi8(6);
369 __m128i clip = _mm_lddqu_si128((__m128i *)&kClip[0][0]);
370 __m128i clip_mid = _mm_lddqu_si128((__m128i *)&kClip[1][0]);
371
372 int blk_pos = scan[scan_lo];
373 int row_inc = (1 << bwl) + (1 << TX_PAD_HOR_LOG2) - 1;
374 const uint8_t *row_ptr = prev_levels + get_padded_idx(blk_pos, bwl) + 1;
375 const uint8_t *min_row_ptr = prev_levels;
376 __m128i nbr2 = _mm_loadu_si64(&row_ptr[row_inc]);
377 __m128i nbr3 = _mm_loadu_si64(&row_ptr[2 * row_inc]);
378 __m128i nbr23 = _mm_unpacklo_epi16(nbr2, nbr3);
379
380 for (int i = 0; i < n_ctx; i += 2) {
381 const uint8_t *p1 = AOMMAX(min_row_ptr, &row_ptr[-row_inc]);
382 __m128i nbr0 = _mm_loadu_si64(p1);
383 __m128i nbr1 = _mm_loadu_si64(&row_ptr[0]);
384 __m128i nbr01 = _mm_unpacklo_epi16(nbr0, nbr1);
385 __m128i nbr0123 = _mm_unpacklo_epi32(nbr01, nbr23);
386 __m128i nbr = _mm_unpacklo_epi64(nbr0123, nbr0123);
387 __m128i nbr_max = _mm_min_epu8(nbr, clip);
388 __m128i sum = _mm_maddubs_epi16(nbr_max, one);
389 sum = _mm_hadd_epi16(sum, zero);
390 sum = _mm_hadd_epi16(sum, zero);
391 __m128i coeff_ctx = _mm_packs_epi16(sum, sum);
392 coeff_ctx = _mm_avg_epu8(coeff_ctx, zero);
393 coeff_ctx = _mm_min_epi8(coeff_ctx, four);
394 __m128i nbr_max_mid = _mm_min_epu8(nbr, clip_mid);
395 __m128i sum_mid = _mm_maddubs_epi16(nbr_max_mid, one);
396 sum_mid = _mm_hadd_epi16(sum_mid, zero);
397 sum_mid = _mm_hadd_epi16(sum_mid, zero);
398 __m128i coeff_mid_ctx = _mm_packs_epi16(sum_mid, sum_mid);
399 coeff_mid_ctx = _mm_avg_epu8(coeff_mid_ctx, zero);
400 coeff_mid_ctx = _mm_min_epi8(coeff_mid_ctx, six);
401 coeff_mid_ctx = _mm_slli_epi16(coeff_mid_ctx, 4);
402 coeff_ctx = _mm_add_epi8(coeff_ctx, coeff_mid_ctx);
403 uint16_t ctx01 = _mm_extract_epi16(coeff_ctx, 0);
404 uint16_t ctx1 = ctx01 >> 8;
405 ctx[i] = (uint8_t)ctx01;
406 ctx[i + 1] = ctx1;
407 row_ptr -= 2 * row_inc;
408 nbr23 = nbr01;
409 }
410}
411
Joe Youngbdde8682024-08-07 09:33:48 -0700412static INLINE int get_mid_cost_def(tran_low_t abs_qc, int coeff_ctx,
413 const LV_MAP_COEFF_COST *txb_costs,
414 int plane, int t_sign, int sign) {
Joe Young463ba7f2024-06-25 14:27:02 -0700415 int cost = 0;
Joe Youngbdde8682024-08-07 09:33:48 -0700416 if (plane == AOM_PLANE_V) {
417 cost += txb_costs->v_ac_sign_cost[t_sign][sign] - av1_cost_literal(1);
418 }
Joe Young463ba7f2024-06-25 14:27:02 -0700419 if (abs_qc > NUM_BASE_LEVELS) {
420 int mid_ctx = coeff_ctx >> 4;
Joe Youngbdde8682024-08-07 09:33:48 -0700421 if (plane == 0) {
Joe Younge051e842024-08-28 14:27:48 -0700422 cost += get_br_cost_tcq(abs_qc, txb_costs->lps_cost[mid_ctx]);
Joe Youngbdde8682024-08-07 09:33:48 -0700423 } else {
Joe Younge051e842024-08-28 14:27:48 -0700424 cost += get_br_cost_tcq(abs_qc, txb_costs->lps_cost_uv[mid_ctx]);
Joe Youngbdde8682024-08-07 09:33:48 -0700425 }
Joe Young463ba7f2024-06-25 14:27:02 -0700426 }
427 return cost;
428}
429
Joe Youngee165d22024-08-19 10:57:07 -0700430static INLINE int get_mid_cost_eob(int ci, int limits, int is_dc,
431 tran_low_t abs_qc, int sign, int dc_sign_ctx,
Joe Young19695f32024-08-16 15:12:02 -0700432 const LV_MAP_COEFF_COST *txb_costs,
Joe Youngee165d22024-08-19 10:57:07 -0700433 TX_CLASS tx_class, int32_t t_sign,
434 int plane) {
Joe Young19695f32024-08-16 15:12:02 -0700435 int cost = 0;
436 const int dc_ph_group = 0; // PH disabled
437
438 if (limits) {
439 if (is_dc) {
440 cost -= av1_cost_literal(1);
441 if (plane == AOM_PLANE_V) {
442 cost += txb_costs->v_dc_sign_cost[t_sign][dc_sign_ctx][sign];
443 } else {
444 cost += txb_costs->dc_sign_cost[dc_ph_group][dc_sign_ctx][sign];
445 }
446 } else {
447 if (plane == AOM_PLANE_V) {
Joe Youngee165d22024-08-19 10:57:07 -0700448 cost += txb_costs->v_ac_sign_cost[t_sign][sign] - av1_cost_literal(1);
Joe Young19695f32024-08-16 15:12:02 -0700449 }
450 }
451 if (plane > 0) {
452 if (abs_qc > LF_NUM_BASE_LEVELS) {
453 int br_ctx = get_br_ctx_lf_eob_chroma(ci, tx_class);
Joe Younge051e842024-08-28 14:27:48 -0700454 cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost_uv[br_ctx]);
Joe Young19695f32024-08-16 15:12:02 -0700455 }
456 } else {
457 if (abs_qc > LF_NUM_BASE_LEVELS) {
458 int br_ctx = get_br_ctx_lf_eob(ci, tx_class);
Joe Younge051e842024-08-28 14:27:48 -0700459 cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost[br_ctx]);
Joe Young19695f32024-08-16 15:12:02 -0700460 }
461 }
462 } else {
463 if (plane == AOM_PLANE_V) {
Joe Youngee165d22024-08-19 10:57:07 -0700464 cost += txb_costs->v_ac_sign_cost[t_sign][sign] - av1_cost_literal(1);
Joe Young19695f32024-08-16 15:12:02 -0700465 }
466 if (plane > 0) {
467 if (abs_qc > NUM_BASE_LEVELS) {
468 int br_ctx = 0; /* get_br_ctx_eob_chroma */
Joe Younge051e842024-08-28 14:27:48 -0700469 cost += get_br_cost_tcq(abs_qc, txb_costs->lps_cost_uv[br_ctx]);
Joe Young19695f32024-08-16 15:12:02 -0700470 }
471 } else {
472 if (abs_qc > NUM_BASE_LEVELS) {
473 int br_ctx = 0; /* get_br_ctx_eob */
Joe Younge051e842024-08-28 14:27:48 -0700474 cost += get_br_cost_tcq(abs_qc, txb_costs->lps_cost[br_ctx]);
Joe Young19695f32024-08-16 15:12:02 -0700475 }
476 }
477 }
478 return cost;
479}
480
Joe Youngbdde8682024-08-07 09:33:48 -0700481static int get_mid_cost_lf_dc(int ci, tran_low_t abs_qc, int sign,
482 int coeff_ctx, int dc_sign_ctx,
483 const LV_MAP_COEFF_COST *txb_costs,
484 const int32_t *tmp_sign, int plane) {
485 int cost = 0;
486 int mid_ctx = coeff_ctx >> 4;
487 const int dc_ph_group = 0; // PH disabled
488 cost -= av1_cost_literal(1); // Remove previously added sign cost.
489 if (plane == AOM_PLANE_V)
490 cost += txb_costs->v_dc_sign_cost[tmp_sign[ci]][dc_sign_ctx][sign];
491 else
492 cost += txb_costs->dc_sign_cost[dc_ph_group][dc_sign_ctx][sign];
493 if (plane > 0) {
494 if (abs_qc > LF_NUM_BASE_LEVELS) {
Joe Younge051e842024-08-28 14:27:48 -0700495 cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost_uv[mid_ctx]);
Joe Youngbdde8682024-08-07 09:33:48 -0700496 }
497 } else {
498 if (abs_qc > LF_NUM_BASE_LEVELS) {
Joe Younge051e842024-08-28 14:27:48 -0700499 cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost[mid_ctx]);
Joe Youngbdde8682024-08-07 09:33:48 -0700500 }
501 }
502 return cost;
503}
504
505static int get_mid_cost_lf(tran_low_t abs_qc, int coeff_ctx,
506 const LV_MAP_COEFF_COST *txb_costs, int plane) {
507 int cost = 0;
508 int mid_ctx = coeff_ctx >> 4;
509#if 1
510 assert(plane == 0);
511 (void)plane;
512 if (abs_qc > LF_NUM_BASE_LEVELS) {
Joe Younge051e842024-08-28 14:27:48 -0700513 cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost[mid_ctx]);
Joe Youngbdde8682024-08-07 09:33:48 -0700514 }
515#else
516 if (plane > 0) {
517 if (abs_qc > LF_NUM_BASE_LEVELS) {
Joe Younge051e842024-08-28 14:27:48 -0700518 cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost_uv[mid_ctx]);
Joe Youngbdde8682024-08-07 09:33:48 -0700519 }
520 } else {
521 if (abs_qc > LF_NUM_BASE_LEVELS) {
Joe Younge051e842024-08-28 14:27:48 -0700522 cost += get_br_lf_cost_tcq(abs_qc, txb_costs->lps_lf_cost[mid_ctx]);
Joe Youngbdde8682024-08-07 09:33:48 -0700523 }
524 }
525#endif
526 return cost;
527}
528
Joe Young19695f32024-08-16 15:12:02 -0700529void av1_get_rate_dist_def_luma_avx2(const struct LV_MAP_COEFF_COST *txb_costs,
530 const struct prequant_t *pq,
Joe Youngd3ef83a2024-08-23 11:49:06 -0700531 const tcq_coeff_ctx_t *coeff_ctx,
Joe Youngee165d22024-08-19 10:57:07 -0700532 int blk_pos, int bwl, TX_CLASS tx_class,
Joe Young3db806d2024-09-17 09:47:11 -0700533 int diag_ctx, int eob_rate, int n_states,
Joe Youngee165d22024-08-19 10:57:07 -0700534 struct tcq_rate_t *rd) {
535 (void)bwl;
Joe Youngfb9653f2024-08-28 08:35:13 -0700536 const int32_t(*cost_zero)[SIG_COEF_CONTEXTS] = txb_costs->base_cost_zero;
Joe Young19695f32024-08-16 15:12:02 -0700537 const uint16_t(*cost_low_tbl)[SIG_COEF_CONTEXTS][DQ_CTXS][2] =
538 txb_costs->base_cost_low_tbl;
539 const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] =
540 txb_costs->base_eob_cost_tbl;
541 const tran_low_t *absLevel = pq->absLevel;
Joe Young19695f32024-08-16 15:12:02 -0700542
543 // Calc zero coeff costs.
544 __m256i zero = _mm256_setzero_si256();
545 __m256i cost_zero_dq0 =
Joe Youngfb9653f2024-08-28 08:35:13 -0700546 _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]);
Joe Young19695f32024-08-16 15:12:02 -0700547 __m256i cost_zero_dq1 =
Joe Youngfb9653f2024-08-28 08:35:13 -0700548 _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]);
Joe Young19695f32024-08-16 15:12:02 -0700549
Joe Youngd3ef83a2024-08-23 11:49:06 -0700550 __m256i coef_ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef));
551 __m256i ctx16 = _mm256_unpacklo_epi8(coef_ctx, zero);
552 __m256i ctx = _mm256_shuffle_epi32(ctx16, 0xD8);
Joe Young19695f32024-08-16 15:12:02 -0700553 __m256i ctx_dq0 = _mm256_unpacklo_epi16(ctx, zero);
554 __m256i ctx_dq1 = _mm256_unpackhi_epi16(ctx, zero);
555 __m256i ratez_dq0 = _mm256_permutevar8x32_epi32(cost_zero_dq0, ctx_dq0);
556 __m256i ratez_dq1 = _mm256_permutevar8x32_epi32(cost_zero_dq1, ctx_dq1);
557 __m256i ratez_0123 = _mm256_unpacklo_epi64(ratez_dq0, ratez_dq1);
558 _mm_storeu_si128((__m128i *)&rd->rate_zero[0],
559 _mm256_castsi256_si128(ratez_0123));
Joe Young19695f32024-08-16 15:12:02 -0700560 __m256i ratez_4567 = _mm256_unpackhi_epi64(ratez_dq0, ratez_dq1);
561 _mm_storeu_si128((__m128i *)&rd->rate_zero[4],
562 _mm256_castsi256_si128(ratez_4567));
Joe Young19695f32024-08-16 15:12:02 -0700563
564 // Calc coeff_base rate.
Joe Young19695f32024-08-16 15:12:02 -0700565 int idx = AOMMIN(pq->qIdx - 1, 4);
Joe Youngd3ef83a2024-08-23 11:49:06 -0700566 __m128i c_zero = _mm_setzero_si128();
567 __m256i diag = _mm256_set1_epi16(diag_ctx);
568 __m256i base_ctx = _mm256_slli_epi16(ctx16, 12);
569 base_ctx = _mm256_srli_epi16(base_ctx, 12);
570 base_ctx = _mm256_add_epi16(base_ctx, diag);
Joe Young3db806d2024-09-17 09:47:11 -0700571 for (int i = 0; i < (n_states >> 2); i++) {
Joe Youngd3ef83a2024-08-23 11:49:06 -0700572 int ctx0 = _mm256_extract_epi16(base_ctx, 0);
573 int ctx1 = _mm256_extract_epi16(base_ctx, 1);
574 int ctx2 = _mm256_extract_epi16(base_ctx, 2);
575 int ctx3 = _mm256_extract_epi16(base_ctx, 3);
576 base_ctx = _mm256_bsrli_epi128(base_ctx, 8);
Joe Young19695f32024-08-16 15:12:02 -0700577 __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]);
578 __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]);
579 __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]);
580 __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]);
581 __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23);
582 __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67);
583 rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero);
584 rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero);
585 _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123);
586 _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567);
587 }
588
589 // Calc coeff/eob cost.
Joe Youngd3ef83a2024-08-23 11:49:06 -0700590 int eob_ctx = coeff_ctx->coef_eob;
Joe Young19695f32024-08-16 15:12:02 -0700591 __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]);
592 rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero);
593 __m128i rate_eob_position = _mm_set1_epi32(eob_rate);
594 __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position);
595 _mm_storeu_si64(&rd->rate_eob[0], rate_eob);
596
597 // Calc coeff mid and high range cost.
598 if (idx > 0) {
Joe Young3db806d2024-09-17 09:47:11 -0700599 for (int i = 0; i < n_states; i++) {
Joe Young19695f32024-08-16 15:12:02 -0700600 int a0 = i & 2 ? 1 : 0;
601 int a1 = a0 + 2;
Joe Young33564432024-08-23 15:21:54 -0700602 int mid_cost0 = get_mid_cost_def(absLevel[a0], coeff_ctx->coef[i],
603 txb_costs, 0, 0, 0);
604 int mid_cost1 = get_mid_cost_def(absLevel[a1], coeff_ctx->coef[i],
605 txb_costs, 0, 0, 0);
Joe Young19695f32024-08-16 15:12:02 -0700606 rd->rate[2 * i] += mid_cost0;
607 rd->rate[2 * i + 1] += mid_cost1;
608 }
Joe Youngee165d22024-08-19 10:57:07 -0700609 int eob_mid_cost0 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[0], 0, 0,
610 txb_costs, tx_class, 0, 0);
611 int eob_mid_cost1 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[2], 0, 0,
612 txb_costs, tx_class, 0, 0);
Joe Young19695f32024-08-16 15:12:02 -0700613 rd->rate_eob[0] += eob_mid_cost0;
614 rd->rate_eob[1] += eob_mid_cost1;
615 }
616}
617
Joe Young3db806d2024-09-17 09:47:11 -0700618void av1_get_rate_dist_def_luma_st4_avx2(
619 const struct LV_MAP_COEFF_COST *txb_costs, const struct prequant_t *pq,
620 const tcq_coeff_ctx_t *coeff_ctx, int blk_pos, int bwl, TX_CLASS tx_class,
621 int diag_ctx, int eob_rate, int n_states, struct tcq_rate_t *rd) {
622 (void)bwl;
623 assert(n_states == 4);
624 n_states = 4;
625
626 const int32_t(*cost_zero)[SIG_COEF_CONTEXTS] = txb_costs->base_cost_zero;
627 const uint16_t(*cost_low_tbl)[SIG_COEF_CONTEXTS][DQ_CTXS][2] =
628 txb_costs->base_cost_low_tbl;
629 const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] =
630 txb_costs->base_eob_cost_tbl;
631 const tran_low_t *absLevel = pq->absLevel;
632
633 // Calc zero coeff costs.
634 __m256i zero = _mm256_setzero_si256();
635 __m256i cost_zero_dq0 =
636 _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]);
637 __m256i cost_zero_dq1 =
638 _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]);
639
640 __m256i coef_ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef));
641 __m256i ctx16 = _mm256_unpacklo_epi8(coef_ctx, zero);
642 __m256i ctx = _mm256_shuffle_epi32(ctx16, 0xD8);
643 __m256i ctx_dq0 = _mm256_unpacklo_epi16(ctx, zero);
644 __m256i ctx_dq1 = _mm256_unpackhi_epi16(ctx, zero);
645 __m256i ratez_dq0 = _mm256_permutevar8x32_epi32(cost_zero_dq0, ctx_dq0);
646 __m256i ratez_dq1 = _mm256_permutevar8x32_epi32(cost_zero_dq1, ctx_dq1);
647 __m256i ratez_0123 = _mm256_unpacklo_epi64(ratez_dq0, ratez_dq1);
648 _mm_storeu_si128((__m128i *)&rd->rate_zero[0],
649 _mm256_castsi256_si128(ratez_0123));
650
651 // Calc coeff_base rate.
652 int idx = AOMMIN(pq->qIdx - 1, 4);
653 __m128i c_zero = _mm_setzero_si128();
654 __m256i diag = _mm256_set1_epi16(diag_ctx);
655 __m256i base_ctx = _mm256_slli_epi16(ctx16, 12);
656 base_ctx = _mm256_srli_epi16(base_ctx, 12);
657 base_ctx = _mm256_add_epi16(base_ctx, diag);
658 for (int i = 0; i < (n_states >> 2); i++) {
659 int ctx0 = _mm256_extract_epi16(base_ctx, 0);
660 int ctx1 = _mm256_extract_epi16(base_ctx, 1);
661 int ctx2 = _mm256_extract_epi16(base_ctx, 2);
662 int ctx3 = _mm256_extract_epi16(base_ctx, 3);
663 base_ctx = _mm256_bsrli_epi128(base_ctx, 8);
664 __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]);
665 __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]);
666 __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]);
667 __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]);
668 __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23);
669 __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67);
670 rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero);
671 rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero);
672 _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123);
673 _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567);
674 }
675
676 // Calc coeff/eob cost.
677 int eob_ctx = coeff_ctx->coef_eob;
678 __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]);
679 rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero);
680 __m128i rate_eob_position = _mm_set1_epi32(eob_rate);
681 __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position);
682 _mm_storeu_si64(&rd->rate_eob[0], rate_eob);
683
684 // Calc coeff mid and high range cost.
685 if (idx > 0) {
686 for (int i = 0; i < n_states; i++) {
687 int a0 = i & 2 ? 1 : 0;
688 int a1 = a0 + 2;
689 int mid_cost0 = get_mid_cost_def(absLevel[a0], coeff_ctx->coef[i],
690 txb_costs, 0, 0, 0);
691 int mid_cost1 = get_mid_cost_def(absLevel[a1], coeff_ctx->coef[i],
692 txb_costs, 0, 0, 0);
693 rd->rate[2 * i] += mid_cost0;
694 rd->rate[2 * i + 1] += mid_cost1;
695 }
696 int eob_mid_cost0 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[0], 0, 0,
697 txb_costs, tx_class, 0, 0);
698 int eob_mid_cost1 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[2], 0, 0,
699 txb_costs, tx_class, 0, 0);
700 rd->rate_eob[0] += eob_mid_cost0;
701 rd->rate_eob[1] += eob_mid_cost1;
702 }
703}
704
705void av1_calc_lf_ctx_st4_avx2(const struct tcq_lf_ctx_t *lf_ctx, int scan_pos,
706 struct tcq_coeff_ctx_t *coeff_ctx) {
707 int n_states = 4;
Joe Youngaf03d882024-08-13 11:11:36 -0700708
709 int diag = kScanDiag[scan_pos];
710 __m256i zero = _mm256_setzero_si256();
711 __m256i nbr_mask = _mm256_lddqu_si256((__m256i *)kNbrMask[diag]);
712 __m256i base_mask = _mm256_permute2x128_si256(nbr_mask, nbr_mask, 0);
713 __m256i mid_mask = _mm256_permute2x128_si256(nbr_mask, nbr_mask, 0x11);
714
Joe Young3db806d2024-09-17 09:47:11 -0700715 for (int st = 0; st < n_states; st += 4) {
Joe Youngaf03d882024-08-13 11:11:36 -0700716 // Load previously decoded LF context values.
717 __m256i last01 = _mm256_lddqu_si256((__m256i *)&lf_ctx[st]);
718 __m256i last23 = _mm256_lddqu_si256((__m256i *)&lf_ctx[st + 2]);
719
720 // Calc base ctx neighbor sum.
721 __m256i base01 = _mm256_min_epu8(last01, base_mask);
722 __m256i base23 = _mm256_min_epu8(last23, base_mask);
723 __m256i base01_sum = _mm256_sad_epu8(base01, zero);
724 __m256i base23_sum = _mm256_sad_epu8(base23, zero);
725 __m256i base_sum =
726 _mm256_hadd_epi32(base01_sum, base23_sum); // B0 B0 B2 B2 B1 B1 B3 B3
727
728 // Calc mid ctx neighbor sum.
729 __m256i mid01 = _mm256_min_epu8(last01, mid_mask);
730 __m256i mid23 = _mm256_min_epu8(last23, mid_mask);
731 __m256i mid01_sum = _mm256_sad_epu8(mid01, zero);
732 __m256i mid23_sum = _mm256_sad_epu8(mid23, zero);
733 __m256i mid_sum =
734 _mm256_hadd_epi32(mid01_sum, mid23_sum); // M0 M0 M2 M2 M1 M1 M3 M3
735
736 // Context calc; combine and reduce to 8 bits.
737 __m256i base_mid =
738 _mm256_hadd_epi32(base_sum, mid_sum); // B0B2 M0M2 B1B3 M1M3
739 base_mid = _mm256_hadd_epi16(
740 base_mid, zero); // reduce to 16 bits B0B2 M0M2 - - B1B3 M1M3 - -
741 base_mid = _mm256_avg_epu16(base_mid, zero); // x = (x + 1) >> 1
742 base_mid = _mm256_shufflelo_epi16(
743 base_mid, 0xD8); // shuffle B0M0 B2M2 - - B1M1 B3M3 - -
744 base_mid = _mm256_permute4x64_epi64(
745 base_mid, 0xD8); // pack into lower half: B0M0 B2M2 B1M1 B3M3
746 base_mid = _mm256_shuffle_epi32(base_mid, 0xD8); // B0M0 B1M1 B2M2 B3M3
747 __m256i six = _mm256_set1_epi16(6);
748 __m256i mid = _mm256_min_epi16(base_mid, six);
749 __m256i mid_sh4 = _mm256_slli_epi16(mid, 4);
750 __m256i base_max = _mm256_set1_epi16(kMaxCtx[scan_pos]);
751 __m256i base = _mm256_min_epi16(base_mid, base_max);
752 base_mid = _mm256_blend_epi16(base, mid_sh4, 0xAA);
753 __m256i ctx16 = _mm256_hadd_epi16(base_mid, base_mid);
754 __m256i mid_ctx_offset = _mm256_set1_epi16((scan_pos == 0) ? 0 : (7 << 4));
755 ctx16 = _mm256_add_epi16(ctx16, mid_ctx_offset);
756 __m128i ctx8 = _mm256_castsi256_si128(ctx16);
757 ctx8 = _mm_packus_epi16(ctx8, ctx8);
Joe Young3db806d2024-09-17 09:47:11 -0700758#if 1
759 // Older compilers don't implement _mm_storeu_si32()
760 _mm_store_ss((float *)&coeff_ctx->coef[st], _mm_castsi128_ps(ctx8));
761#else
762 _mm_storeu_si32(&coeff_ctx->coef[st], ctx8);
763#endif
Joe Youngaf03d882024-08-13 11:11:36 -0700764 }
765}
766
Joe Young3db806d2024-09-17 09:47:11 -0700767void av1_calc_lf_ctx_st8_avx2(const struct tcq_lf_ctx_t *lf_ctx, int scan_pos,
768 struct tcq_coeff_ctx_t *coeff_ctx) {
769 int n_states = 8;
770
771 int diag = kScanDiag[scan_pos];
772 __m256i zero = _mm256_setzero_si256();
773 __m256i nbr_mask = _mm256_lddqu_si256((__m256i *)kNbrMask[diag]);
774 __m256i base_mask = _mm256_permute2x128_si256(nbr_mask, nbr_mask, 0);
775 __m256i mid_mask = _mm256_permute2x128_si256(nbr_mask, nbr_mask, 0x11);
776
777 for (int st = 0; st < n_states; st += 4) {
778 // Load previously decoded LF context values.
779 __m256i last01 = _mm256_lddqu_si256((__m256i *)&lf_ctx[st]);
780 __m256i last23 = _mm256_lddqu_si256((__m256i *)&lf_ctx[st + 2]);
781
782 // Calc base ctx neighbor sum.
783 __m256i base01 = _mm256_min_epu8(last01, base_mask);
784 __m256i base23 = _mm256_min_epu8(last23, base_mask);
785 __m256i base01_sum = _mm256_sad_epu8(base01, zero);
786 __m256i base23_sum = _mm256_sad_epu8(base23, zero);
787 __m256i base_sum =
788 _mm256_hadd_epi32(base01_sum, base23_sum); // B0 B0 B2 B2 B1 B1 B3 B3
789
790 // Calc mid ctx neighbor sum.
791 __m256i mid01 = _mm256_min_epu8(last01, mid_mask);
792 __m256i mid23 = _mm256_min_epu8(last23, mid_mask);
793 __m256i mid01_sum = _mm256_sad_epu8(mid01, zero);
794 __m256i mid23_sum = _mm256_sad_epu8(mid23, zero);
795 __m256i mid_sum =
796 _mm256_hadd_epi32(mid01_sum, mid23_sum); // M0 M0 M2 M2 M1 M1 M3 M3
797
798 // Context calc; combine and reduce to 8 bits.
799 __m256i base_mid =
800 _mm256_hadd_epi32(base_sum, mid_sum); // B0B2 M0M2 B1B3 M1M3
801 base_mid = _mm256_hadd_epi16(
802 base_mid, zero); // reduce to 16 bits B0B2 M0M2 - - B1B3 M1M3 - -
803 base_mid = _mm256_avg_epu16(base_mid, zero); // x = (x + 1) >> 1
804 base_mid = _mm256_shufflelo_epi16(
805 base_mid, 0xD8); // shuffle B0M0 B2M2 - - B1M1 B3M3 - -
806 base_mid = _mm256_permute4x64_epi64(
807 base_mid, 0xD8); // pack into lower half: B0M0 B2M2 B1M1 B3M3
808 base_mid = _mm256_shuffle_epi32(base_mid, 0xD8); // B0M0 B1M1 B2M2 B3M3
809 __m256i six = _mm256_set1_epi16(6);
810 __m256i mid = _mm256_min_epi16(base_mid, six);
811 __m256i mid_sh4 = _mm256_slli_epi16(mid, 4);
812 __m256i base_max = _mm256_set1_epi16(kMaxCtx[scan_pos]);
813 __m256i base = _mm256_min_epi16(base_mid, base_max);
814 base_mid = _mm256_blend_epi16(base, mid_sh4, 0xAA);
815 __m256i ctx16 = _mm256_hadd_epi16(base_mid, base_mid);
816 __m256i mid_ctx_offset = _mm256_set1_epi16((scan_pos == 0) ? 0 : (7 << 4));
817 ctx16 = _mm256_add_epi16(ctx16, mid_ctx_offset);
818 __m128i ctx8 = _mm256_castsi256_si128(ctx16);
819 ctx8 = _mm_packus_epi16(ctx8, ctx8);
820#if 1
821 // Older compilers don't implement _mm_storeu_si32()
822 _mm_store_ss((float *)&coeff_ctx->coef[st], _mm_castsi128_ps(ctx8));
823#else
824 _mm_storeu_si32(&coeff_ctx->coef[st], ctx8);
825#endif
826 }
827}
828
829void av1_update_lf_ctx_avx2(const struct tcq_node_t *decision, int n_states,
Joe Youngaf03d882024-08-13 11:11:36 -0700830 struct tcq_lf_ctx_t *lf_ctx) {
Joe Young3db806d2024-09-17 09:47:11 -0700831 __m256i c_zero = _mm256_setzero_si256();
832 __m256i upd_last_a = c_zero;
833 __m256i upd_last_b = c_zero;
834 __m256i upd_last_c = c_zero;
835 __m256i upd_last_d = c_zero;
836
837 for (int st = 0; st < n_states; st += 2) {
Joe Youngaf03d882024-08-13 11:11:36 -0700838 int absLevel0 = decision[st].absLevel;
839 int prevId0 = decision[st].prevId;
840 int absLevel1 = decision[st + 1].absLevel;
841 int prevId1 = decision[st + 1].prevId;
842 __m128i upd0 = _mm_setzero_si128();
843 __m128i upd1 = _mm_setzero_si128();
844 if (prevId0 >= 0) {
845 upd0 = _mm_lddqu_si128((__m128i *)lf_ctx[prevId0].last);
846 }
847 if (prevId1 >= 0) {
848 upd1 = _mm_lddqu_si128((__m128i *)lf_ctx[prevId1].last);
849 }
850 upd0 = _mm_slli_si128(upd0, 1);
851 upd1 = _mm_slli_si128(upd1, 1);
852 upd0 = _mm_insert_epi8(upd0, AOMMIN(absLevel0, INT8_MAX), 0);
853 upd1 = _mm_insert_epi8(upd1, AOMMIN(absLevel1, INT8_MAX), 0);
854 __m256i upd01 = _mm256_castsi128_si256(upd0);
855 upd01 = _mm256_inserti128_si256(upd01, upd1, 1);
Joe Youngaf03d882024-08-13 11:11:36 -0700856 upd_last_d = upd_last_c;
857 upd_last_c = upd_last_b;
Joe Youngaf03d882024-08-13 11:11:36 -0700858 upd_last_b = upd_last_a;
859 upd_last_a = upd01;
860 }
Joe Young3db806d2024-09-17 09:47:11 -0700861 if (n_states == 4) {
862 (void)upd_last_d;
863 (void)upd_last_c;
864 _mm256_storeu_si256((__m256i *)lf_ctx[0].last, upd_last_b);
865 _mm256_storeu_si256((__m256i *)lf_ctx[2].last, upd_last_a);
866 } else {
867 _mm256_storeu_si256((__m256i *)lf_ctx[0].last, upd_last_d);
868 _mm256_storeu_si256((__m256i *)lf_ctx[2].last, upd_last_c);
869 _mm256_storeu_si256((__m256i *)lf_ctx[4].last, upd_last_b);
870 _mm256_storeu_si256((__m256i *)lf_ctx[6].last, upd_last_a);
871 }
Joe Youngaf03d882024-08-13 11:11:36 -0700872}
873
Joe Young19695f32024-08-16 15:12:02 -0700874void av1_get_rate_dist_lf_luma_avx2(const struct LV_MAP_COEFF_COST *txb_costs,
875 const struct prequant_t *pq,
Joe Youngd3ef83a2024-08-23 11:49:06 -0700876 const struct tcq_coeff_ctx_t *coeff_ctx,
Joe Youngee165d22024-08-19 10:57:07 -0700877 int blk_pos, int diag_ctx, int eob_rate,
Joe Young3db806d2024-09-17 09:47:11 -0700878 int dc_sign_ctx, const int32_t *tmp_sign,
879 int bwl, TX_CLASS tx_class, int coeff_sign,
880 int n_states, struct tcq_rate_t *rd) {
Joe Younge5046fe2024-08-13 11:11:36 -0700881#define Z -1
882 static const int8_t kShuf[2][32] = {
883 { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
884 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
885 { 0, 8, Z, Z, 1, 9, Z, Z, 2, 10, Z, Z, 3, 11, Z, Z,
886 4, 12, Z, Z, 5, 13, Z, Z, 6, 14, Z, Z, 7, 15, Z, Z }
887 };
Joe Youngfb9653f2024-08-28 08:35:13 -0700888 const uint16_t(*cost_zero)[LF_SIG_COEF_CONTEXTS] =
889 txb_costs->base_lf_cost_zero;
Joe Younge5046fe2024-08-13 11:11:36 -0700890 const uint16_t(*cost_low_tbl)[LF_SIG_COEF_CONTEXTS][DQ_CTXS][2] =
891 txb_costs->base_lf_cost_low_tbl;
Joe Young19695f32024-08-16 15:12:02 -0700892 const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] =
893 txb_costs->base_lf_eob_cost_tbl;
Joe Younge5046fe2024-08-13 11:11:36 -0700894 const tran_low_t *absLevel = pq->absLevel;
Joe Younge5046fe2024-08-13 11:11:36 -0700895 const int plane = 0;
896
Joe Younge5046fe2024-08-13 11:11:36 -0700897 // Calc zero coeff costs.
898 __m256i cost_zero_dq0 =
Joe Youngfb9653f2024-08-28 08:35:13 -0700899 _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]);
Joe Younge5046fe2024-08-13 11:11:36 -0700900 __m256i cost_zero_dq1 =
Joe Youngfb9653f2024-08-28 08:35:13 -0700901 _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]);
Joe Younge5046fe2024-08-13 11:11:36 -0700902 __m256i shuf = _mm256_lddqu_si256((__m256i *)kShuf[0]);
903 cost_zero_dq0 = _mm256_shuffle_epi8(cost_zero_dq0, shuf);
904 cost_zero_dq1 = _mm256_shuffle_epi8(cost_zero_dq1, shuf);
905 __m256i cost_dq0 = _mm256_permute4x64_epi64(cost_zero_dq0, 0xD8);
906 __m256i cost_dq1 = _mm256_permute4x64_epi64(cost_zero_dq1, 0xD8);
Joe Youngd3ef83a2024-08-23 11:49:06 -0700907 __m256i ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef));
Joe Younge5046fe2024-08-13 11:11:36 -0700908 __m256i fifteen = _mm256_set1_epi8(15);
909 __m256i base_ctx = _mm256_and_si256(ctx, fifteen);
Joe Youngd3ef83a2024-08-23 11:49:06 -0700910 __m256i base_ctx1 = _mm256_permute4x64_epi64(base_ctx, 0);
911 __m256i ratez_dq0 = _mm256_shuffle_epi8(cost_dq0, base_ctx1);
912 __m256i ratez_dq1 = _mm256_shuffle_epi8(cost_dq1, base_ctx1);
Joe Younge5046fe2024-08-13 11:11:36 -0700913 __m256i ratez = _mm256_blend_epi16(ratez_dq0, ratez_dq1, 0xAA);
914 ratez = _mm256_permute4x64_epi64(ratez, 0x88);
915 __m256i shuf1 = _mm256_lddqu_si256((__m256i *)kShuf[1]);
916 ratez = _mm256_shuffle_epi8(ratez, shuf1);
Joe Young19695f32024-08-16 15:12:02 -0700917 _mm256_storeu_si256((__m256i *)&rd->rate_zero[0], ratez);
Joe Younge5046fe2024-08-13 11:11:36 -0700918
919 // Calc coeff_base rate.
920 int idx = AOMMIN(pq->qIdx - 1, 8);
Joe Young19695f32024-08-16 15:12:02 -0700921 __m128i c_zero = _mm_setzero_si128();
Joe Youngd3ef83a2024-08-23 11:49:06 -0700922 __m256i diag = _mm256_set1_epi8(diag_ctx);
923 base_ctx = _mm256_add_epi8(base_ctx, diag);
Joe Young3db806d2024-09-17 09:47:11 -0700924 for (int i = 0; i < (n_states >> 2); i++) {
Joe Youngd3ef83a2024-08-23 11:49:06 -0700925 int ctx0 = _mm256_extract_epi8(base_ctx, 0);
926 int ctx1 = _mm256_extract_epi8(base_ctx, 1);
927 int ctx2 = _mm256_extract_epi8(base_ctx, 2);
928 int ctx3 = _mm256_extract_epi8(base_ctx, 3);
929 base_ctx = _mm256_bsrli_epi128(base_ctx, 4);
Joe Younge5046fe2024-08-13 11:11:36 -0700930 __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]);
931 __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]);
932 __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]);
933 __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]);
934 __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23);
935 __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67);
Joe Younge5046fe2024-08-13 11:11:36 -0700936 rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero);
937 rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero);
Joe Young19695f32024-08-16 15:12:02 -0700938 _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123);
939 _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567);
Joe Younge5046fe2024-08-13 11:11:36 -0700940 }
941
Joe Young19695f32024-08-16 15:12:02 -0700942 // Calc coeff/eob cost.
Joe Youngd3ef83a2024-08-23 11:49:06 -0700943 int eob_ctx = coeff_ctx->coef_eob;
Joe Young19695f32024-08-16 15:12:02 -0700944 __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]);
945 rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero);
946 __m128i rate_eob_position = _mm_set1_epi32(eob_rate);
947 __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position);
948 _mm_storeu_si64(&rd->rate_eob[0], rate_eob);
949
Joe Younge5046fe2024-08-13 11:11:36 -0700950 const int row = blk_pos >> bwl;
951 const int col = blk_pos - (row << bwl);
952 const bool dc_2dtx = (blk_pos == 0);
953 const bool dc_hor = (col == 0) && tx_class == TX_CLASS_HORIZ;
954 const bool dc_ver = (row == 0) && tx_class == TX_CLASS_VERT;
955 const bool is_dc_coeff = dc_2dtx || dc_hor || dc_ver;
956 if (is_dc_coeff) {
Joe Young3db806d2024-09-17 09:47:11 -0700957 for (int i = 0; i < n_states; i++) {
Joe Younge5046fe2024-08-13 11:11:36 -0700958 int a0 = i & 2 ? 1 : 0;
959 int a1 = a0 + 2;
Joe Young33564432024-08-23 15:21:54 -0700960 int mid_cost0 = get_mid_cost_lf_dc(blk_pos, absLevel[a0], coeff_sign,
961 coeff_ctx->coef[i], dc_sign_ctx,
962 txb_costs, tmp_sign, plane);
963 int mid_cost1 = get_mid_cost_lf_dc(blk_pos, absLevel[a1], coeff_sign,
964 coeff_ctx->coef[i], dc_sign_ctx,
965 txb_costs, tmp_sign, plane);
Joe Young19695f32024-08-16 15:12:02 -0700966 rd->rate[2 * i] += mid_cost0;
967 rd->rate[2 * i + 1] += mid_cost1;
Joe Younge5046fe2024-08-13 11:11:36 -0700968 }
Joe Young19695f32024-08-16 15:12:02 -0700969 int t_sign = tmp_sign[blk_pos];
Joe Youngee165d22024-08-19 10:57:07 -0700970 int eob_mid_cost0 =
971 get_mid_cost_eob(blk_pos, 1, 1, absLevel[0], coeff_sign, dc_sign_ctx,
972 txb_costs, tx_class, t_sign, 0);
973 int eob_mid_cost1 =
974 get_mid_cost_eob(blk_pos, 1, 1, absLevel[2], coeff_sign, dc_sign_ctx,
975 txb_costs, tx_class, t_sign, 0);
Joe Young19695f32024-08-16 15:12:02 -0700976 rd->rate_eob[0] += eob_mid_cost0;
977 rd->rate_eob[1] += eob_mid_cost1;
Joe Younge5046fe2024-08-13 11:11:36 -0700978 } else if (idx > 4) {
Joe Young3db806d2024-09-17 09:47:11 -0700979 for (int i = 0; i < n_states; i++) {
980 int a0 = i & 2 ? 1 : 0;
981 int a1 = a0 + 2;
982 int mid_cost0 =
983 get_mid_cost_lf(absLevel[a0], coeff_ctx->coef[i], txb_costs, plane);
984 int mid_cost1 =
985 get_mid_cost_lf(absLevel[a1], coeff_ctx->coef[i], txb_costs, plane);
986 rd->rate[2 * i] += mid_cost0;
987 rd->rate[2 * i + 1] += mid_cost1;
988 }
989 int t_sign = tmp_sign[blk_pos];
990 int eob_mid_cost0 =
991 get_mid_cost_eob(blk_pos, 1, 0, absLevel[0], coeff_sign, dc_sign_ctx,
992 txb_costs, tx_class, t_sign, 0);
993 int eob_mid_cost1 =
994 get_mid_cost_eob(blk_pos, 1, 0, absLevel[2], coeff_sign, dc_sign_ctx,
995 txb_costs, tx_class, t_sign, 0);
996 rd->rate_eob[0] += eob_mid_cost0;
997 rd->rate_eob[1] += eob_mid_cost1;
998 }
999}
1000
1001void av1_get_rate_dist_lf_luma_st4_avx2(
1002 const struct LV_MAP_COEFF_COST *txb_costs, const struct prequant_t *pq,
1003 const struct tcq_coeff_ctx_t *coeff_ctx, int blk_pos, int diag_ctx,
1004 int eob_rate, int dc_sign_ctx, const int32_t *tmp_sign, int bwl,
1005 TX_CLASS tx_class, int coeff_sign, int n_states, struct tcq_rate_t *rd) {
1006 assert(n_states == 4);
1007 n_states = 4;
1008#define Z -1
1009 static const int8_t kShuf[2][32] = {
1010 { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
1011 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
1012 { 0, 8, Z, Z, 1, 9, Z, Z, 2, 10, Z, Z, 3, 11, Z, Z,
1013 4, 12, Z, Z, 5, 13, Z, Z, 6, 14, Z, Z, 7, 15, Z, Z }
1014 };
1015 const uint16_t(*cost_zero)[LF_SIG_COEF_CONTEXTS] =
1016 txb_costs->base_lf_cost_zero;
1017 const uint16_t(*cost_low_tbl)[LF_SIG_COEF_CONTEXTS][DQ_CTXS][2] =
1018 txb_costs->base_lf_cost_low_tbl;
1019 const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] =
1020 txb_costs->base_lf_eob_cost_tbl;
1021 const tran_low_t *absLevel = pq->absLevel;
1022 const int plane = 0;
1023
1024 // Calc zero coeff costs.
1025 __m256i cost_zero_dq0 =
1026 _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]);
1027 __m256i cost_zero_dq1 =
1028 _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]);
1029 __m256i shuf = _mm256_lddqu_si256((__m256i *)kShuf[0]);
1030 cost_zero_dq0 = _mm256_shuffle_epi8(cost_zero_dq0, shuf);
1031 cost_zero_dq1 = _mm256_shuffle_epi8(cost_zero_dq1, shuf);
1032 __m256i cost_dq0 = _mm256_permute4x64_epi64(cost_zero_dq0, 0xD8);
1033 __m256i cost_dq1 = _mm256_permute4x64_epi64(cost_zero_dq1, 0xD8);
1034 __m256i ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef));
1035 __m256i fifteen = _mm256_set1_epi8(15);
1036 __m256i base_ctx = _mm256_and_si256(ctx, fifteen);
1037 __m256i base_ctx1 = _mm256_permute4x64_epi64(base_ctx, 0);
1038 __m256i ratez_dq0 = _mm256_shuffle_epi8(cost_dq0, base_ctx1);
1039 __m256i ratez_dq1 = _mm256_shuffle_epi8(cost_dq1, base_ctx1);
1040 __m256i ratez = _mm256_blend_epi16(ratez_dq0, ratez_dq1, 0xAA);
1041 ratez = _mm256_permute4x64_epi64(ratez, 0x88);
1042 __m256i shuf1 = _mm256_lddqu_si256((__m256i *)kShuf[1]);
1043 ratez = _mm256_shuffle_epi8(ratez, shuf1);
1044 _mm256_storeu_si256((__m256i *)&rd->rate_zero[0], ratez);
1045
1046 // Calc coeff_base rate.
1047 int idx = AOMMIN(pq->qIdx - 1, 8);
1048 __m128i c_zero = _mm_setzero_si128();
1049 __m256i diag = _mm256_set1_epi8(diag_ctx);
1050 base_ctx = _mm256_add_epi8(base_ctx, diag);
1051 for (int i = 0; i < (n_states >> 2); i++) {
1052 int ctx0 = _mm256_extract_epi8(base_ctx, 0);
1053 int ctx1 = _mm256_extract_epi8(base_ctx, 1);
1054 int ctx2 = _mm256_extract_epi8(base_ctx, 2);
1055 int ctx3 = _mm256_extract_epi8(base_ctx, 3);
1056 base_ctx = _mm256_bsrli_epi128(base_ctx, 4);
1057 __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]);
1058 __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]);
1059 __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]);
1060 __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]);
1061 __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23);
1062 __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67);
1063 rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero);
1064 rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero);
1065 _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123);
1066 _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567);
1067 }
1068
1069 // Calc coeff/eob cost.
1070 int eob_ctx = coeff_ctx->coef_eob;
1071 __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]);
1072 rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero);
1073 __m128i rate_eob_position = _mm_set1_epi32(eob_rate);
1074 __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position);
1075 _mm_storeu_si64(&rd->rate_eob[0], rate_eob);
1076
1077 const int row = blk_pos >> bwl;
1078 const int col = blk_pos - (row << bwl);
1079 const bool dc_2dtx = (blk_pos == 0);
1080 const bool dc_hor = (col == 0) && tx_class == TX_CLASS_HORIZ;
1081 const bool dc_ver = (row == 0) && tx_class == TX_CLASS_VERT;
1082 const bool is_dc_coeff = dc_2dtx || dc_hor || dc_ver;
1083 if (is_dc_coeff) {
1084 for (int i = 0; i < n_states; i++) {
1085 int a0 = i & 2 ? 1 : 0;
1086 int a1 = a0 + 2;
1087 int mid_cost0 = get_mid_cost_lf_dc(blk_pos, absLevel[a0], coeff_sign,
1088 coeff_ctx->coef[i], dc_sign_ctx,
1089 txb_costs, tmp_sign, plane);
1090 int mid_cost1 = get_mid_cost_lf_dc(blk_pos, absLevel[a1], coeff_sign,
1091 coeff_ctx->coef[i], dc_sign_ctx,
1092 txb_costs, tmp_sign, plane);
1093 rd->rate[2 * i] += mid_cost0;
1094 rd->rate[2 * i + 1] += mid_cost1;
1095 }
1096 int t_sign = tmp_sign[blk_pos];
1097 int eob_mid_cost0 =
1098 get_mid_cost_eob(blk_pos, 1, 1, absLevel[0], coeff_sign, dc_sign_ctx,
1099 txb_costs, tx_class, t_sign, 0);
1100 int eob_mid_cost1 =
1101 get_mid_cost_eob(blk_pos, 1, 1, absLevel[2], coeff_sign, dc_sign_ctx,
1102 txb_costs, tx_class, t_sign, 0);
1103 rd->rate_eob[0] += eob_mid_cost0;
1104 rd->rate_eob[1] += eob_mid_cost1;
1105 } else if (idx > 4) {
1106 for (int i = 0; i < n_states; i++) {
Joe Younge5046fe2024-08-13 11:11:36 -07001107 int a0 = i & 2 ? 1 : 0;
1108 int a1 = a0 + 2;
1109 int mid_cost0 =
Joe Youngd3ef83a2024-08-23 11:49:06 -07001110 get_mid_cost_lf(absLevel[a0], coeff_ctx->coef[i], txb_costs, plane);
Joe Younge5046fe2024-08-13 11:11:36 -07001111 int mid_cost1 =
Joe Youngd3ef83a2024-08-23 11:49:06 -07001112 get_mid_cost_lf(absLevel[a1], coeff_ctx->coef[i], txb_costs, plane);
Joe Young19695f32024-08-16 15:12:02 -07001113 rd->rate[2 * i] += mid_cost0;
1114 rd->rate[2 * i + 1] += mid_cost1;
Joe Younge5046fe2024-08-13 11:11:36 -07001115 }
Joe Young19695f32024-08-16 15:12:02 -07001116 int t_sign = tmp_sign[blk_pos];
Joe Youngee165d22024-08-19 10:57:07 -07001117 int eob_mid_cost0 =
1118 get_mid_cost_eob(blk_pos, 1, 0, absLevel[0], coeff_sign, dc_sign_ctx,
1119 txb_costs, tx_class, t_sign, 0);
1120 int eob_mid_cost1 =
1121 get_mid_cost_eob(blk_pos, 1, 0, absLevel[2], coeff_sign, dc_sign_ctx,
1122 txb_costs, tx_class, t_sign, 0);
Joe Young19695f32024-08-16 15:12:02 -07001123 rd->rate_eob[0] += eob_mid_cost0;
1124 rd->rate_eob[1] += eob_mid_cost1;
Joe Younge5046fe2024-08-13 11:11:36 -07001125 }
1126}
1127
Joe Young19695f32024-08-16 15:12:02 -07001128void av1_get_rate_dist_lf_chroma_avx2(const struct LV_MAP_COEFF_COST *txb_costs,
1129 const struct prequant_t *pq,
Joe Youngd3ef83a2024-08-23 11:49:06 -07001130 const struct tcq_coeff_ctx_t *coeff_ctx,
Joe Youngee165d22024-08-19 10:57:07 -07001131 int blk_pos, int diag_ctx, int eob_rate,
Joe Young3db806d2024-09-17 09:47:11 -07001132 int dc_sign_ctx, const int32_t *tmp_sign,
Joe Young19695f32024-08-16 15:12:02 -07001133 int bwl, TX_CLASS tx_class, int plane,
Joe Young3db806d2024-09-17 09:47:11 -07001134 int coeff_sign, int n_states,
1135 struct tcq_rate_t *rd) {
Joe Youngee165d22024-08-19 10:57:07 -07001136 (void)bwl;
Joe Youngbdde8682024-08-07 09:33:48 -07001137#define Z -1
1138 static const int8_t kShuf[2][32] = {
1139 { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
1140 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
1141 { 0, 8, Z, Z, 1, 9, Z, Z, 2, 10, Z, Z, 3, 11, Z, Z,
1142 4, 12, Z, Z, 5, 13, Z, Z, 6, 14, Z, Z, 7, 15, Z, Z }
1143 };
Joe Youngfb9653f2024-08-28 08:35:13 -07001144 const uint16_t(*cost_zero)[LF_SIG_COEF_CONTEXTS] =
1145 plane ? txb_costs->base_lf_cost_uv_zero : txb_costs->base_lf_cost_zero;
Joe Youngbdde8682024-08-07 09:33:48 -07001146 const uint16_t(*cost_low_tbl)[LF_SIG_COEF_CONTEXTS][DQ_CTXS][2] =
1147 plane ? txb_costs->base_lf_cost_uv_low_tbl
1148 : txb_costs->base_lf_cost_low_tbl;
Joe Young19695f32024-08-16 15:12:02 -07001149 const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] =
1150 txb_costs->base_lf_eob_cost_uv_tbl;
Joe Youngbdde8682024-08-07 09:33:48 -07001151 const tran_low_t *absLevel = pq->absLevel;
Joe Youngbdde8682024-08-07 09:33:48 -07001152
1153 // Calc zero coeff costs.
1154 __m256i cost_zero_dq0 =
Joe Youngfb9653f2024-08-28 08:35:13 -07001155 _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]);
Joe Youngbdde8682024-08-07 09:33:48 -07001156 __m256i cost_zero_dq1 =
Joe Youngfb9653f2024-08-28 08:35:13 -07001157 _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]);
Joe Youngbdde8682024-08-07 09:33:48 -07001158 __m256i shuf = _mm256_lddqu_si256((__m256i *)kShuf[0]);
1159 cost_zero_dq0 = _mm256_shuffle_epi8(cost_zero_dq0, shuf);
1160 cost_zero_dq1 = _mm256_shuffle_epi8(cost_zero_dq1, shuf);
1161 __m256i cost_dq0 = _mm256_permute4x64_epi64(cost_zero_dq0, 0xD8);
1162 __m256i cost_dq1 = _mm256_permute4x64_epi64(cost_zero_dq1, 0xD8);
Joe Youngd3ef83a2024-08-23 11:49:06 -07001163 __m256i ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef));
Joe Youngbdde8682024-08-07 09:33:48 -07001164 __m256i fifteen = _mm256_set1_epi8(15);
1165 __m256i base_ctx = _mm256_and_si256(ctx, fifteen);
Joe Youngd3ef83a2024-08-23 11:49:06 -07001166 __m256i base_ctx1 = _mm256_permute4x64_epi64(base_ctx, 0);
1167 __m256i ratez_dq0 = _mm256_shuffle_epi8(cost_dq0, base_ctx1);
1168 __m256i ratez_dq1 = _mm256_shuffle_epi8(cost_dq1, base_ctx1);
Joe Youngbdde8682024-08-07 09:33:48 -07001169 __m256i ratez = _mm256_blend_epi16(ratez_dq0, ratez_dq1, 0xAA);
1170 ratez = _mm256_permute4x64_epi64(ratez, 0x88);
1171 __m256i shuf1 = _mm256_lddqu_si256((__m256i *)kShuf[1]);
1172 ratez = _mm256_shuffle_epi8(ratez, shuf1);
Joe Young19695f32024-08-16 15:12:02 -07001173 _mm256_storeu_si256((__m256i *)&rd->rate_zero[0], ratez);
Joe Youngbdde8682024-08-07 09:33:48 -07001174
1175 // Calc coeff_base rate.
1176 int idx = AOMMIN(pq->qIdx - 1, 8);
Joe Young19695f32024-08-16 15:12:02 -07001177 __m128i c_zero = _mm_setzero_si128();
Joe Youngd3ef83a2024-08-23 11:49:06 -07001178 __m256i diag = _mm256_set1_epi8(diag_ctx);
1179 base_ctx = _mm256_add_epi8(base_ctx, diag);
Joe Young3db806d2024-09-17 09:47:11 -07001180 for (int i = 0; i < (n_states >> 2); i++) {
Joe Youngd3ef83a2024-08-23 11:49:06 -07001181 int ctx0 = _mm256_extract_epi8(base_ctx, 0);
1182 int ctx1 = _mm256_extract_epi8(base_ctx, 1);
1183 int ctx2 = _mm256_extract_epi8(base_ctx, 2);
1184 int ctx3 = _mm256_extract_epi8(base_ctx, 3);
1185 base_ctx = _mm256_bsrli_epi128(base_ctx, 4);
Joe Youngbdde8682024-08-07 09:33:48 -07001186 __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]);
1187 __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]);
1188 __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]);
1189 __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]);
1190 __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23);
1191 __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67);
Joe Youngbdde8682024-08-07 09:33:48 -07001192 rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero);
1193 rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero);
Joe Young19695f32024-08-16 15:12:02 -07001194 _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123);
1195 _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567);
Joe Youngbdde8682024-08-07 09:33:48 -07001196 }
1197
Joe Young19695f32024-08-16 15:12:02 -07001198 // Calc coeff/eob cost.
Joe Youngd3ef83a2024-08-23 11:49:06 -07001199 int eob_ctx = coeff_ctx->coef_eob;
Joe Young19695f32024-08-16 15:12:02 -07001200 __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]);
1201 rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero);
1202 __m128i rate_eob_position = _mm_set1_epi32(eob_rate);
1203 __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position);
1204 _mm_storeu_si64(&rd->rate_eob[0], rate_eob);
1205
1206 // Chroma LF region consists of only DC coeffs.
1207#if 1
1208 const int is_dc_coeff = 1;
1209#else
Joe Youngbdde8682024-08-07 09:33:48 -07001210 const int row = blk_pos >> bwl;
1211 const int col = blk_pos - (row << bwl);
1212 const bool dc_2dtx = (blk_pos == 0);
1213 const bool dc_hor = (col == 0) && tx_class == TX_CLASS_HORIZ;
1214 const bool dc_ver = (row == 0) && tx_class == TX_CLASS_VERT;
1215 const bool is_dc_coeff = dc_2dtx || dc_hor || dc_ver;
Joe Young19695f32024-08-16 15:12:02 -07001216#endif
Joe Youngbdde8682024-08-07 09:33:48 -07001217 if (is_dc_coeff) {
Joe Young3db806d2024-09-17 09:47:11 -07001218 for (int i = 0; i < n_states; i++) {
Joe Youngbdde8682024-08-07 09:33:48 -07001219 int a0 = i & 2 ? 1 : 0;
1220 int a1 = a0 + 2;
Joe Young33564432024-08-23 15:21:54 -07001221 int mid_cost0 = get_mid_cost_lf_dc(blk_pos, absLevel[a0], coeff_sign,
1222 coeff_ctx->coef[i], dc_sign_ctx,
1223 txb_costs, tmp_sign, plane);
1224 int mid_cost1 = get_mid_cost_lf_dc(blk_pos, absLevel[a1], coeff_sign,
1225 coeff_ctx->coef[i], dc_sign_ctx,
1226 txb_costs, tmp_sign, plane);
Joe Young19695f32024-08-16 15:12:02 -07001227 rd->rate[2 * i] += mid_cost0;
1228 rd->rate[2 * i + 1] += mid_cost1;
Joe Youngbdde8682024-08-07 09:33:48 -07001229 }
Joe Young19695f32024-08-16 15:12:02 -07001230 int t_sign = tmp_sign[blk_pos];
Joe Youngee165d22024-08-19 10:57:07 -07001231 int eob_mid_cost0 =
1232 get_mid_cost_eob(blk_pos, 1, 1, absLevel[0], coeff_sign, dc_sign_ctx,
1233 txb_costs, tx_class, t_sign, plane);
1234 int eob_mid_cost1 =
1235 get_mid_cost_eob(blk_pos, 1, 1, absLevel[2], coeff_sign, dc_sign_ctx,
1236 txb_costs, tx_class, t_sign, plane);
Joe Young19695f32024-08-16 15:12:02 -07001237 rd->rate_eob[0] += eob_mid_cost0;
1238 rd->rate_eob[1] += eob_mid_cost1;
Joe Youngbdde8682024-08-07 09:33:48 -07001239 } else if (idx > 4) {
Joe Young3db806d2024-09-17 09:47:11 -07001240 for (int i = 0; i < n_states; i++) {
Joe Youngbdde8682024-08-07 09:33:48 -07001241 int a0 = i & 2 ? 1 : 0;
1242 int a1 = a0 + 2;
1243 int mid_cost0 =
Joe Youngd3ef83a2024-08-23 11:49:06 -07001244 get_mid_cost_lf(absLevel[a0], coeff_ctx->coef[i], txb_costs, plane);
Joe Youngbdde8682024-08-07 09:33:48 -07001245 int mid_cost1 =
Joe Youngd3ef83a2024-08-23 11:49:06 -07001246 get_mid_cost_lf(absLevel[a1], coeff_ctx->coef[i], txb_costs, plane);
Joe Young19695f32024-08-16 15:12:02 -07001247 rd->rate[2 * i] += mid_cost0;
1248 rd->rate[2 * i + 1] += mid_cost1;
Joe Youngbdde8682024-08-07 09:33:48 -07001249 }
Joe Young19695f32024-08-16 15:12:02 -07001250 int t_sign = tmp_sign[blk_pos];
Joe Youngee165d22024-08-19 10:57:07 -07001251 int eob_mid_cost0 =
1252 get_mid_cost_eob(blk_pos, 1, 0, absLevel[0], coeff_sign, dc_sign_ctx,
1253 txb_costs, tx_class, t_sign, plane);
1254 int eob_mid_cost1 =
1255 get_mid_cost_eob(blk_pos, 1, 0, absLevel[2], coeff_sign, dc_sign_ctx,
1256 txb_costs, tx_class, t_sign, plane);
Joe Young19695f32024-08-16 15:12:02 -07001257 rd->rate_eob[0] += eob_mid_cost0;
1258 rd->rate_eob[1] += eob_mid_cost1;
Joe Young463ba7f2024-06-25 14:27:02 -07001259 }
1260}
1261
Joe Youngee165d22024-08-19 10:57:07 -07001262void av1_get_rate_dist_def_chroma_avx2(
1263 const struct LV_MAP_COEFF_COST *txb_costs, const struct prequant_t *pq,
Joe Youngd3ef83a2024-08-23 11:49:06 -07001264 const struct tcq_coeff_ctx_t *coeff_ctx, int blk_pos, int bwl,
Joe Youngee165d22024-08-19 10:57:07 -07001265 TX_CLASS tx_class, int diag_ctx, int eob_rate, int plane, int t_sign,
Joe Young3db806d2024-09-17 09:47:11 -07001266 int sign, int n_states, struct tcq_rate_t *rd) {
Joe Youngee165d22024-08-19 10:57:07 -07001267 (void)bwl;
Joe Youngfb9653f2024-08-28 08:35:13 -07001268 const int32_t(*cost_zero)[SIG_COEF_CONTEXTS] = txb_costs->base_cost_uv_zero;
Joe Youngbdde8682024-08-07 09:33:48 -07001269 const uint16_t(*cost_low_tbl)[SIG_COEF_CONTEXTS][DQ_CTXS][2] =
1270 txb_costs->base_cost_uv_low_tbl;
Joe Young19695f32024-08-16 15:12:02 -07001271 const uint16_t(*cost_eob_tbl)[SIG_COEF_CONTEXTS_EOB][2] =
1272 txb_costs->base_eob_cost_uv_tbl;
Joe Youngbdde8682024-08-07 09:33:48 -07001273 const tran_low_t *absLevel = pq->absLevel;
Joe Youngbdde8682024-08-07 09:33:48 -07001274
1275 // Calc zero coeff costs.
1276 __m256i zero = _mm256_setzero_si256();
1277 __m256i cost_zero_dq0 =
Joe Youngfb9653f2024-08-28 08:35:13 -07001278 _mm256_lddqu_si256((__m256i *)&cost_zero[0][diag_ctx]);
Joe Youngbdde8682024-08-07 09:33:48 -07001279 __m256i cost_zero_dq1 =
Joe Youngfb9653f2024-08-28 08:35:13 -07001280 _mm256_lddqu_si256((__m256i *)&cost_zero[1][diag_ctx]);
Joe Youngd3ef83a2024-08-23 11:49:06 -07001281 __m256i ctx = _mm256_castsi128_si256(_mm_loadu_si64(&coeff_ctx->coef));
1282 __m256i ctx16 = _mm256_unpacklo_epi8(ctx, zero);
1283 __m256i ctx16sh = _mm256_shuffle_epi32(ctx16, 0xD8);
1284 __m256i ctx_dq0 = _mm256_unpacklo_epi16(ctx16sh, zero);
1285 __m256i ctx_dq1 = _mm256_unpackhi_epi16(ctx16sh, zero);
Joe Youngbdde8682024-08-07 09:33:48 -07001286 __m256i ratez_dq0 = _mm256_permutevar8x32_epi32(cost_zero_dq0, ctx_dq0);
1287 __m256i ratez_dq1 = _mm256_permutevar8x32_epi32(cost_zero_dq1, ctx_dq1);
1288 __m256i ratez_0123 = _mm256_unpacklo_epi64(ratez_dq0, ratez_dq1);
Joe Young19695f32024-08-16 15:12:02 -07001289 _mm_storeu_si128((__m128i *)&rd->rate_zero[0],
Joe Youngbdde8682024-08-07 09:33:48 -07001290 _mm256_castsi256_si128(ratez_0123));
Joe Youngbdde8682024-08-07 09:33:48 -07001291 __m256i ratez_4567 = _mm256_unpackhi_epi64(ratez_dq0, ratez_dq1);
Joe Young19695f32024-08-16 15:12:02 -07001292 _mm_storeu_si128((__m128i *)&rd->rate_zero[4],
Joe Youngbdde8682024-08-07 09:33:48 -07001293 _mm256_castsi256_si128(ratez_4567));
Joe Youngbdde8682024-08-07 09:33:48 -07001294
1295 // Calc coeff_base rate.
1296 int idx = AOMMIN(pq->qIdx - 1, 4);
Joe Young19695f32024-08-16 15:12:02 -07001297 __m128i c_zero = _mm_setzero_si128();
Joe Youngd3ef83a2024-08-23 11:49:06 -07001298 __m256i diag = _mm256_set1_epi16(diag_ctx);
1299 __m256i base_ctx = _mm256_slli_epi16(ctx16, 12);
1300 base_ctx = _mm256_srli_epi16(base_ctx, 12);
1301 base_ctx = _mm256_add_epi16(base_ctx, diag);
Joe Young3db806d2024-09-17 09:47:11 -07001302 for (int i = 0; i < (n_states >> 2); i++) {
Joe Youngd3ef83a2024-08-23 11:49:06 -07001303 int ctx0 = _mm256_extract_epi16(base_ctx, 0);
1304 int ctx1 = _mm256_extract_epi16(base_ctx, 1);
1305 int ctx2 = _mm256_extract_epi16(base_ctx, 2);
1306 int ctx3 = _mm256_extract_epi16(base_ctx, 3);
1307 base_ctx = _mm256_bsrli_epi128(base_ctx, 8);
Joe Youngbdde8682024-08-07 09:33:48 -07001308 __m128i rate_01 = _mm_loadu_si64(&cost_low_tbl[idx][ctx0][0]);
1309 __m128i rate_23 = _mm_loadu_si64(&cost_low_tbl[idx][ctx1][0]);
1310 __m128i rate_45 = _mm_loadu_si64(&cost_low_tbl[idx][ctx2][1]);
1311 __m128i rate_67 = _mm_loadu_si64(&cost_low_tbl[idx][ctx3][1]);
1312 __m128i rate_0123 = _mm_unpacklo_epi32(rate_01, rate_23);
1313 __m128i rate_4567 = _mm_unpacklo_epi32(rate_45, rate_67);
Joe Youngbdde8682024-08-07 09:33:48 -07001314 rate_0123 = _mm_unpacklo_epi16(rate_0123, c_zero);
1315 rate_4567 = _mm_unpacklo_epi16(rate_4567, c_zero);
Joe Young19695f32024-08-16 15:12:02 -07001316 _mm_storeu_si128((__m128i *)&rd->rate[8 * i], rate_0123);
1317 _mm_storeu_si128((__m128i *)&rd->rate[8 * i + 4], rate_4567);
Joe Youngbdde8682024-08-07 09:33:48 -07001318 }
1319
Joe Young19695f32024-08-16 15:12:02 -07001320 // Calc coeff/eob cost.
Joe Youngd3ef83a2024-08-23 11:49:06 -07001321 int eob_ctx = coeff_ctx->coef_eob;
Joe Young19695f32024-08-16 15:12:02 -07001322 __m128i rate_eob_coef = _mm_loadu_si64(&cost_eob_tbl[idx][eob_ctx][0]);
1323 rate_eob_coef = _mm_unpacklo_epi16(rate_eob_coef, c_zero);
1324 __m128i rate_eob_position = _mm_set1_epi32(eob_rate);
1325 __m128i rate_eob = _mm_add_epi32(rate_eob_coef, rate_eob_position);
1326 _mm_storeu_si64(&rd->rate_eob[0], rate_eob);
1327
Joe Youngbdde8682024-08-07 09:33:48 -07001328 // Calc coeff mid and high range cost.
1329 if (idx > 0 || plane) {
Joe Young3db806d2024-09-17 09:47:11 -07001330 for (int i = 0; i < n_states; i++) {
Joe Youngbdde8682024-08-07 09:33:48 -07001331 int a0 = i & 2 ? 1 : 0;
1332 int a1 = a0 + 2;
Joe Young33564432024-08-23 15:21:54 -07001333 int mid_cost0 = get_mid_cost_def(absLevel[a0], coeff_ctx->coef[i],
1334 txb_costs, plane, t_sign, sign);
1335 int mid_cost1 = get_mid_cost_def(absLevel[a1], coeff_ctx->coef[i],
1336 txb_costs, plane, t_sign, sign);
Joe Young19695f32024-08-16 15:12:02 -07001337 rd->rate[2 * i] += mid_cost0;
1338 rd->rate[2 * i + 1] += mid_cost1;
Joe Youngbdde8682024-08-07 09:33:48 -07001339 }
Joe Youngee165d22024-08-19 10:57:07 -07001340 int eob_mid_cost0 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[0], sign, 0,
1341 txb_costs, tx_class, t_sign, plane);
1342 int eob_mid_cost1 = get_mid_cost_eob(blk_pos, 0, 0, absLevel[2], sign, 0,
1343 txb_costs, tx_class, t_sign, plane);
Joe Young19695f32024-08-16 15:12:02 -07001344 rd->rate_eob[0] += eob_mid_cost0;
1345 rd->rate_eob[1] += eob_mid_cost1;
Joe Youngbdde8682024-08-07 09:33:48 -07001346 }
1347}
Joe Youngaf03d882024-08-13 11:11:36 -07001348
1349void av1_init_lf_ctx_avx2(const uint8_t *lev, int scan_hi, int bwl,
1350 struct tcq_lf_ctx_t *lf_ctx) {
1351 // Sample offsets (row/col) in and around the LF region used for ctx calc.
1352 const uint8_t diag_scan[21] = { 0x00, 0x10, 0x01, 0x20, 0x11, 0x02, 0x30,
1353 0x21, 0x12, 0x03, 0x40, 0x31, 0x22, 0x13,
1354 0x04, 0x50, 0x41, 0x32, 0x23, 0x14, 0x05 };
1355 const int8_t kShuf[16] = { 8, 6, 4, 2, 0, 11, 9, 7,
1356 5, 3, 1, -1, -1, -1, -1, -1 };
1357 __m128i zero = _mm_setzero_si128();
1358
1359 int eob_inside_lf_region = scan_hi < MAX_LF_SCAN - 1;
1360 if (eob_inside_lf_region) {
1361 // Retrive the EOB value and store in LF ctx.
1362 int row_col = diag_scan[scan_hi + 1];
1363 int row = row_col >> 4;
1364 int col = row_col & 15;
1365 int blk_pos = (row << bwl) + col;
1366 uint8_t lev0 = lev[get_padded_idx(blk_pos, bwl)];
1367 __m128i last = _mm_insert_epi8(zero, lev0, 0);
1368 _mm_storeu_si128((__m128i *)lf_ctx->last, last);
1369 } else {
1370 // Retrieve samples in the two diagonals bordering LF region.
1371 int offset = (1 << bwl) + TX_PAD_HOR - 1;
1372 const uint8_t *p = lev + 4;
1373 __m128i row0 = _mm_loadu_si64(p);
1374 __m128i row1 = _mm_loadu_si64(p + offset);
1375 __m128i row2 = _mm_loadu_si64(p + 2 * offset);
1376 __m128i row3 = _mm_loadu_si64(p + 3 * offset);
1377 __m128i row4 = _mm_loadu_si64(p + 4 * offset);
1378 __m128i row5 = _mm_loadu_si64(p + 5 * offset);
1379 __m128i row01 = _mm_unpacklo_epi16(row0, row1);
1380 __m128i row23 = _mm_unpacklo_epi16(row2, row3);
1381 __m128i row45 = _mm_unpacklo_epi16(row4, row5);
1382 __m128i row0123 = _mm_unpacklo_epi32(row01, row23);
1383 __m128i row012345 = _mm_unpacklo_epi64(row0123, row45);
1384 __m128i shuf = _mm_lddqu_si128((__m128i *)kShuf);
1385 __m128i last = _mm_shuffle_epi8(row012345, shuf);
1386 _mm_storeu_si128((__m128i *)lf_ctx->last, last);
1387 }
1388}
Joe Younga91384f2024-08-20 13:07:18 -07001389
1390// Pre-calculate eob bits (rate) for each EOB candidate position from 1
1391// to the initial eob location. Store rate in array block_eob_rate[],
1392// starting with index.
Joe Young33564432024-08-23 15:21:54 -07001393void av1_calc_block_eob_rate_avx2(struct macroblock *x, int plane,
1394 TX_SIZE tx_size, int eob,
Joe Younga91384f2024-08-20 13:07:18 -07001395 uint16_t *block_eob_rate) {
1396 const MACROBLOCKD *xd = &x->e_mbd;
1397 const MB_MODE_INFO *mbmi = xd->mi[0];
1398 const int is_inter = is_inter_block(mbmi, xd->tree_type);
1399 const PLANE_TYPE plane_type = get_plane_type(plane);
1400 const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
1401 const CoeffCosts *coeff_costs = &x->coeff_costs;
1402 const LV_MAP_COEFF_COST *txb_costs =
1403 &coeff_costs->coeff_costs[txs_ctx][plane_type];
1404 const int eob_multi_size = txsize_log2_minus4[tx_size];
1405 const LV_MAP_EOB_COST *txb_eob_costs =
1406 &coeff_costs->eob_costs[eob_multi_size][plane_type];
1407
1408#if CONFIG_EOB_POS_LUMA
1409 const int *tbl_eob_cost = txb_eob_costs->eob_cost[is_inter];
1410#else
1411 const int *tbl_eob_cost = txb_eob_costs->eob_cost;
1412#endif
Joe Young33564432024-08-23 15:21:54 -07001413 const int(*tbl_eob_extra)[2] = txb_costs->eob_extra_cost;
Joe Younga91384f2024-08-20 13:07:18 -07001414
1415 static const int8_t kShuf[4][32] = {
Joe Young33564432024-08-23 15:21:54 -07001416 { -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13,
1417 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, 5, 4, 5 },
1418 { 0, 1, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13, 12, 13, 12, 13,
1419 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
1420 { 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
1421 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13 },
Joe Younga91384f2024-08-20 13:07:18 -07001422 };
1423#define BC1 (1 << AV1_PROB_COST_SHIFT)
1424#define BC2 (2 * BC1)
1425 static const uint16_t kBitCost[16] = {
Joe Young33564432024-08-23 15:21:54 -07001426 0, 0, 0, 0, BC1, BC1, BC1, BC1, BC2, BC2, BC2, BC2, BC2, BC2, BC2, BC2
Joe Younga91384f2024-08-20 13:07:18 -07001427 };
1428
1429 // Write first 16 costs, block_eob_rate[0:15]
1430 // Convert 32-bit eob_pt costs { 0 1 2 3 4 5 6 7 } + eob_extra_cost
1431 // to expanded 16-bit costs { 0 1 2 2 3 3 3 3 4 4 4 4 4 4 4 4 }.
1432 __m256i eob_cost0_7 = _mm256_lddqu_si256((__m256i *)tbl_eob_cost);
1433 __m256i eob_extra0_7 = _mm256_lddqu_si256((__m256i *)tbl_eob_extra);
1434 __m256i shuf0 = _mm256_lddqu_si256((__m256i *)kShuf[0]);
1435 __m256i shuf1 = _mm256_lddqu_si256((__m256i *)kShuf[1]);
1436 __m256i eob_extra = _mm256_shuffle_epi8(eob_extra0_7, shuf0);
1437 __m256i eob_rate0_15 = _mm256_shuffle_epi8(eob_cost0_7, shuf1);
1438 eob_rate0_15 = _mm256_add_epi16(eob_rate0_15, eob_extra);
1439 __m256i bit_cost = _mm256_lddqu_si256((__m256i *)kBitCost);
1440 eob_rate0_15 = _mm256_add_epi16(eob_rate0_15, bit_cost);
1441 _mm256_storeu_si256((__m256i *)&block_eob_rate[0], eob_rate0_15);
1442
1443 // Write second 16 costs, block_eob_rate[16:31]
1444 __m256i eob_cost4_7 = _mm256_permute4x64_epi64(eob_cost0_7, 0xEE);
1445 __m256i eob_extra4_7 = _mm256_permute4x64_epi64(eob_extra0_7, 0xEE);
1446 __m256i shuf2 = _mm256_lddqu_si256((__m256i *)kShuf[2]);
1447 __m256i shuf3 = _mm256_set1_epi16(0x0504);
1448 __m256i eob_extra16_31 = _mm256_shuffle_epi8(eob_extra4_7, shuf2);
1449 __m256i eob_rate16_31 = _mm256_shuffle_epi8(eob_cost4_7, shuf3);
1450 eob_rate16_31 = _mm256_add_epi16(eob_rate16_31, eob_extra16_31);
1451 __m256i bit_cost16_31 = _mm256_set1_epi16(3 * BC1);
1452 eob_rate16_31 = _mm256_add_epi16(eob_rate16_31, bit_cost16_31);
1453 _mm256_storeu_si256((__m256i *)&block_eob_rate[16], eob_rate16_31);
1454
1455 // Write costs beyond position 32, block_eob_rate[32+]
1456 int scan_pos = 32;
1457 int n_offset_bits = 4;
1458 while (scan_pos < eob) {
1459 int eob_pt_rate = tbl_eob_cost[2 + n_offset_bits];
1460 for (int bit = 0; bit < 2; bit++) {
1461 int eob_ctx = n_offset_bits;
1462 int extra_bit_rate = tbl_eob_extra[eob_ctx][bit];
1463 int eob_rate =
1464 eob_pt_rate + extra_bit_rate + av1_cost_literal(n_offset_bits);
1465 for (int i = 0; i < (1 << n_offset_bits); i += 16) {
1466 __m256i rate = _mm256_set1_epi16(eob_rate);
1467 _mm256_storeu_si256((__m256i *)&block_eob_rate[scan_pos], rate);
1468 scan_pos += 16;
1469 }
1470 }
1471 n_offset_bits++;
1472 }
1473}
Joe Young33564432024-08-23 15:21:54 -07001474
1475static AOM_FORCE_INLINE int get_dqv(const int32_t *dequant, int coeff_idx,
1476 const qm_val_t *iqmatrix) {
1477 int dqv = dequant[!!coeff_idx];
1478 if (iqmatrix != NULL)
1479 dqv =
1480 ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
1481 return dqv;
1482}
1483
Joe Young3db806d2024-09-17 09:47:11 -07001484int av1_find_best_path_avx2(const struct tcq_node_t *trellis, int n_states_log2,
Joe Young33564432024-08-23 15:21:54 -07001485 const int16_t *scan, const int32_t *dequant,
1486 const qm_val_t *iqmatrix, const tran_low_t *tcoeff,
1487 int first_scan_pos, int log_scale,
1488 tran_low_t *qcoeff, tran_low_t *dqcoeff,
1489 int *min_rate, int64_t *min_cost) {
1490 // Select best trellis state.
Joe Young3db806d2024-09-17 09:47:11 -07001491 int n_states = 1 << n_states_log2;
Joe Young33564432024-08-23 15:21:54 -07001492 int64_t min_path_cost = INT64_MAX;
1493 int trel_min_rate = INT32_MAX;
1494 int prev_id = -2;
Joe Young3db806d2024-09-17 09:47:11 -07001495 for (int state = 0; state < n_states; state++) {
Joe Young33564432024-08-23 15:21:54 -07001496 const tcq_node_t *decision = &trellis[state];
1497 if (decision->rdCost < min_path_cost) {
1498 prev_id = state;
1499 min_path_cost = decision->rdCost;
1500 trel_min_rate = decision->rate;
1501 }
1502 }
1503
1504 // Backtrack to reconstruct qcoeff / dqcoeff blocks.
1505 int scan_pos = 0;
1506 if (!iqmatrix) {
1507 __m128i dqv = _mm_loadu_si64(dequant);
1508 __m128i dqv_ac = _mm_srli_si128(dqv, 4);
1509 __m128i zero = _mm_setzero_si128();
1510 __m128i round = _mm_set1_epi64x(1 << (QUANT_TABLE_BITS - 1));
1511 int shift = QUANT_TABLE_BITS + log_scale;
1512 for (; prev_id >= 0; scan_pos++) {
1513 const int32_t *decision =
Joe Young3db806d2024-09-17 09:47:11 -07001514 (int32_t *)&trellis[(scan_pos << n_states_log2) + prev_id];
Joe Young33564432024-08-23 15:21:54 -07001515 __m128i info = _mm_loadu_si64(&decision[3]);
1516 int blk_pos = scan[scan_pos];
1517 __m128i sign = _mm_loadu_si64(&tcoeff[blk_pos]);
1518 sign = _mm_srai_epi32(sign, 31);
1519 __m128i abs_lev = _mm_slli_epi32(info, 8);
1520 __m128i abs_lev2 = _mm_srli_epi32(abs_lev, 7);
1521 abs_lev = _mm_srli_epi32(abs_lev, 8);
Joe Youngbd3d1572024-09-12 12:57:14 -07001522 __m128i dq = _mm_slli_epi32(info, 6);
1523 dq = _mm_srli_epi32(dq, 31);
Joe Young33564432024-08-23 15:21:54 -07001524 __m128i dq_mask = _mm_srai_epi32(info, 31);
1525 dq = _mm_andnot_si128(dq_mask, dq);
1526 abs_lev2 = _mm_sub_epi32(abs_lev2, dq);
1527 abs_lev2 = _mm_max_epi32(abs_lev2, zero);
1528 __m128i dqc = _mm_mul_epi32(abs_lev2, dqv);
1529 dqc = _mm_add_epi64(dqc, round);
1530 dqc = _mm_srli_epi64(dqc, shift);
1531 dqc = _mm_xor_si128(dqc, sign);
1532 dqc = _mm_sub_epi32(dqc, sign);
1533 __m128i lev = _mm_xor_si128(abs_lev, sign);
1534 lev = _mm_sub_epi32(lev, sign);
1535#if 1
1536 // Older compilers don't implement _mm_storeu_si32()
1537 _mm_store_ss((float *)&qcoeff[blk_pos], _mm_castsi128_ps(lev));
1538 _mm_store_ss((float *)&dqcoeff[blk_pos], _mm_castsi128_ps(dqc));
1539#else
1540 _mm_storeu_si32(&qcoeff[blk_pos], lev);
1541 _mm_storeu_si32(&dqcoeff[blk_pos], dqc);
1542#endif
1543 dqv = dqv_ac;
1544 __m128i prevId = _mm_srai_epi32(info, 24);
1545 prev_id = _mm_extract_epi32(prevId, 0);
1546 }
1547 } else {
1548 for (; prev_id >= 0; scan_pos++) {
Joe Young3db806d2024-09-17 09:47:11 -07001549 const tcq_node_t *decision =
1550 &trellis[(scan_pos << n_states_log2) + prev_id];
Joe Young33564432024-08-23 15:21:54 -07001551 prev_id = decision->prevId;
1552 int abs_level = decision->absLevel;
1553 int blk_pos = scan[scan_pos];
1554 int sign = tcoeff[blk_pos] < 0;
1555 qcoeff[blk_pos] = sign ? -abs_level : abs_level;
1556 int dqv = get_dqv(dequant, blk_pos, iqmatrix);
1557 int dq = prev_id >= 0 ? tcq_quant(prev_id) : 0;
1558 int qc = (abs_level == 0) ? 0 : (2 * abs_level - dq);
1559 int dqc = (tran_low_t)ROUND_POWER_OF_TWO_64((tran_high_t)qc * dqv,
1560 QUANT_TABLE_BITS) >>
1561 log_scale;
1562 dqcoeff[blk_pos] = sign ? -dqc : dqc;
1563 }
1564 }
1565 int eob = scan_pos;
1566
1567 for (; scan_pos <= first_scan_pos; scan_pos++) {
1568 int blk_pos = scan[scan_pos];
1569 qcoeff[blk_pos] = 0;
1570 dqcoeff[blk_pos] = 0;
1571 }
1572
1573 *min_rate = trel_min_rate;
1574 *min_cost = min_path_cost;
1575 return eob;
1576}