Highbd parallel_deblocking sse2 optimization - Decoder speed improves ~13.7% (baseline + parallel_deblocking). - Highbd loopfilter AVX2 version works when this experiment is disabled. Change-Id: I5d56b137a1d52236a4735656c370d57ef71ae043
diff --git a/aom_dsp/x86/highbd_loopfilter_avx2.c b/aom_dsp/x86/highbd_loopfilter_avx2.c index b776897..94c6888 100644 --- a/aom_dsp/x86/highbd_loopfilter_avx2.c +++ b/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -16,6 +16,7 @@ #include "aom_dsp/x86/lpf_common_sse2.h" #include "aom/aom_integer.h" +#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, const uint8_t *t, int bd, __m256i *blt, __m256i *lt, __m256i *thr) { @@ -200,7 +201,54 @@ qs[1] = _mm256_adds_epi16(qs1, t80); ps[1] = _mm256_adds_epi16(ps1, t80); } +#endif // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 +void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd); +} + +void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, + const uint8_t *blt, const uint8_t *lt, + const uint8_t *thr, int bd) { + aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd); +} + +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} +#else void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch, const uint8_t *blt, const uint8_t *lt, @@ -822,3 +870,4 @@ // Transpose back highbd_transpose(src, 16, dst, p, 2); } +#endif // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c index 68668ab..0a399ed 100644 --- a/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -31,100 +31,116 @@ *pixel = _mm_or_si128(clamped, mask); } -// TODO(debargha, peter): Break up large functions into smaller ones -// in this file. -void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - __m128i blimit, limit, thresh; - __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; - __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; - __m128i ps1, qs1, ps0, qs0; - __m128i abs_p0q0, abs_p1q1, ffff, work; - __m128i filt, work_a, filter1, filter2; - __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4; - __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1; - __m128i flat2_q0, flat2_p0; - __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3; - __m128i t4, t3, t80, t1; - __m128i eight, four; +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m128i *blt, + __m128i *lt, __m128i *thr) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); - if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); - } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); - } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + *blt = _mm_slli_epi16(x, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + *lt = _mm_slli_epi16(x, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + *thr = _mm_slli_epi16(x, shift); +} + +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m128i *p, __m128i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); + } +} +// _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); +static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q, + const __m128i *t, __m128i *hev) { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1])); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1])); + __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); + h = _mm_subs_epu16(h, *t); + + const __m128i ffff = _mm_set1_epi16(0xFFFF); + const __m128i zero = _mm_setzero_si128(); + *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); +} + +static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q, + const __m128i *l, const __m128i *bl, + __m128i *mask) { + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0])); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1])); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16(0xFFFF); + __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]), + _mm_subs_epu16(p[i - 1], p[i]))); + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]), + _mm_subs_epu16(q[i - 1], q[i]))); + } + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // return ~mask +} + +static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p, + const __m128i *q, int bd, int start, + int end, __m128i *flat) { + __m128i max = _mm_setzero_si128(); + int i; + for (i = start; i < end; ++i) { + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]), + _mm_subs_epu16(p[0], p[i]))); + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]), + _mm_subs_epu16(q[0], q[i]))); } - q4 = _mm_load_si128((__m128i *)(s + 4 * p)); - p4 = _mm_load_si128((__m128i *)(s - 5 * p)); - q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - q0 = _mm_load_si128((__m128i *)(s + 0 * p)); - p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + __m128i ft; + if (bd == 8) + ft = _mm_subs_epu16(max, *th); + else if (bd == 10) + ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2)); + else // bd == 12 + ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4)); - // highbd_filter_mask - abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); - abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} - ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); +// Note: +// Access p[3-1], p[0], and q[3-1], q[0] +static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p, + const __m128i *q, __m128i *flat, int bd) { + // check the distance 1,2,3 against 0 + flat_mask_internal(th, p, q, bd, 1, 4, flat); +} - abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); +// Note: +// access p[7-4], p[0], and q[7-4], q[0] +static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p, + const __m128i *q, __m128i *flat, int bd) { + flat_mask_internal(th, p, q, bd, 4, 8, flat); +} - // highbd_hev_mask (in C code this is actually called from highbd_filter4) - flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); - - abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), - _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); - mask = _mm_max_epi16(work, mask); - - mask = _mm_subs_epu16(mask, limit); - mask = _mm_cmpeq_epi16(mask, zero); // return ~mask - - // lp filter - // highbd_filter4 - t4 = _mm_set1_epi16(4); - t3 = _mm_set1_epi16(3); +static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask, + const __m128i *th, int bd, __m128i *ps, + __m128i *qs) { + __m128i t80; if (bd == 8) t80 = _mm_set1_epi16(0x80); else if (bd == 10) @@ -132,360 +148,283 @@ else // bd == 12 t80 = _mm_set1_epi16(0x800); - t1 = _mm_set1_epi16(0x1); + __m128i ps0 = _mm_subs_epi16(p[0], t80); + __m128i ps1 = _mm_subs_epi16(p[1], t80); + __m128i qs0 = _mm_subs_epi16(q[0], t80); + __m128i qs1 = _mm_subs_epi16(q[1], t80); - ps1 = _mm_subs_epi16(p1, t80); - qs1 = _mm_subs_epi16(q1, t80); - ps0 = _mm_subs_epi16(p0, t80); - qs0 = _mm_subs_epi16(q0, t80); - + const __m128i one = _mm_set1_epi16(1); const __m128i pmax = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); + const __m128i zero = _mm_setzero_si128(); const __m128i pmin = _mm_subs_epi16(zero, t80); - filt = _mm_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm_and_si128(filt, hev); + __m128i filter = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); - work_a = _mm_subs_epi16(qs0, ps0); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); + __m128i hev; + highbd_hev_mask(p, q, th, &hev); + filter = _mm_and_si128(filter, hev); - filt = _mm_adds_epi16(filt, work_a); - pixel_clamp(&pmin, &pmax, &filt); + const __m128i x = _mm_subs_epi16(qs0, ps0); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm_and_si128(filter, *mask); - filt = _mm_and_si128(filt, mask); + const __m128i t3 = _mm_set1_epi16(3); + const __m128i t4 = _mm_set1_epi16(4); - filter1 = _mm_adds_epi16(filt, t4); + __m128i filter1 = _mm_adds_epi16(filter, t4); + __m128i filter2 = _mm_adds_epi16(filter, t3); pixel_clamp(&pmin, &pmax, &filter1); - filter2 = _mm_adds_epi16(filt, t3); pixel_clamp(&pmin, &pmax, &filter2); - - // Filter1 >> 3 - filter1 = _mm_srai_epi16(filter1, 0x3); - filter2 = _mm_srai_epi16(filter2, 0x3); + filter1 = _mm_srai_epi16(filter1, 3); + filter2 = _mm_srai_epi16(filter2, 3); qs0 = _mm_subs_epi16(qs0, filter1); pixel_clamp(&pmin, &pmax, &qs0); - qs0 = _mm_adds_epi16(qs0, t80); - ps0 = _mm_adds_epi16(ps0, filter2); pixel_clamp(&pmin, &pmax, &ps0); - ps0 = _mm_adds_epi16(ps0, t80); - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(hev, filt); + qs[0] = _mm_adds_epi16(qs0, t80); + ps[0] = _mm_adds_epi16(ps0, t80); - qs1 = _mm_subs_epi16(qs1, filt); + filter = _mm_adds_epi16(filter1, one); + filter = _mm_srai_epi16(filter, 1); + filter = _mm_andnot_si128(hev, filter); + + qs1 = _mm_subs_epi16(qs1, filter); pixel_clamp(&pmin, &pmax, &qs1); - qs1 = _mm_adds_epi16(qs1, t80); - - ps1 = _mm_adds_epi16(ps1, filt); + ps1 = _mm_adds_epi16(ps1, filter); pixel_clamp(&pmin, &pmax, &ps1); - ps1 = _mm_adds_epi16(ps1, t80); - // end highbd_filter4 - // loopfilter done + qs[1] = _mm_adds_epi16(qs1, t80); + ps[1] = _mm_adds_epi16(ps1, t80); +} - // highbd_flat_mask4 - flat = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3))); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); - flat = _mm_max_epi16(work, flat); - work = _mm_max_epi16(abs_p1p0, abs_q1q0); - flat = _mm_max_epi16(work, flat); +typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput; - if (bd == 8) - flat = _mm_subs_epu16(flat, one); - else if (bd == 10) - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); - else // bd == 12 - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); +static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd, + PixelOutput pixel_output) { + __m128i blimit, limit, thresh; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); - flat = _mm_cmpeq_epi16(flat, zero); - // end flat_mask4 + __m128i p[8], q[8]; + load_highbd_pixel(s, 8, pitch, p, q); - // flat & mask = flat && mask (as used in filter8) - // (because, in both vars, each block of 16 either all 1s or all 0s) + __m128i mask; + highbd_filter_mask(p, q, &limit, &blimit, &mask); + + __m128i flat, flat2; + const __m128i one = _mm_set1_epi16(1); + highbd_flat_mask4(&one, p, q, &flat, bd); + highbd_flat_mask5(&one, p, q, &flat2, bd); + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); - p5 = _mm_load_si128((__m128i *)(s - 6 * p)); - q5 = _mm_load_si128((__m128i *)(s + 5 * p)); - p6 = _mm_load_si128((__m128i *)(s - 7 * p)); - q6 = _mm_load_si128((__m128i *)(s + 6 * p)); - p7 = _mm_load_si128((__m128i *)(s - 8 * p)); - q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + __m128i ps[2], qs[2]; + highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); - // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 - // but referred to as p0-p4 & q0-q4 in fn) - flat2 = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)), - _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4))); - - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)), - _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5))); - flat2 = _mm_max_epi16(work, flat2); - - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)), - _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6))); - flat2 = _mm_max_epi16(work, flat2); - - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)), - _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7))); - flat2 = _mm_max_epi16(work, flat2); - - if (bd == 8) - flat2 = _mm_subs_epu16(flat2, one); - else if (bd == 10) - flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2)); - else // bd == 12 - flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4)); - - flat2 = _mm_cmpeq_epi16(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - // end highbd_flat_mask5 - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations - eight = _mm_set1_epi16(8); - four = _mm_set1_epi16(4); + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[7], flat2_q[7]; + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3)); + __m128i sum_p = + _mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3])); + __m128i sum_q = + _mm_add_epi16(_mm_add_epi16(q[6], q[5]), _mm_add_epi16(q[4], q[3])); - pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); - pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = - _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16( - four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - flat2_p0 = - _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4); - flat2_q0 = - _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4); - flat_p0 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3); - flat_q0 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3); + __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); + sum_q = _mm_add_epi16(sum_q, sum_lq); + sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - sum_p7 = _mm_add_epi16(p7, p7); - sum_q7 = _mm_add_epi16(q7, q7); - sum_p3 = _mm_add_epi16(p3, p3); - sum_q3 = _mm_add_epi16(q3, q3); + flat2_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(p[7], p[0])), 4); + flat2_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(q[7], q[0])), 4); + flat_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); + flat_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6); - flat2_p1 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4); - flat2_q1 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4); + __m128i sum_p7 = _mm_add_epi16(p[7], p[7]); + __m128i sum_q7 = _mm_add_epi16(q[7], q[7]); + __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); + __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2); - flat_p1 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3); - flat_q1 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3); + sum_q = _mm_sub_epi16(sum_p, p[6]); + sum_p = _mm_sub_epi16(sum_p, q[6]); + flat2_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[1])), 4); + flat2_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[1])), 4); - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - sum_p3 = _mm_add_epi16(sum_p3, p3); - sum_q3 = _mm_add_epi16(sum_q3, q3); + sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + flat_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5); - flat2_p2 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4); - flat2_q2 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4); + sum_p7 = _mm_add_epi16(sum_p7, p[7]); + sum_q7 = _mm_add_epi16(sum_q7, q[7]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1); - flat_p2 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3); - flat_q2 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3); + sum_p = _mm_sub_epi16(sum_p, q[5]); + sum_q = _mm_sub_epi16(sum_q, p[5]); + flat2_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[2])), 4); + flat2_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[2])), 4); - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4); - flat2_p3 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4); - flat2_q3 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4); + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, p[1]); + flat_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3); - flat2_p4 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4); - flat2_q4 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4); + int i; + for (i = 3; i < 7; ++i) { + sum_p7 = _mm_add_epi16(sum_p7, p[7]); + sum_q7 = _mm_add_epi16(sum_q7, q[7]); + sum_p = _mm_sub_epi16(sum_p, q[7 - i]); + sum_q = _mm_sub_epi16(sum_q, p[7 - i]); + flat2_p[i] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[i])), 4); + flat2_q[i] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4); + } + } - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2); - flat2_p5 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4); - flat2_q5 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1); - flat2_p6 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4); - flat2_q6 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4); - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // highbd_filter8 - p2 = _mm_andnot_si128(flat, p2); + // highbd_filter8 + p[2] = _mm_andnot_si128(flat, p[2]); // p2 remains unchanged if !(flat && mask) - flat_p2 = _mm_and_si128(flat, flat_p2); + flat_p[2] = _mm_and_si128(flat, flat_p[2]); // when (flat && mask) - p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values - ps1 = _mm_andnot_si128(flat, ps1); - // p1 takes the value assigned to in in filter4 if !(flat && mask) - flat_p1 = _mm_and_si128(flat, flat_p1); - // when (flat && mask) - p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values - qs1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values - - ps0 = _mm_andnot_si128(flat, ps0); - // p0 takes the value assigned to in in filter4 if !(flat && mask) - flat_p0 = _mm_and_si128(flat, flat_p0); - // when (flat && mask) - p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values - qs0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values - // end highbd_filter8 + int i; + for (i = 1; i >= 0; i--) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } // highbd_filter16 - p6 = _mm_andnot_si128(flat2, p6); - // p6 remains unchanged if !(flat2 && flat && mask) - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - // get values for when (flat2 && flat && mask) - p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values - q6 = _mm_andnot_si128(flat2, q6); - // q6 remains unchanged if !(flat2 && flat && mask) - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - // get values for when (flat2 && flat && mask) - q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values - _mm_store_si128((__m128i *)(s - 7 * p), p6); - _mm_store_si128((__m128i *)(s + 6 * p), q6); - p5 = _mm_andnot_si128(flat2, p5); - // p5 remains unchanged if !(flat2 && flat && mask) - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - // get values for when (flat2 && flat && mask) - p5 = _mm_or_si128(p5, flat2_p5); - // full list of p5 values - q5 = _mm_andnot_si128(flat2, q5); - // q5 remains unchanged if !(flat2 && flat && mask) - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - // get values for when (flat2 && flat && mask) - q5 = _mm_or_si128(q5, flat2_q5); - // full list of q5 values - _mm_store_si128((__m128i *)(s - 6 * p), p5); - _mm_store_si128((__m128i *)(s + 5 * p), q5); + if (pixel_output == FOUR_PIXELS) { + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - p4 = _mm_andnot_si128(flat2, p4); - // p4 remains unchanged if !(flat2 && flat && mask) - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - // get values for when (flat2 && flat && mask) - p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values - q4 = _mm_andnot_si128(flat2, q4); - // q4 remains unchanged if !(flat2 && flat && mask) - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - // get values for when (flat2 && flat && mask) - q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values - _mm_store_si128((__m128i *)(s - 5 * p), p4); - _mm_store_si128((__m128i *)(s + 4 * p), q4); + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]); + } + } else { // EIGHT_PIXELS + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - p3 = _mm_andnot_si128(flat2, p3); - // p3 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - // get values for when (flat2 && flat && mask) - p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values - q3 = _mm_andnot_si128(flat2, q3); - // q3 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - // get values for when (flat2 && flat && mask) - q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values - _mm_store_si128((__m128i *)(s - 4 * p), p3); - _mm_store_si128((__m128i *)(s + 3 * p), q3); + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_store_si128((__m128i *)(s + i * pitch), q[i]); + } + } +} - p2 = _mm_andnot_si128(flat2, p2); - // p2 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - // get values for when (flat2 && flat && mask) - p2 = _mm_or_si128(p2, flat2_p2); - // full list of p2 values - q2 = _mm_andnot_si128(flat2, q2); - // q2 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - // get values for when (flat2 && flat && mask) - q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s + 2 * p), q2); +// Note: +// highbd_lpf_horz_edge_8_8p() output 8 pixels per register +// highbd_lpf_horz_edge_8_4p() output 4 pixels per register +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 +static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS); +} +#endif // #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - p1 = _mm_andnot_si128(flat2, p1); - // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - // get values for when (flat2 && flat && mask) - p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values - q1 = _mm_andnot_si128(flat2, q1); - // q1 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q1 = _mm_and_si128(flat2, flat2_q1); - // get values for when (flat2 && flat && mask) - q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s + 1 * p), q1); +static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS); +} - p0 = _mm_andnot_si128(flat2, p0); - // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - // get values for when (flat2 && flat && mask) - p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values - q0 = _mm_andnot_si128(flat2, q0); - // q0 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q0 = _mm_and_si128(flat2, flat2_q0); - // get values for when (flat2 && flat && mask) - q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s - 0 * p), q0); +void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); +#else + highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); +#endif } void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { - aom_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd); - aom_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd); +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); +#else + highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); + highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd); +#endif +} + +static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1, + const __m128i *p0, const __m128i *q0, + const __m128i *q1, const __m128i *q2, + int p, uint16_t *s) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + _mm_storel_epi64((__m128i *)(s - 3 * p), *p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), *p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), *p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), *q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), *q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), *q2); +#else + _mm_store_si128((__m128i *)(s - 3 * p), *p2); + _mm_store_si128((__m128i *)(s - 2 * p), *p1); + _mm_store_si128((__m128i *)(s - 1 * p), *p0); + _mm_store_si128((__m128i *)(s + 0 * p), *q0); + _mm_store_si128((__m128i *)(s + 1 * p), *q1); + _mm_store_si128((__m128i *)(s + 2 * p), *q2); +#endif } void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, @@ -501,14 +440,14 @@ const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; - __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_cmpeq_epi16(one, one); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -715,12 +654,7 @@ p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s + 0 * p), q0); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s); } void aom_highbd_lpf_horizontal_8_dual_sse2( @@ -738,14 +672,18 @@ const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); +#endif __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); +#endif const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); const __m128i abs_q1q0 = @@ -756,7 +694,7 @@ _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); - __m128i work; + const __m128i t4 = _mm_set1_epi16(4); const __m128i t3 = _mm_set1_epi16(3); __m128i t80; @@ -827,9 +765,9 @@ // So taking maximums continues to work: mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); mask = _mm_max_epi16(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epi16( + +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) + __m128i work = _mm_max_epi16( _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); mask = _mm_max_epi16(work, mask); @@ -837,6 +775,7 @@ _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); +#endif mask = _mm_subs_epu16(mask, limit); mask = _mm_cmpeq_epi16(mask, zero); @@ -902,11 +841,17 @@ p1 = _mm_adds_epi16(ps1, filt); pixel_clamp(&pmin, &pmax, &p1); p1 = _mm_adds_epi16(p1, t80); - +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); +#else _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); _mm_storeu_si128((__m128i *)(s + 0 * p), q0); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +#endif } void aom_highbd_lpf_horizontal_4_dual_sse2( @@ -1047,10 +992,12 @@ highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); - // Loop filtering +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); +#else aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); - +#endif // Transpose back highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c index 9d702fe..8d1b76c 100644 --- a/av1/common/av1_loopfilter.c +++ b/av1/common/av1_loopfilter.c
@@ -2997,9 +2997,9 @@ case 4: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(filt_start), + line_length, params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_vertical_4_c(filt_start, line_length, params.mblim, @@ -3009,9 +3009,9 @@ case 8: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_8_c(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(filt_start), + line_length, params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_vertical_8_c(filt_start, line_length, params.mblim, @@ -3021,9 +3021,9 @@ case 16: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_16_c( - CONVERT_TO_SHORTPTR(filt_start), line_length, params.mblim, - params.lim, params.hev_thr, cm->bit_depth); + aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(filt_start), + line_length, params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_vertical_16_c(filt_start, line_length, params.mblim, @@ -3055,9 +3055,9 @@ uint8_t *const filt_start = block + pivot; #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(filt_start), + line_length, params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_vertical_4_c(filt_start, line_length, params.mblim, @@ -3067,15 +3067,15 @@ if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; } } -#else // CONFIG_LPF_DIRECT +#else // !CONFIG_LPF_DIRECT switch (params.filter_length) { // apply 4-tap filtering case 4: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(p), dst_stride, - params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_vertical_4_c(p, dst_stride, params.mblim, params.lim, @@ -3085,9 +3085,9 @@ case 8: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_8_c(CONVERT_TO_SHORTPTR(p), dst_stride, - params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_vertical_8_c(p, dst_stride, params.mblim, params.lim, @@ -3097,9 +3097,9 @@ case 16: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_16_c(CONVERT_TO_SHORTPTR(p), dst_stride, - params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_vertical_16_c(p, dst_stride, params.mblim, params.lim, @@ -3112,9 +3112,9 @@ if (params.filter_length_internal) { #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(p + 4), dst_stride, - params.mblim, params.lim, params.hev_thr, - cm->bit_depth); + aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p + 4), dst_stride, + params.mblim, params.lim, params.hev_thr, + cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_vertical_4_c(p + 4, dst_stride, params.mblim, params.lim, @@ -3183,9 +3183,9 @@ case 4: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_4_c( - CONVERT_TO_SHORTPTR(filt_start), line_length, params.mblim, - params.lim, params.hev_thr, cm->bit_depth); + aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(filt_start), + line_length, params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_horizontal_4_c(filt_start, line_length, params.mblim, @@ -3195,9 +3195,9 @@ case 8: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_8_c( - CONVERT_TO_SHORTPTR(filt_start), line_length, params.mblim, - params.lim, params.hev_thr, cm->bit_depth); + aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(filt_start), + line_length, params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_horizontal_8_c(filt_start, line_length, params.mblim, @@ -3207,7 +3207,7 @@ case 16: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_edge_16_c( + aom_highbd_lpf_horizontal_edge_16( CONVERT_TO_SHORTPTR(filt_start), line_length, params.mblim, params.lim, params.hev_thr, cm->bit_depth); else @@ -3241,9 +3241,9 @@ uint8_t *const filt_start = block + pivot * line_length; #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(filt_start), - line_length, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(filt_start), + line_length, params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_horizontal_4_c(filt_start, line_length, params.mblim, @@ -3253,15 +3253,15 @@ if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i]; } } -#else // CONFIG_LPF_DIRECT +#else // !CONFIG_LPF_DIRECT switch (params.filter_length) { // apply 4-tap filtering case 4: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(p), dst_stride, - params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_horizontal_4_c(p, dst_stride, params.mblim, params.lim, @@ -3271,9 +3271,9 @@ case 8: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_8_c(CONVERT_TO_SHORTPTR(p), dst_stride, - params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_horizontal_8_c(p, dst_stride, params.mblim, params.lim, @@ -3283,7 +3283,7 @@ case 16: #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_edge_16_c( + aom_highbd_lpf_horizontal_edge_16( CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim, params.hev_thr, cm->bit_depth); else @@ -3298,9 +3298,9 @@ if (params.filter_length_internal) { #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(p + 4 * dst_stride), - dst_stride, params.mblim, params.lim, - params.hev_thr, cm->bit_depth); + aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p + 4 * dst_stride), + dst_stride, params.mblim, params.lim, + params.hev_thr, cm->bit_depth); else #endif // CONFIG_HIGHBITDEPTH aom_lpf_horizontal_4_c(p + 4 * dst_stride, dst_stride, params.mblim,
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc index 30ef04a..5cbd92e 100644 --- a/test/lpf_8_test.cc +++ b/test/lpf_8_test.cc
@@ -499,7 +499,7 @@ using std::tr1::make_tuple; -#if HAVE_SSE2 && (!CONFIG_PARALLEL_DEBLOCKING) +#if HAVE_SSE2 #if CONFIG_HIGHBITDEPTH const loop8_param_t kHbdLoop8Test6[] = { @@ -550,6 +550,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param, ::testing::ValuesIn(kHbdLoop8Test6)); #else +#if !CONFIG_PARALLEL_DEBLOCKING const loop8_param_t kLoop8Test6[] = { make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8), make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8), @@ -564,10 +565,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param, ::testing::ValuesIn(kLoop8Test6)); -#endif // CONFIG_HIGHBITDEPTH #endif +#endif // CONFIG_HIGHBITDEPTH +#endif // HAVE_SSE2 -#if HAVE_AVX2 && (!CONFIG_PARALLEL_DEBLOCKING) +#if HAVE_AVX2 #if CONFIG_HIGHBITDEPTH const loop8_param_t kHbdLoop8Test6Avx2[] = { @@ -600,7 +602,7 @@ &aom_lpf_horizontal_edge_16_c, 8))); #endif -#if HAVE_SSE2 && (!CONFIG_PARALLEL_DEBLOCKING) +#if HAVE_SSE2 #if CONFIG_HIGHBITDEPTH const dualloop8_param_t kHbdLoop8Test9[] = { make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2, @@ -632,6 +634,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param, ::testing::ValuesIn(kHbdLoop8Test9)); #else +#if !CONFIG_PARALLEL_DEBLOCKING const dualloop8_param_t kLoop8Test9[] = { make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8), make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8), @@ -641,10 +644,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param, ::testing::ValuesIn(kLoop8Test9)); -#endif // CONFIG_HIGHBITDEPTH #endif +#endif // CONFIG_HIGHBITDEPTH +#endif // HAVE_SSE2 -#if HAVE_AVX2 && (!CONFIG_PARALLEL_DEBLOCKING) +#if HAVE_AVX2 #if CONFIG_HIGHBITDEPTH const dualloop8_param_t kHbdLoop8Test9Avx2[] = { make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,