hbd lpf sse2 perf and code quality improvement

aom_highbd_lpf_horizontal_14_sse2 gains ~1.15x perf, due to using the
full sse2 register width for some ops.
highbd_hev_filter_mask_x_sse2 added to improve code quality.
Bit-depth-dependent if blocks in the _4, _6, _8 and _14 variants
eliminated.
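
The core of the change, as a minimal sketch (hypothetical helper names,
illustration only; the real code below inlines these patterns):

    #include <emmintrin.h>

    // Pack p[i] into the low 64 bits and q[i] into the high 64 bits of
    // one register, so a single full-width SSE2 op processes both sides
    // of the edge at once.
    static __m128i merge_pq(__m128i p, __m128i q) {
      return _mm_unpacklo_epi64(p, q);
    }

    // After a merged op, p results sit in the low half and q results in
    // the high half; one shift+max folds the two halves together.
    static __m128i fold_max16(__m128i v) {
      return _mm_max_epi16(v, _mm_srli_si128(v, 8));
    }
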
Change-Id: Ie28a70798833c95fb21cac238ffdebfcead5f0a7
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
index 27adf67..83e0098 100644
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -80,24 +80,9 @@
}
}
-static INLINE void highbd_hev_mask(const __m128i *p0q0, const __m128i *p1q1,
- const __m128i *t, __m128i *abs_p1p0,
- __m128i *hev) {
- *abs_p1p0 = abs_diff16(*p1q1, *p0q0);
- __m128i abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
- __m128i h = _mm_max_epi16(*abs_p1p0, abs_q1q0);
- h = _mm_subs_epu16(h, *t);
-
- const __m128i ffff = _mm_set1_epi16(0xFFFF);
- const __m128i zero = _mm_setzero_si128();
- *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
- // replicate for the further "merged variables" usage
- *hev = _mm_unpacklo_epi64(*hev, *hev);
-}
-
-static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q,
- const __m128i *l, const __m128i *bl,
- __m128i *mask) {
+static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
+ const __m128i *l, const __m128i *bl,
+ __m128i *mask) {
__m128i abs_p0q0 = abs_diff16(p[0], q[0]);
__m128i abs_p1q1 = abs_diff16(p[1], q[1]);
abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
@@ -106,6 +91,7 @@
const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
const __m128i ffff = _mm_set1_epi16(0xFFFF);
+
__m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
@@ -119,9 +105,76 @@
*mask = _mm_cmpeq_epi16(max, zero); // return ~mask
}
-static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p,
- const __m128i *q, int bd, int start,
- int end, __m128i *flat) {
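+// Compute the hev mask and the filter mask for x taps in one pass.
+// pq[i] carries p[i] in its low 64 bits and q[i] in its high 64 bits,
+// so each op below covers both sides of the edge.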
+static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
+ __m128i *p1p0, __m128i *q1q0,
+ __m128i *abs_p1p0, __m128i *l,
+ __m128i *bl, __m128i *t,
+ __m128i *hev, __m128i *mask) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_set1_epi16(0xFFFF);
+ __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
+ __m128i max, max01, h;
+
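+  // regroup the merged inputs: p1p0 = p0|p1, q1q0 = q0|q1 (low|high)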
+ *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
+ *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
+
+ abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+ abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2
+
+ max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+ max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
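+  // |p1 - p0| in the low half, |q1 - q0| in the high half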
+ *abs_p1p0 = abs_diff16(pq[0], pq[1]);
+ abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
+ max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
+ // mask |= (abs(*p1 - *p0) > limit) * -1;
+ // mask |= (abs(*q1 - *q0) > limit) * -1;
+ h = _mm_subs_epu16(max01, *t);
+
+ *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+ // replicate for use with the merged variables below
+ *hev = _mm_unpacklo_epi64(*hev, *hev);
+
+ max = _mm_max_epi16(max, max01);
+ int i;
+ for (i = 2; i < x; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
+ }
+ max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+ max = _mm_subs_epu16(max, *l);
+ *mask = _mm_cmpeq_epi16(max, zero); // ~mask
+}
+
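+// Flat mask over merged registers: fold max(|pq[i] - pq[0]|) for i in
+// [start, end) across both halves, then compare against the threshold.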
+static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
+ int start, int end, __m128i *flat) {
+ int i;
+ __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
+ abs_diff16(pq[start + 1], pq[0]));
+
+ for (i = start + 2; i < end; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
+ }
+ max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+ __m128i ft;
+ ft = _mm_subs_epu16(max, *th);
+
+ const __m128i zero = _mm_setzero_si128();
+ *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
+ const __m128i *q, int start, int end,
+ __m128i *flat) {
int i;
__m128i max =
_mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
@@ -132,31 +185,29 @@
}
__m128i ft;
- if (bd == 8)
- ft = _mm_subs_epu16(max, *th);
- else if (bd == 10)
- ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2));
- else // bd == 12
- ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4));
+ ft = _mm_subs_epu16(max, *th);
const __m128i zero = _mm_setzero_si128();
*flat = _mm_cmpeq_epi16(ft, zero);
}
-// Note:
-// Access p[3-1], p[0], and q[3-1], q[0]
-static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p,
- const __m128i *q, __m128i *flat, int bd) {
+static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
+ __m128i *flat2, int bd) {
// check the distance 1,2,3 against 0
- flat_mask_internal(th, p, q, bd, 1, 4, flat);
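+  // threshold scales with bit depth: 1 << (bd - 8) is 1/4/16 for
+  // 8/10/12-bit input, replacing the per-bd if/else chains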
+ __m128i th = _mm_set1_epi16(1);
+ th = _mm_slli_epi16(th, bd - 8);
+ flat_mask_internal(&th, pq, 1, 4, flat);
+ flat_mask_internal(&th, pq, 4, 7, flat2);
}
-// Note:
-// access p[6-4], p[0], and q[6-4], q[0]
-static INLINE void highbd_flat_mask4_13(const __m128i *th, const __m128i *p,
- const __m128i *q, __m128i *flat,
- int bd) {
- flat_mask_internal(th, p, q, bd, 4, 7, flat);
+static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
+ const __m128i *q, __m128i *flat,
+ __m128i *flat2, int bd) {
+  // check the distances 1,2,3 (flat) and 4,5,6 (flat2) against 0
+ __m128i th = _mm_set1_epi16(1);
+ th = _mm_slli_epi16(th, bd - 8);
+ flat_mask_internal_dual(&th, p, q, 1, 4, flat);
+ flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
}
static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
@@ -280,13 +331,21 @@
__m128i t80;
get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
- __m128i mask;
- highbd_filter_mask(p, q, &limit, &blimit, &mask);
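+  // merge p/q pairs so each sse2 op below filters both sides at once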
+ for (i = 0; i < 7; i++) {
+ pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
+ }
+ __m128i mask, hevhev;
+ __m128i p1p0, q1q0, abs_p1p0;
+
+ highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hevhev, &mask);
+
+ __m128i ps0ps1, qs0qs1;
+ // filter4
+ highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
__m128i flat, flat2;
- const __m128i one = _mm_set1_epi16(1);
- highbd_flat_mask4(&one, p, q, &flat, bd);
- highbd_flat_mask4_13(&one, p, q, &flat2, bd);
+ highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
flat = _mm_and_si128(flat, mask);
flat2 = _mm_and_si128(flat2, flat);
@@ -295,68 +354,50 @@
flat = _mm_unpacklo_epi64(flat, flat);
flat2 = _mm_unpacklo_epi64(flat2, flat2);
- __m128i ps0ps1, qs0qs1, p1p0, q1q0;
-
- // filters - hev and filter4
- __m128i hevhev;
- __m128i abs_p1p0;
- for (i = 0; i < 6; i++) {
- pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
- }
-
- highbd_hev_mask(&pq[0], &pq[1], &thresh, &abs_p1p0, &hevhev);
-
- p1p0 = _mm_unpacklo_epi64(p[0], p[1]);
- q1q0 = _mm_unpacklo_epi64(q[0], q[1]);
- highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
-
// flat and wide flat calculations
__m128i flat_p[3], flat_q[3], flat_pq[3];
__m128i flat2_p[6], flat2_q[6];
__m128i flat2_pq[6];
-
{
+ __m128i work0;
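+      // work0 holds the p-side tap sum in its low half and the matching
+      // q-side sum in its high half; a byte shift extracts the q side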
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
- __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
- __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
-
- __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
+ __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
+ __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
sum_p = _mm_add_epi16(sum_p, sum_lp);
- __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
- sum_q = _mm_add_epi16(sum_q, sum_lq);
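+      // the q-side sums are the high halves of the merged p-side sums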
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
- flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
- _mm_add_epi16(p[1], q[0])));
- flat2_q[0] = _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
- _mm_add_epi16(p[0], q[1])));
+ work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+ flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0]));
+ flat2_q[0] =
+ _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0]));
flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0]));
flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
- __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
- __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
- __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
- __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
+
+ __m128i sum_p6, sum_p3;
+ sum_p6 = _mm_add_epi16(pq[6], pq[6]);
+ sum_p3 = _mm_add_epi16(pq[3], pq[3]);
sum_q = _mm_sub_epi16(sum_p, p[5]);
sum_p = _mm_sub_epi16(sum_p, q[5]);
- flat2_p[1] = _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0]))));
- flat2_q[1] = _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2]))));
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
+ flat2_p[1] = _mm_add_epi16(sum_p, work0);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
sum_lq = _mm_sub_epi16(sum_lp, p[2]);
sum_lp = _mm_sub_epi16(sum_lp, q[2]);
- flat_p[1] = _mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1]));
- flat_q[1] = _mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1]));
+ work0 = _mm_add_epi16(sum_p3, pq[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
@@ -364,61 +405,54 @@
flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p3 = _mm_add_epi16(sum_p3, p[3]);
- sum_q3 = _mm_add_epi16(sum_q3, q[3]);
sum_p = _mm_sub_epi16(sum_p, q[4]);
sum_q = _mm_sub_epi16(sum_q, p[4]);
- flat2_p[2] = _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1]))));
- flat2_q[2] = _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3]))));
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
sum_lp = _mm_sub_epi16(sum_lp, q[1]);
sum_lq = _mm_sub_epi16(sum_lq, p[1]);
- flat_p[2] = _mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2]));
- flat_q[2] = _mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2]));
+
+ sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
+ work0 = _mm_add_epi16(sum_p3, pq[2]);
+
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
sum_p = _mm_sub_epi16(sum_p, q[3]);
sum_q = _mm_sub_epi16(sum_q, p[3]);
- flat2_p[3] = _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2]))));
- flat2_q[3] = _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4]))));
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
sum_p = _mm_sub_epi16(sum_p, q[2]);
sum_q = _mm_sub_epi16(sum_q, p[2]);
- flat2_p[4] = _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3]))));
- flat2_q[4] = _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5]))));
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
sum_p = _mm_sub_epi16(sum_p, q[1]);
sum_q = _mm_sub_epi16(sum_q, p[1]);
- flat2_p[5] = _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4]))));
- flat2_q[5] = _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6]))));
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
}
@@ -469,11 +503,10 @@
get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
&t80);
__m128i mask;
- highbd_filter_mask(p, q, &limit, &blimit, &mask);
+ highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
__m128i flat, flat2;
- const __m128i one = _mm_set1_epi16(1);
- highbd_flat_mask4(&one, p, q, &flat, bd);
- highbd_flat_mask4_13(&one, p, q, &flat2, bd);
+ highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
+
flat = _mm_and_si128(flat, mask);
flat2 = _mm_and_si128(flat2, flat);
__m128i ps[2], qs[2];
@@ -643,69 +676,35 @@
__m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
__m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
const uint8_t *_limit, const uint8_t *_thresh, int bd) {
- const __m128i zero = _mm_setzero_si128();
__m128i blimit, limit, thresh;
__m128i mask, hev, flat;
- __m128i q2p2, q1p1, q0p0;
- __m128i p1p0, q1q0, ps1ps0, qs1qs0;
+ __m128i pq[3];
+ __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
__m128i flat_p1p0, flat_q0q1;
- q2p2 = _mm_unpacklo_epi64(*p2, *q2);
- q1p1 = _mm_unpacklo_epi64(*p1, *q1);
- q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+ pq[2] = _mm_unpacklo_epi64(*p2, *q2);
- p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
- q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
- __m128i abs_p1q1, abs_p0q0, abs_p1p0, work;
-
+ const __m128i zero = _mm_setzero_si128();
const __m128i four = _mm_set1_epi16(4);
__m128i t80;
const __m128i one = _mm_set1_epi16(0x1);
- const __m128i ffff = _mm_cmpeq_epi16(one, one);
get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
- // filter_mask and hev_mask
- highbd_hev_mask(&q0p0, &q1p1, &thresh, &abs_p1p0, &hev);
-
- abs_p0q0 = abs_diff16(p1p0, q1q0);
- abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
- abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
-
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
- // So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
- mask = _mm_max_epi16(abs_p1p0, mask);
- // mask |= (abs(*p1 - *p0) > limit) * -1;
- // mask |= (abs(*q1 - *q0) > limit) * -1;
-
- work = abs_diff16(q2p2, q1p1);
-
- mask = _mm_max_epi16(work, mask);
- mask = _mm_max_epi16(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu16(mask, limit);
- mask = _mm_cmpeq_epi16(mask, zero);
+ highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
// flat_mask
- flat = _mm_max_epi16(abs_diff16(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
- if (bd == 8)
- flat = _mm_subs_epu16(flat, one);
- else if (bd == 10)
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
- else // bd == 12
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
flat = _mm_cmpeq_epi16(flat, zero);
- flat = _mm_and_si128(
- flat, mask); // flat & mask
- // replicate for the further "merged variables" usage
+ flat = _mm_and_si128(flat, mask);
+  // replicate for use with the merged variables below
flat = _mm_unpacklo_epi64(flat, flat);
{
@@ -802,12 +801,7 @@
flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
flat = _mm_max_epi16(flat, work);
- if (bd == 8)
- flat = _mm_subs_epu16(flat, one);
- else if (bd == 10)
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
- else // bd == 12
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
@@ -930,67 +924,36 @@
const __m128i zero = _mm_setzero_si128();
__m128i blimit, limit, thresh;
__m128i mask, hev, flat;
- __m128i q2p2, q1p1, q0p0, q3p3;
+ __m128i pq[4];
__m128i p1p0, q1q0, ps1ps0, qs1qs0;
__m128i work_a, op2, oq2, flat_p1p0, flat_q0q1;
- q3p3 = _mm_unpacklo_epi64(*p3, *q3);
- q2p2 = _mm_unpacklo_epi64(*p2, *q2);
- q1p1 = _mm_unpacklo_epi64(*p1, *q1);
- q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+ pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+ pq[3] = _mm_unpacklo_epi64(*p3, *q3);
- p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
- q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
- __m128i abs_p1q1, abs_p0q0, abs_p1p0, work;
+ __m128i abs_p1p0;
const __m128i four = _mm_set1_epi16(4);
__m128i t80;
const __m128i one = _mm_set1_epi16(0x1);
- const __m128i ffff = _mm_cmpeq_epi16(one, one);
get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
- // filter_mask and hev_mask
- highbd_hev_mask(&q0p0, &q1p1, &thresh, &abs_p1p0, &hev);
-
- abs_p0q0 = abs_diff16(p1p0, q1q0);
- abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
- abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
-
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1;
- // So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
- mask = _mm_max_epi16(abs_p1p0, mask);
- // mask |= (abs(*p1 - *p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = _mm_max_epi16(abs_diff16(q2p2, q1p1), abs_diff16(q3p3, q2p2));
- mask = _mm_max_epi16(work, mask);
- mask = _mm_max_epi16(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu16(mask, limit);
- mask = _mm_cmpeq_epi16(mask, zero);
+ highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
// flat_mask4
- flat = _mm_max_epi16(abs_diff16(q2p2, q0p0), abs_diff16(q3p3, q0p0));
+ flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
flat = _mm_max_epi16(abs_p1p0, flat);
flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
- if (bd == 8)
- flat = _mm_subs_epu16(flat, one);
- else if (bd == 10)
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
- else // bd == 12
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
flat = _mm_cmpeq_epi16(flat, zero);
- flat = _mm_and_si128(
- flat, mask); // flat & mask
- // replicate for the further "merged variables" usage
+ flat = _mm_and_si128(flat, mask);
+  // replicate for use with the merged variables below
flat = _mm_unpacklo_epi64(flat, flat);
{
@@ -1100,12 +1063,7 @@
work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
flat = _mm_max_epi16(work0, flat);
- if (bd == 8)
- flat = _mm_subs_epu16(flat, one);
- else if (bd == 10)
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
- else // bd == 12
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
@@ -1240,47 +1198,20 @@
__m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
const uint8_t *_thresh, int bd) {
__m128i blimit, limit, thresh;
- __m128i mask, hev, flat;
+ __m128i mask, hev;
__m128i p1p0, q1q0;
+ __m128i pq[2];
- const __m128i zero = _mm_setzero_si128();
-
- __m128i abs_p0q0, abs_p1q1, abs_p1p0, abs_q1q0;
-
- const __m128i ffff = _mm_cmpeq_epi16(zero, zero);
- const __m128i one = _mm_set1_epi16(1);
+ __m128i abs_p1p0;
__m128i t80;
get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
- p1p0 = _mm_unpacklo_epi64(*p0, *p1);
- q1q0 = _mm_unpacklo_epi64(*q0, *q1);
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
- abs_p1p0 = abs_diff16(*p1, *p0);
- abs_q1q0 = abs_diff16(*q1, *q0);
-
- abs_p0q0 = abs_diff16(p1p0, q1q0);
- abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
-
- // filter_mask and hev_mask
- flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu16(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
-
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
- // So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
- mask = _mm_max_epi16(flat, mask);
-
- mask = _mm_subs_epu16(mask, limit);
- mask = _mm_cmpeq_epi16(mask, zero);
-
- mask = _mm_unpacklo_epi64(mask, mask);
- hev = _mm_unpacklo_epi64(hev, hev);
+ highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
}
@@ -1611,13 +1542,15 @@
&d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
_mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
- _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
- _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
- _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
-
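+  // store both 8-pixel halves of each row back to back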
_mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
_mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
_mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
_mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
}