Read only required pixels in loop-filter intrinsics
Loop-filter intrinsics are modified to read only
the required pixels. Reading extra pixels causes
a data race when the loop filter is multi-threaded.
Change-Id: I02935e6a87cec8ef2a347d66f72cb8169d04a67a
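
Background for the hunks below: each filter call works on exactly four
pixels per row, so the loads are narrowed to match. A wider load (e.g.
_mm_loadu_si128, 16 bytes) pulls in pixels that the thread filtering the
neighbouring edge may be writing at the same time, which is a data race
even though the extra lanes never influence the result. A minimal sketch
of the two narrow loads, with illustrative helper names (the patch casts
to int * directly; memcpy is shown here only to sidestep strict
aliasing):

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <string.h>

/* 8-bit path: four pixels == 4 bytes, so a 32-bit scalar load is
 * exact where _mm_loadl_epi64 would read 8 bytes. */
static __m128i load4_px8(const uint8_t *s) {
  int v;
  memcpy(&v, s, sizeof(v));
  return _mm_cvtsi32_si128(v);  /* pixels in byte lanes 0..3 */
}

/* High-bit-depth path: four 16-bit pixels == 8 bytes, so the 64-bit
 * load is exact where _mm_loadu_si128 would read 16 bytes. */
static __m128i load4_px16(const uint16_t *s) {
  return _mm_loadl_epi64((const __m128i *)s);
}
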
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
index 739840d..95d8a7d 100644
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -454,7 +454,11 @@
const uint8_t *thr, int bd) {
__m128i p[7], q[7], pq[7];
int i;
- load_highbd_pixel(s, 7, pitch, p, q);
+
+ for (i = 0; i < 7; i++) {
+ p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
+ q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
+ }
highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
@@ -1355,10 +1359,10 @@
const uint8_t *_blimit,
const uint8_t *_limit,
const uint8_t *_thresh, int bd) {
- __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, _blimit, _limit, _thresh, bd);
@@ -1395,10 +1399,10 @@
x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
- x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
- x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
- x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
- x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
+ x4 = _mm_setzero_si128();
+ x5 = _mm_setzero_si128();
+ x6 = _mm_setzero_si128();
+ x7 = _mm_setzero_si128();
highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
&d2, &d3, &d4, &d5, &d6, &d7);
@@ -1461,8 +1465,8 @@
p1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
p0 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
q0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
- q1 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
- q2 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
+ q1 = _mm_setzero_si128();
+ q2 = _mm_setzero_si128();
highbd_transpose6x6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &d0, &d1, &d2, &d3,
&d4, &d5);
@@ -1497,10 +1501,10 @@
p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
- q0 = _mm_loadu_si128((__m128i *)((s - 4) + 4 * p));
- q1 = _mm_loadu_si128((__m128i *)((s - 4) + 5 * p));
- q2 = _mm_loadu_si128((__m128i *)((s - 4) + 6 * p));
- q3 = _mm_loadu_si128((__m128i *)((s - 4) + 7 * p));
+ q0 = _mm_setzero_si128();
+ q1 = _mm_setzero_si128();
+ q2 = _mm_setzero_si128();
+ q3 = _mm_setzero_si128();
highbd_transpose8x8_sse2(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3, &d0, &d1,
&d2, &d3, &d4, &d5, &d6, &d7);
@@ -1571,10 +1575,10 @@
p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
- p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
- p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
- p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
- q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+ p2 = _mm_setzero_si128();
+ p1 = _mm_setzero_si128();
+ p0 = _mm_setzero_si128();
+ q0 = _mm_setzero_si128();
highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
&p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
@@ -1583,10 +1587,10 @@
p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
- p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
- p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
- p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
- q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+ p2_2 = _mm_setzero_si128();
+ p1_2 = _mm_setzero_si128();
+ p0_2 = _mm_setzero_si128();
+ q0_2 = _mm_setzero_si128();
highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
&q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
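
The vertical-edge hunks above follow one pattern, and it recurs in the
8-bit file below: a call filters only four pixels along the edge, so
after the transpose only lanes 0..3 of each column register are
consumed, and input rows beyond the fourth can be zero registers
instead of loads from rows owned by the next filter job. A hedged
sketch of the idea for the 8-bit 8x8 case (the helper name is
illustrative, not from the patch):

#include <emmintrin.h>
#include <stdint.h>

/* Rows 4..7 of the block would only populate output lanes that the
 * 4-pixel-high edge filter never reads, so zeroing them avoids the
 * loads entirely. */
static void load_rows_for_4high_edge(const uint8_t *s, int p,
                                     __m128i row[8]) {
  for (int i = 0; i < 4; i++)
    row[i] = _mm_loadl_epi64((const __m128i *)(s + i * p));
  for (int i = 4; i < 8; i++)
    row[i] = _mm_setzero_si128();  /* lanes 4..7 are never consumed */
}
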
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index 87a2d57..3450dca 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -151,10 +151,10 @@
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
__m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
__m128i mask, hev;
- q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
- _mm_loadl_epi64((__m128i *)(s + 1 * p)));
- q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
- _mm_loadl_epi64((__m128i *)(s + 0 * p)));
+ q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
+ q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 0 * p)));
p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
FILTER_HEV_MASK4;
@@ -188,14 +188,8 @@
x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
_mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
- // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
-
- // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
- x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
-
+ x2 = _mm_setzero_si128();
+ x3 = _mm_setzero_si128();
// Transpose 8x8
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
p1p0 = _mm_unpacklo_epi16(q1q0, x1);
@@ -575,23 +569,23 @@
const unsigned char *_thresh) {
__m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
- q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
- _mm_loadl_epi64((__m128i *)(s + 4 * p)));
- q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
- _mm_loadl_epi64((__m128i *)(s + 3 * p)));
- q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
- _mm_loadl_epi64((__m128i *)(s + 2 * p)));
- q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
- _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+ q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 4 * p)));
+ q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 3 * p)));
+ q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 2 * p)));
+ q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
- q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
- _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+ q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+ _mm_cvtsi32_si128(*(int *)(s - 0 * p)));
- q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
- _mm_loadl_epi64((__m128i *)(s + 5 * p)));
+ q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 5 * p)));
- q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
- _mm_loadl_epi64((__m128i *)(s + 6 * p)));
+ q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 6 * p)));
lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, _blimit,
_limit, _thresh);
@@ -776,12 +770,12 @@
__m128i p2, p1, p0, q0, q1, q2;
__m128i p1p0, q1q0;
- p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+ p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+ p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+ p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+ q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+ q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+ q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, _blimit,
_limit, _thresh);
@@ -953,14 +947,14 @@
const unsigned char *_thresh) {
__m128i p2_8, p1_8, p0_8, q0_8, q1_8, q2_8, p3_8, q3_8;
__m128i q1q0, p1p0, p2, q2;
- p3_8 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- p2_8 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- p1_8 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- p0_8 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0_8 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
- q1_8 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- q2_8 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- q3_8 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+ p3_8 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
+ p2_8 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+ p1_8 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+ p0_8 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+ q0_8 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+ q1_8 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+ q2_8 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+ q3_8 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
lpf_internal_8_sse2(&p3_8, &q3_8, &p2_8, &q2_8, &p1_8, &q1_8, &p0_8, &q0_8,
&q1q0, &p1p0, &p2, &q2, _blimit, _limit, _thresh);
@@ -1645,8 +1639,8 @@
p1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
p0 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
q0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
- q1 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
- q2 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
+ q1 = _mm_setzero_si128();
+ q2 = _mm_setzero_si128();
transpose6x6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &d0d1, &d2d3, &d4d5);
@@ -1684,10 +1678,10 @@
p2_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
p1_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
p0_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
- q0_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 4 * p));
- q1_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 5 * p));
- q2_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 6 * p));
- q3_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 7 * p));
+ q0_8 = _mm_setzero_si128();
+ q1_8 = _mm_setzero_si128();
+ q2_8 = _mm_setzero_si128();
+ q3_8 = _mm_setzero_si128();
transpose8x8_sse2(&p3_8, &p2_8, &p1_8, &p0_8, &q0_8, &q1_8, &q2_8, &q3_8,
&d0d1, &d2d3, &d4d5, &d6d7);
@@ -1747,10 +1741,10 @@
p5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p));
p4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p));
p3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p));
- p2 = _mm_loadl_epi64((__m128i *)((s - 8) + 4 * p));
- p1 = _mm_loadl_epi64((__m128i *)((s - 8) + 5 * p));
- p0 = _mm_loadl_epi64((__m128i *)((s - 8) + 6 * p));
- q0 = _mm_loadl_epi64((__m128i *)((s - 8) + 7 * p));
+ p2 = _mm_setzero_si128();
+ p1 = _mm_setzero_si128();
+ p0 = _mm_setzero_si128();
+ q0 = _mm_setzero_si128();
transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0d1, &d2d3, &d4d5,
&d6d7);
@@ -1759,10 +1753,10 @@
p5_2 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
p4_2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
p3_2 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
- p2_2 = _mm_loadl_epi64((__m128i *)(s + 4 * p));
- p1_2 = _mm_loadl_epi64((__m128i *)(s + 5 * p));
- p0_2 = _mm_loadl_epi64((__m128i *)(s + 6 * p));
- q0_2 = _mm_loadl_epi64((__m128i *)(s + 7 * p));
+ p2_2 = _mm_setzero_si128();
+ p1_2 = _mm_setzero_si128();
+ p0_2 = _mm_setzero_si128();
+ q0_2 = _mm_setzero_si128();
transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2, &q0_2,
&d0d1_2, &d2d3_2, &d4d5_2, &d6d7_2);