Read only required pixels in loop-filter intrinsics

The loop-filter SSE2 intrinsics are modified to read only the pixels
they actually filter: horizontal filters use narrower loads
(_mm_cvtsi32_si128 for 8-bit, _mm_loadl_epi64 for high bit-depth), and
vertical filters pass _mm_setzero_si128() for transpose inputs whose
outputs are never consumed. Reading extra pixels causes a data race
when the loop filter is multi-threaded.
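
Below is a minimal sketch (not part of the patch) of the narrow-load
idea as used on the 8-bit path; load4_u8 and load8_u8 are hypothetical
helper names:

  #include <emmintrin.h>
  #include <stdint.h>
  #include <string.h>

  /* New pattern: load exactly the four 8-bit pixels of one row into
     the low 32 bits of an XMM register. The memcpy keeps the 4-byte
     access well-defined for unaligned pointers; the patch itself
     dereferences the row as *(int *). */
  static __m128i load4_u8(const uint8_t *row) {
    int32_t v;
    memcpy(&v, row, sizeof(v));
    return _mm_cvtsi32_si128(v);
  }

  /* Old pattern: an 8-byte load that reads four pixels past the
     filtered block, pixels another loop-filter thread may be
     writing concurrently. */
  static __m128i load8_u8(const uint8_t *row) {
    return _mm_loadl_epi64((const __m128i *)row);
  }

The vertical paths apply the same idea through the transpose: rows the
filter never consumes are fed _mm_setzero_si128() instead of being
loaded from memory.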

Change-Id: I02935e6a87cec8ef2a347d66f72cb8169d04a67a
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
index 739840d..95d8a7d 100644
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -454,7 +454,11 @@
                                        const uint8_t *thr, int bd) {
   __m128i p[7], q[7], pq[7];
   int i;
-  load_highbd_pixel(s, 7, pitch, p, q);
+
+  for (i = 0; i < 7; i++) {
+    p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
+    q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
+  }
 
   highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
 
@@ -1355,10 +1359,10 @@
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh, int bd) {
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
 
   highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, _blimit, _limit, _thresh, bd);
 
@@ -1395,10 +1399,10 @@
   x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
   x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
   x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
-  x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
-  x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
-  x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
-  x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
+  x4 = _mm_setzero_si128();
+  x5 = _mm_setzero_si128();
+  x6 = _mm_setzero_si128();
+  x7 = _mm_setzero_si128();
 
   highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
                            &d2, &d3, &d4, &d5, &d6, &d7);
@@ -1461,8 +1465,8 @@
   p1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
   p0 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
   q0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
-  q1 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
-  q2 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
+  q1 = _mm_setzero_si128();
+  q2 = _mm_setzero_si128();
 
   highbd_transpose6x6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &d0, &d1, &d2, &d3,
                            &d4, &d5);
@@ -1497,10 +1501,10 @@
   p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
   p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
   p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
-  q0 = _mm_loadu_si128((__m128i *)((s - 4) + 4 * p));
-  q1 = _mm_loadu_si128((__m128i *)((s - 4) + 5 * p));
-  q2 = _mm_loadu_si128((__m128i *)((s - 4) + 6 * p));
-  q3 = _mm_loadu_si128((__m128i *)((s - 4) + 7 * p));
+  q0 = _mm_setzero_si128();
+  q1 = _mm_setzero_si128();
+  q2 = _mm_setzero_si128();
+  q3 = _mm_setzero_si128();
 
   highbd_transpose8x8_sse2(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3, &d0, &d1,
                            &d2, &d3, &d4, &d5, &d6, &d7);
@@ -1571,10 +1575,10 @@
   p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
   p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
   p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
-  p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
-  p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
-  p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
-  q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+  p2 = _mm_setzero_si128();
+  p1 = _mm_setzero_si128();
+  p0 = _mm_setzero_si128();
+  q0 = _mm_setzero_si128();
 
   highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
                            &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
@@ -1583,10 +1587,10 @@
   p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
   p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
   p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
-  p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
-  p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
-  p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
-  q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+  p2_2 = _mm_setzero_si128();
+  p1_2 = _mm_setzero_si128();
+  p0_2 = _mm_setzero_si128();
+  q0_2 = _mm_setzero_si128();
 
   highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
                            &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index 87a2d57..3450dca 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -151,10 +151,10 @@
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
   __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
   __m128i mask, hev;
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
+  q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 0 * p)));
   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
   q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
   FILTER_HEV_MASK4;
@@ -188,14 +188,8 @@
   x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
                          _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
 
-  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
-
-  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
-
+  x2 = _mm_setzero_si128();
+  x3 = _mm_setzero_si128();
   // Transpose 8x8
   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
   p1p0 = _mm_unpacklo_epi16(q1q0, x1);
@@ -575,23 +569,23 @@
                                 const unsigned char *_thresh) {
   __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
 
-  q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 4 * p)));
-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+  q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 4 * p)));
+  q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
 
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+  q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s - 0 * p)));
 
-  q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 5 * p)));
+  q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 5 * p)));
 
-  q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 6 * p)));
+  q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 6 * p)));
 
   lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, _blimit,
                        _limit, _thresh);
@@ -776,12 +770,12 @@
   __m128i p2, p1, p0, q0, q1, q2;
   __m128i p1p0, q1q0;
 
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
 
   lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, _blimit,
                       _limit, _thresh);
@@ -953,14 +947,14 @@
                                const unsigned char *_thresh) {
   __m128i p2_8, p1_8, p0_8, q0_8, q1_8, q2_8, p3_8, q3_8;
   __m128i q1q0, p1p0, p2, q2;
-  p3_8 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  p2_8 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1_8 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0_8 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0_8 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1_8 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2_8 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  q3_8 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+  p3_8 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
+  p2_8 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+  p1_8 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0_8 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0_8 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+  q1_8 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+  q2_8 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+  q3_8 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
 
   lpf_internal_8_sse2(&p3_8, &q3_8, &p2_8, &q2_8, &p1_8, &q1_8, &p0_8, &q0_8,
                       &q1q0, &p1p0, &p2, &q2, _blimit, _limit, _thresh);
@@ -1645,8 +1639,8 @@
   p1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
   p0 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
   q0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
-  q1 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
-  q2 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
+  q1 = _mm_setzero_si128();
+  q2 = _mm_setzero_si128();
 
   transpose6x6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &d0d1, &d2d3, &d4d5);
 
@@ -1684,10 +1678,10 @@
   p2_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
   p1_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
   p0_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
-  q0_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 4 * p));
-  q1_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 5 * p));
-  q2_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 6 * p));
-  q3_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 7 * p));
+  q0_8 = _mm_setzero_si128();
+  q1_8 = _mm_setzero_si128();
+  q2_8 = _mm_setzero_si128();
+  q3_8 = _mm_setzero_si128();
 
   transpose8x8_sse2(&p3_8, &p2_8, &p1_8, &p0_8, &q0_8, &q1_8, &q2_8, &q3_8,
                     &d0d1, &d2d3, &d4d5, &d6d7);
@@ -1747,10 +1741,10 @@
   p5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p));
   p4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p));
   p3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p));
-  p2 = _mm_loadl_epi64((__m128i *)((s - 8) + 4 * p));
-  p1 = _mm_loadl_epi64((__m128i *)((s - 8) + 5 * p));
-  p0 = _mm_loadl_epi64((__m128i *)((s - 8) + 6 * p));
-  q0 = _mm_loadl_epi64((__m128i *)((s - 8) + 7 * p));
+  p2 = _mm_setzero_si128();
+  p1 = _mm_setzero_si128();
+  p0 = _mm_setzero_si128();
+  q0 = _mm_setzero_si128();
 
   transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0d1, &d2d3, &d4d5,
                     &d6d7);
@@ -1759,10 +1753,10 @@
   p5_2 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
   p4_2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
   p3_2 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-  p2_2 = _mm_loadl_epi64((__m128i *)(s + 4 * p));
-  p1_2 = _mm_loadl_epi64((__m128i *)(s + 5 * p));
-  p0_2 = _mm_loadl_epi64((__m128i *)(s + 6 * p));
-  q0_2 = _mm_loadl_epi64((__m128i *)(s + 7 * p));
+  p2_2 = _mm_setzero_si128();
+  p1_2 = _mm_setzero_si128();
+  p0_2 = _mm_setzero_si128();
+  q0_2 = _mm_setzero_si128();
 
   transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2, &q0_2,
                     &d0d1_2, &d2d3_2, &d4d5_2, &d6d7_2);