highbd loopfilter dual performance improvement

achieved by eliminating multiple load-store pairs, reducing the
transpose size, and using the appropriate functions

Gain details:
aom_highbd_lpf_vertical_4_dual_sse2 -1.3x
aom_highbd_lpf_vertical_8_dual_sse2 -1.4x
aom_highbd_lpf_horizontal_14_dual_sse2 -1.8x
aom_highbd_lpf_vertical_14_dual_sse2 -2.0x

Change-Id: I23a08129f6b8b9a9ee09e344ae0d7d89206ec216
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
index 41173d9..a6cf267 100644
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -445,15 +445,11 @@
   }
 }
 
-static INLINE void highbd_lpf_horz_edge_8_dual_sse2(uint16_t *s, int pitch,
-                                                    const uint8_t *blt,
-                                                    const uint8_t *lt,
-                                                    const uint8_t *thr,
-                                                    int bd) {
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
+    __m128i *p, __m128i *q, const uint8_t *blt, const uint8_t *lt,
+    const uint8_t *thr, int bd) {
   __m128i blimit, limit, thresh;
   get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
-  __m128i p[7], q[7];
-  load_highbd_pixel(s, 7, pitch, p, q);
   __m128i mask;
   highbd_filter_mask(p, q, &limit, &blimit, &mask);
   __m128i flat, flat2;
@@ -605,29 +601,22 @@
     q[i] = _mm_andnot_si128(flat2, q[i]);
     flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
     q[i] = _mm_or_si128(q[i], flat2_q[i]);
-    _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
-    _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
   }
 }
 
-void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int p,
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch,
                                             const uint8_t *_blimit,
                                             const uint8_t *_limit,
                                             const uint8_t *_thresh, int bd) {
-  aom_highbd_lpf_horizontal_14_sse2(s, p, _blimit, _limit, _thresh, bd);
-  aom_highbd_lpf_horizontal_14_sse2(s + 4, p, _blimit, _limit, _thresh, bd);
-}
+  __m128i p[7], q[7];
+  int i;
+  load_highbd_pixel(s, 7, pitch, p, q);
 
-static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1,
-                                      const __m128i *p0, const __m128i *q0,
-                                      const __m128i *q1, const __m128i *q2,
-                                      int p, uint16_t *s) {
-  _mm_storel_epi64((__m128i *)(s - 3 * p), *p2);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), *p1);
-  _mm_storel_epi64((__m128i *)(s - 1 * p), *p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), *q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), *q1);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), *q2);
+  highbd_lpf_internal_14_dual_sse2(p, q, _blimit, _limit, _thresh, bd);
+  for (i = 0; i < 6; i++) {
+    _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+    _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
+  }
 }
 
 static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
@@ -1197,23 +1186,63 @@
     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
+  __m128i d0[2], d1[2], d2[2], d3[2], d4[2], d5[2], d6[2], d7[2];
 
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  x0[0] = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+  x1[0] = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+  x2[0] = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+  x3[0] = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+  x4[0] = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+  x5[0] = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+  x6[0] = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+  x7[0] = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+                           d5, d6, d7);
 
-  // Transpose back
-  highbd_transpose8x8(src, 16, dst, p, 2);
+  x0[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 0 * p));
+  x1[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 1 * p));
+  x2[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 2 * p));
+  x3[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 3 * p));
+  x4[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 4 * p));
+  x5[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 5 * p));
+  x6[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 6 * p));
+  x7[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 7 * p));
+
+  // highbd_transpose8x16_sse2(x0,x1,x2,x3,x4,x5,x6,x7,
+  // d0,d1,d2,d3,d4,d5,d6,d7);
+  highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+                           x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+                           d4 + 1, d5 + 1, d6 + 1, d7 + 1);
+
+  // shift by 4
+  __m128i d2_4, d3_4, d4_4, d5_4;
+  d2_4 = _mm_srli_si128(d2[0], 8);
+  d3_4 = _mm_srli_si128(d3[0], 8);
+  d4_4 = _mm_srli_si128(d4[0], 8);
+  d5_4 = _mm_srli_si128(d5[0], 8);
+
+  highbd_lpf_internal_4_sse2(d2, d3, d4, d5, blimit0, limit0, thresh0, bd);
+  highbd_lpf_internal_4_sse2(&d2_4, &d3_4, &d4_4, &d5_4, blimit1, limit1,
+                             thresh1, bd);
+
+  d2[0] = _mm_unpacklo_epi64(d2[0], d2_4);
+  d3[0] = _mm_unpacklo_epi64(d3[0], d3_4);
+  d4[0] = _mm_unpacklo_epi64(d4[0], d4_4);
+  d5[0] = _mm_unpacklo_epi64(d5[0], d5_4);
+
+  highbd_transpose8x8_sse2(d0, d1, d2, d3, d4, d5, d6, d7, x0, x1, x2, x3, x4,
+                           x5, x6, x7);
+
+  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7[0]);
 }
 
 void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
@@ -1291,24 +1320,72 @@
     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
+  __m128i d0[2], d1[2], d2[2], d3[2], d4[2], d5[2], d6[2], d7[2];
 
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  x0[0] = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+  x1[0] = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+  x2[0] = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+  x3[0] = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+  x4[0] = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+  x5[0] = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+  x6[0] = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+  x7[0] = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
+  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+                           d5, d6, d7);
 
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  x0[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 0 * p));
+  x1[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 1 * p));
+  x2[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 2 * p));
+  x3[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 3 * p));
+  x4[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 4 * p));
+  x5[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 5 * p));
+  x6[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 6 * p));
+  x7[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 7 * p));
 
-  // Transpose back
-  highbd_transpose8x8(src, 16, dst, p, 2);
+  highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+                           x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+                           d4 + 1, d5 + 1, d6 + 1, d7 + 1);
+
+  __m128i d0_4, d1_4, d2_4, d3_4, d4_4, d5_4, d6_4, d7_4;
+  __m128i q1q0[2], p1p0[2], p1p0_lo, p1p0_hi, q1q0_lo, q1q0_hi;
+  d0_4 = _mm_srli_si128(d0[0], 8);
+  d1_4 = _mm_srli_si128(d1[0], 8);
+  d2_4 = _mm_srli_si128(d2[0], 8);
+  d3_4 = _mm_srli_si128(d3[0], 8);
+  d4_4 = _mm_srli_si128(d4[0], 8);
+  d5_4 = _mm_srli_si128(d5[0], 8);
+  d6_4 = _mm_srli_si128(d6[0], 8);
+  d7_4 = _mm_srli_si128(d7[0], 8);
+
+  highbd_lpf_internal_8_sse2(d0, d7, d1, d6, d2, d5, d3, d4, q1q0, p1p0,
+                             blimit0, limit0, thresh0, bd);
+  highbd_lpf_internal_8_sse2(&d0_4, &d7_4, &d1_4, &d6_4, &d2_4, &d5_4, &d3_4,
+                             &d4_4, q1q0 + 1, p1p0 + 1, blimit1, limit1,
+                             thresh1, bd);
+
+  d0[0] = _mm_unpacklo_epi64(d0[0], d0_4);
+  d1[0] = _mm_unpacklo_epi64(d1[0], d1_4);
+  d6[0] = _mm_unpacklo_epi64(d6[0], d6_4);
+  d7[0] = _mm_unpacklo_epi64(d7[0], d7_4);
+
+  p1p0_lo = _mm_unpacklo_epi64(p1p0[0], p1p0[1]);
+  p1p0_hi = _mm_unpackhi_epi64(p1p0[0], p1p0[1]);
+  q1q0_lo = _mm_unpacklo_epi64(q1q0[0], q1q0[1]);
+  q1q0_hi = _mm_unpackhi_epi64(q1q0[0], q1q0[1]);
+
+  highbd_transpose8x8_sse2(d0, d1, &p1p0_hi, &p1p0_lo, &q1q0_lo, &q1q0_hi, d6,
+                           d7, x0, x1, x2, x3, x4, x5, x6, x7);
+
+  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6[0]);
+  _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7[0]);
 }
 
 void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
@@ -1372,18 +1449,66 @@
   _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
 }
 
-void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int p,
+void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
                                           const uint8_t *thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
+  __m128i q[7], p[7];
+  __m128i p6, p5, p4, p3, p2, p1, p0, q0;
+  __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
+  __m128i d0, d7;
+  __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
 
-  //  Transpose 16x16
-  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
-  highbd_lpf_horz_edge_8_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh,
-                                   bd);
-  //  Transpose back
-  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+
+  highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
+                           &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
+
+  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+  p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+  p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+  p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+  q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+
+  highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
+                           &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
+                           &q[6], &d7);
+
+  highbd_lpf_internal_14_dual_sse2(p, q, blimit, limit, thresh, bd);
+
+  highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
+                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+                           &d6_out, &d7_out);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
+
+  highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
+                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+                           &d6_out, &d7_out);
+
+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
+  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
+  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
+  _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
+  _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
+  _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
+  _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
 }
diff --git a/aom_dsp/x86/lpf_common_sse2.h b/aom_dsp/x86/lpf_common_sse2.h
index 65ea531..e811959 100644
--- a/aom_dsp/x86/lpf_common_sse2.h
+++ b/aom_dsp/x86/lpf_common_sse2.h
@@ -123,6 +123,8 @@
   *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
 }
 
+// here in and out pointers (x and d) should be different! we don't store their
+// values inside
 static INLINE void highbd_transpose8x8_sse2(
     __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
     __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
@@ -132,116 +134,18 @@
   highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
 }
 
-static INLINE void highbd_transpose8x8(uint16_t *src[], int in_p,
-                                       uint16_t *dst[], int out_p,
-                                       int num_8x8_to_transpose) {
-  int idx8x8 = 0;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
-  do {
-    uint16_t *in = src[idx8x8];
-    uint16_t *out = dst[idx8x8];
-
-    p0 =
-        _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
-    p1 =
-        _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
-    p2 =
-        _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
-    p3 =
-        _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
-    p4 =
-        _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
-    p5 =
-        _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
-    p6 =
-        _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
-    p7 =
-        _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
-                                                      // 00 10 01 11 02 12 03 13
-    x0 = _mm_unpacklo_epi16(p0, p1);
-    // 20 30 21 31 22 32 23 33
-    x1 = _mm_unpacklo_epi16(p2, p3);
-    // 40 50 41 51 42 52 43 53
-    x2 = _mm_unpacklo_epi16(p4, p5);
-    // 60 70 61 71 62 72 63 73
-    x3 = _mm_unpacklo_epi16(p6, p7);
-    // 00 10 20 30 01 11 21 31
-    x4 = _mm_unpacklo_epi32(x0, x1);
-    // 40 50 60 70 41 51 61 71
-    x5 = _mm_unpacklo_epi32(x2, x3);
-    // 00 10 20 30 40 50 60 70
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 01 11 21 31 41 51 61 71
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
-    // 00 10 20 30 40 50 60 70
-    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
-    // 01 11 21 31 41 51 61 71
-
-    // 02 12 22 32 03 13 23 33
-    x4 = _mm_unpackhi_epi32(x0, x1);
-    // 42 52 62 72 43 53 63 73
-    x5 = _mm_unpackhi_epi32(x2, x3);
-    // 02 12 22 32 42 52 62 72
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
-    // 02 12 22 32 42 52 62 72
-    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
-    // 03 13 23 33 43 53 63 73
-
-    // 04 14 05 15 06 16 07 17
-    x0 = _mm_unpackhi_epi16(p0, p1);
-    // 24 34 25 35 26 36 27 37
-    x1 = _mm_unpackhi_epi16(p2, p3);
-    // 44 54 45 55 46 56 47 57
-    x2 = _mm_unpackhi_epi16(p4, p5);
-    // 64 74 65 75 66 76 67 77
-    x3 = _mm_unpackhi_epi16(p6, p7);
-    // 04 14 24 34 05 15 25 35
-    x4 = _mm_unpacklo_epi32(x0, x1);
-    // 44 54 64 74 45 55 65 75
-    x5 = _mm_unpacklo_epi32(x2, x3);
-    // 04 14 24 34 44 54 64 74
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 05 15 25 35 45 55 65 75
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
-    // 04 14 24 34 44 54 64 74
-    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
-    // 05 15 25 35 45 55 65 75
-
-    // 06 16 26 36 07 17 27 37
-    x4 = _mm_unpackhi_epi32(x0, x1);
-    // 46 56 66 76 47 57 67 77
-    x5 = _mm_unpackhi_epi32(x2, x3);
-    // 06 16 26 36 46 56 66 76
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
-    // 06 16 26 36 46 56 66 76
-    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
-    // 07 17 27 37 47 57 67 77
-  } while (++idx8x8 < num_8x8_to_transpose);
+// here in and out pointers (x and d arrays) should be different! we don't store
+// their values inside
+static INLINE void highbd_transpose8x16_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+    __m128i *d7) {
+  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+                           d5, d6, d7);
+  highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+                           x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+                           d4 + 1, d5 + 1, d6 + 1, d7 + 1);
 }
 
-static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
-                                        uint16_t *out, int out_p) {
-  uint16_t *src0[1];
-  uint16_t *src1[1];
-  uint16_t *dest0[1];
-  uint16_t *dest1[1];
-  src0[0] = in0;
-  src1[0] = in1;
-  dest0[0] = out;
-  dest1[0] = out + 8;
-  highbd_transpose8x8(src0, in_p, dest0, out_p, 1);
-  highbd_transpose8x8(src1, in_p, dest1, out_p, 1);
-}
 #endif  // _AOM_DSP_X86_LPF_COMMON_X86_H