Improve performance of highbd loopfilter dual functions

The speedups come from eliminating redundant load/store pairs,
shrinking the transposes, and calling the appropriate internal
functions directly; a sketch of the in-register transpose idea
follows the gain table.

Gain details (speedup over the previous SSE2 code):
aom_highbd_lpf_vertical_4_dual_sse2 - 1.3x
aom_highbd_lpf_vertical_8_dual_sse2 - 1.4x
aom_highbd_lpf_horizontal_14_dual_sse2 - 1.8x
aom_highbd_lpf_vertical_14_dual_sse2 - 2.0x
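
For illustration, the key change is keeping 8x8 blocks of 16-bit
pixels in XMM registers and transposing them with SSE2 unpack
intrinsics, instead of round-tripping rows through an intermediate
buffer in memory. Below is a minimal, self-contained sketch of such
an in-register transpose; the helper name transpose8x8_u16 is
illustrative only (the patch itself uses highbd_transpose8x8_sse2
from lpf_common_sse2.h):

#include <emmintrin.h>

// Transpose an 8x8 block of 16-bit pixels held in eight XMM registers.
// r[i] holds row i on input and column i on output.
static void transpose8x8_u16(__m128i r[8]) {
  // Stage 1: interleave 16-bit lanes of adjacent row pairs.
  const __m128i a0 = _mm_unpacklo_epi16(r[0], r[1]);  // 00 10 01 11 ...
  const __m128i a1 = _mm_unpacklo_epi16(r[2], r[3]);  // 20 30 21 31 ...
  const __m128i a2 = _mm_unpacklo_epi16(r[4], r[5]);
  const __m128i a3 = _mm_unpacklo_epi16(r[6], r[7]);
  const __m128i a4 = _mm_unpackhi_epi16(r[0], r[1]);  // 04 14 05 15 ...
  const __m128i a5 = _mm_unpackhi_epi16(r[2], r[3]);
  const __m128i a6 = _mm_unpackhi_epi16(r[4], r[5]);
  const __m128i a7 = _mm_unpackhi_epi16(r[6], r[7]);
  // Stage 2: interleave 32-bit pairs.
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);  // 00 10 20 30 01 11 21 31
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);  // 40 50 60 70 41 51 61 71
  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
  const __m128i b4 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b5 = _mm_unpacklo_epi32(a6, a7);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
  // Stage 3: interleave 64-bit halves to produce the output columns.
  r[0] = _mm_unpacklo_epi64(b0, b1);  // 00 10 20 30 40 50 60 70
  r[1] = _mm_unpackhi_epi64(b0, b1);  // 01 11 21 31 41 51 61 71
  r[2] = _mm_unpacklo_epi64(b2, b3);
  r[3] = _mm_unpackhi_epi64(b2, b3);
  r[4] = _mm_unpacklo_epi64(b4, b5);
  r[5] = _mm_unpackhi_epi64(b4, b5);
  r[6] = _mm_unpacklo_epi64(b6, b7);
  r[7] = _mm_unpackhi_epi64(b6, b7);
}

Unlike the library helper, this sketch can run in place (outputs over
inputs) because every row is read into temporaries first; the patch's
highbd_transpose8x8_sse2 writes through separate output pointers,
which is why a no-aliasing comment is added in lpf_common_sse2.h.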
Change-Id: I23a08129f6b8b9a9ee09e344ae0d7d89206ec216
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
index 41173d9..a6cf267 100644
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -445,15 +445,11 @@
}
}
-static INLINE void highbd_lpf_horz_edge_8_dual_sse2(uint16_t *s, int pitch,
- const uint8_t *blt,
- const uint8_t *lt,
- const uint8_t *thr,
- int bd) {
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
+ __m128i *p, __m128i *q, const uint8_t *blt, const uint8_t *lt,
+ const uint8_t *thr, int bd) {
__m128i blimit, limit, thresh;
get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
- __m128i p[7], q[7];
- load_highbd_pixel(s, 7, pitch, p, q);
__m128i mask;
highbd_filter_mask(p, q, &limit, &blimit, &mask);
__m128i flat, flat2;
@@ -605,29 +601,22 @@
q[i] = _mm_andnot_si128(flat2, q[i]);
flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
q[i] = _mm_or_si128(q[i], flat2_q[i]);
- _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
- _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
}
}
-void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int p,
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch,
const uint8_t *_blimit,
const uint8_t *_limit,
const uint8_t *_thresh, int bd) {
- aom_highbd_lpf_horizontal_14_sse2(s, p, _blimit, _limit, _thresh, bd);
- aom_highbd_lpf_horizontal_14_sse2(s + 4, p, _blimit, _limit, _thresh, bd);
-}
+ __m128i p[7], q[7];
+ int i;
+ load_highbd_pixel(s, 7, pitch, p, q);
-static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1,
- const __m128i *p0, const __m128i *q0,
- const __m128i *q1, const __m128i *q2,
- int p, uint16_t *s) {
- _mm_storel_epi64((__m128i *)(s - 3 * p), *p2);
- _mm_storel_epi64((__m128i *)(s - 2 * p), *p1);
- _mm_storel_epi64((__m128i *)(s - 1 * p), *p0);
- _mm_storel_epi64((__m128i *)(s + 0 * p), *q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), *q1);
- _mm_storel_epi64((__m128i *)(s + 2 * p), *q2);
+ highbd_lpf_internal_14_dual_sse2(p, q, _blimit, _limit, _thresh, bd);
+ for (i = 0; i < 6; i++) {
+ _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+ _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
+ }
}
static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
@@ -1197,23 +1186,63 @@
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
- DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
- uint16_t *src[2];
- uint16_t *dst[2];
+ __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
+ __m128i d0[2], d1[2], d2[2], d3[2], d4[2], d5[2], d6[2], d7[2];
- // Transpose 8x16
- highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+ x0[0] = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+ x1[0] = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+ x2[0] = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+ x3[0] = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+ x4[0] = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+ x5[0] = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+ x6[0] = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+ x7[0] = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
- // Loop filtering
- aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
- thresh0, blimit1, limit1, thresh1, bd);
- src[0] = t_dst;
- src[1] = t_dst + 8;
- dst[0] = s - 4;
- dst[1] = s - 4 + p * 8;
+ highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+ d5, d6, d7);
- // Transpose back
- highbd_transpose8x8(src, 16, dst, p, 2);
+ x0[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 0 * p));
+ x1[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 1 * p));
+ x2[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 2 * p));
+ x3[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 3 * p));
+ x4[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 4 * p));
+ x5[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 5 * p));
+ x6[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 6 * p));
+ x7[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 7 * p));
+
+  // Together with the 8x8 transpose above, the call below is equivalent
+  // to a single highbd_transpose8x16_sse2() call.
+ highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+ x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+ d4 + 1, d5 + 1, d6 + 1, d7 + 1);
+
+  // Shift each row right by 4 pixels (8 bytes) so the second 4-column
+  // block lands in the low 64 bits for the second filter call.
+ __m128i d2_4, d3_4, d4_4, d5_4;
+ d2_4 = _mm_srli_si128(d2[0], 8);
+ d3_4 = _mm_srli_si128(d3[0], 8);
+ d4_4 = _mm_srli_si128(d4[0], 8);
+ d5_4 = _mm_srli_si128(d5[0], 8);
+
+ highbd_lpf_internal_4_sse2(d2, d3, d4, d5, blimit0, limit0, thresh0, bd);
+ highbd_lpf_internal_4_sse2(&d2_4, &d3_4, &d4_4, &d5_4, blimit1, limit1,
+ thresh1, bd);
+
+ d2[0] = _mm_unpacklo_epi64(d2[0], d2_4);
+ d3[0] = _mm_unpacklo_epi64(d3[0], d3_4);
+ d4[0] = _mm_unpacklo_epi64(d4[0], d4_4);
+ d5[0] = _mm_unpacklo_epi64(d5[0], d5_4);
+
+ highbd_transpose8x8_sse2(d0, d1, d2, d3, d4, d5, d6, d7, x0, x1, x2, x3, x4,
+ x5, x6, x7);
+
+ _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7[0]);
}
void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
@@ -1291,24 +1320,72 @@
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
- DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
- uint16_t *src[2];
- uint16_t *dst[2];
+ __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
+ __m128i d0[2], d1[2], d2[2], d3[2], d4[2], d5[2], d6[2], d7[2];
- // Transpose 8x16
- highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+ x0[0] = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+ x1[0] = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+ x2[0] = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+ x3[0] = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+ x4[0] = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+ x5[0] = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+ x6[0] = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+ x7[0] = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
- // Loop filtering
- aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
- thresh0, blimit1, limit1, thresh1, bd);
- src[0] = t_dst;
- src[1] = t_dst + 8;
+ highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+ d5, d6, d7);
- dst[0] = s - 4;
- dst[1] = s - 4 + p * 8;
+ x0[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 0 * p));
+ x1[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 1 * p));
+ x2[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 2 * p));
+ x3[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 3 * p));
+ x4[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 4 * p));
+ x5[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 5 * p));
+ x6[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 6 * p));
+ x7[1] = _mm_loadu_si128((__m128i *)(s - 4 + p * 8 + 7 * p));
- // Transpose back
- highbd_transpose8x8(src, 16, dst, p, 2);
+ highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+ x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+ d4 + 1, d5 + 1, d6 + 1, d7 + 1);
+
+ __m128i d0_4, d1_4, d2_4, d3_4, d4_4, d5_4, d6_4, d7_4;
+ __m128i q1q0[2], p1p0[2], p1p0_lo, p1p0_hi, q1q0_lo, q1q0_hi;
+ d0_4 = _mm_srli_si128(d0[0], 8);
+ d1_4 = _mm_srli_si128(d1[0], 8);
+ d2_4 = _mm_srli_si128(d2[0], 8);
+ d3_4 = _mm_srli_si128(d3[0], 8);
+ d4_4 = _mm_srli_si128(d4[0], 8);
+ d5_4 = _mm_srli_si128(d5[0], 8);
+ d6_4 = _mm_srli_si128(d6[0], 8);
+ d7_4 = _mm_srli_si128(d7[0], 8);
+
+ highbd_lpf_internal_8_sse2(d0, d7, d1, d6, d2, d5, d3, d4, q1q0, p1p0,
+ blimit0, limit0, thresh0, bd);
+ highbd_lpf_internal_8_sse2(&d0_4, &d7_4, &d1_4, &d6_4, &d2_4, &d5_4, &d3_4,
+ &d4_4, q1q0 + 1, p1p0 + 1, blimit1, limit1,
+ thresh1, bd);
+
+ d0[0] = _mm_unpacklo_epi64(d0[0], d0_4);
+ d1[0] = _mm_unpacklo_epi64(d1[0], d1_4);
+ d6[0] = _mm_unpacklo_epi64(d6[0], d6_4);
+ d7[0] = _mm_unpacklo_epi64(d7[0], d7_4);
+
+ p1p0_lo = _mm_unpacklo_epi64(p1p0[0], p1p0[1]);
+ p1p0_hi = _mm_unpackhi_epi64(p1p0[0], p1p0[1]);
+ q1q0_lo = _mm_unpacklo_epi64(q1q0[0], q1q0[1]);
+ q1q0_hi = _mm_unpackhi_epi64(q1q0[0], q1q0[1]);
+
+ highbd_transpose8x8_sse2(d0, d1, &p1p0_hi, &p1p0_lo, &q1q0_lo, &q1q0_hi, d6,
+ d7, x0, x1, x2, x3, x4, x5, x6, x7);
+
+ _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6[0]);
+ _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7[0]);
}
void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
@@ -1372,18 +1449,66 @@
_mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
}
-void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int p,
+void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
- DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
+ __m128i q[7], p[7];
+ __m128i p6, p5, p4, p3, p2, p1, p0, q0;
+ __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
+ __m128i d0, d7;
+ __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
- // Transpose 16x16
- highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
- highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
- highbd_lpf_horz_edge_8_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh,
- bd);
- // Transpose back
- highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
- highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+ p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+
+ highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
+ &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
+
+ p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+ p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+ p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+ p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+ q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+
+ highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
+ &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
+ &q[6], &d7);
+
+ highbd_lpf_internal_14_dual_sse2(p, q, blimit, limit, thresh, bd);
+
+ highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
+ &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+ &d6_out, &d7_out);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
+
+ highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
+ &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+ &d6_out, &d7_out);
+
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
+ _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
}
diff --git a/aom_dsp/x86/lpf_common_sse2.h b/aom_dsp/x86/lpf_common_sse2.h
index 65ea531..e811959 100644
--- a/aom_dsp/x86/lpf_common_sse2.h
+++ b/aom_dsp/x86/lpf_common_sse2.h
@@ -123,6 +123,8 @@
*d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
}
+// Note: the input pointers (x) and output pointers (d) must not alias;
+// the input values are not preserved.
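+// For example, passing d0 == x0 would clobber x0 in the low-half pass
+// before the high-half pass reads it again.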
static INLINE void highbd_transpose8x8_sse2(
__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
__m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
@@ -132,116 +134,18 @@
highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
}
-static INLINE void highbd_transpose8x8(uint16_t *src[], int in_p,
- uint16_t *dst[], int out_p,
- int num_8x8_to_transpose) {
- int idx8x8 = 0;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
- do {
- uint16_t *in = src[idx8x8];
- uint16_t *out = dst[idx8x8];
-
- p0 =
- _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
- p1 =
- _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
- p2 =
- _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
- p3 =
- _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
- p4 =
- _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
- p5 =
- _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
- p6 =
- _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
- p7 =
- _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
- // 00 10 01 11 02 12 03 13
- x0 = _mm_unpacklo_epi16(p0, p1);
- // 20 30 21 31 22 32 23 33
- x1 = _mm_unpacklo_epi16(p2, p3);
- // 40 50 41 51 42 52 43 53
- x2 = _mm_unpacklo_epi16(p4, p5);
- // 60 70 61 71 62 72 63 73
- x3 = _mm_unpacklo_epi16(p6, p7);
- // 00 10 20 30 01 11 21 31
- x4 = _mm_unpacklo_epi32(x0, x1);
- // 40 50 60 70 41 51 61 71
- x5 = _mm_unpacklo_epi32(x2, x3);
- // 00 10 20 30 40 50 60 70
- x6 = _mm_unpacklo_epi64(x4, x5);
- // 01 11 21 31 41 51 61 71
- x7 = _mm_unpackhi_epi64(x4, x5);
-
- _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
- // 00 10 20 30 40 50 60 70
- _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
- // 01 11 21 31 41 51 61 71
-
- // 02 12 22 32 03 13 23 33
- x4 = _mm_unpackhi_epi32(x0, x1);
- // 42 52 62 72 43 53 63 73
- x5 = _mm_unpackhi_epi32(x2, x3);
- // 02 12 22 32 42 52 62 72
- x6 = _mm_unpacklo_epi64(x4, x5);
- // 03 13 23 33 43 53 63 73
- x7 = _mm_unpackhi_epi64(x4, x5);
-
- _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
- // 02 12 22 32 42 52 62 72
- _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
- // 03 13 23 33 43 53 63 73
-
- // 04 14 05 15 06 16 07 17
- x0 = _mm_unpackhi_epi16(p0, p1);
- // 24 34 25 35 26 36 27 37
- x1 = _mm_unpackhi_epi16(p2, p3);
- // 44 54 45 55 46 56 47 57
- x2 = _mm_unpackhi_epi16(p4, p5);
- // 64 74 65 75 66 76 67 77
- x3 = _mm_unpackhi_epi16(p6, p7);
- // 04 14 24 34 05 15 25 35
- x4 = _mm_unpacklo_epi32(x0, x1);
- // 44 54 64 74 45 55 65 75
- x5 = _mm_unpacklo_epi32(x2, x3);
- // 04 14 24 34 44 54 64 74
- x6 = _mm_unpacklo_epi64(x4, x5);
- // 05 15 25 35 45 55 65 75
- x7 = _mm_unpackhi_epi64(x4, x5);
-
- _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
- // 04 14 24 34 44 54 64 74
- _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
- // 05 15 25 35 45 55 65 75
-
- // 06 16 26 36 07 17 27 37
- x4 = _mm_unpackhi_epi32(x0, x1);
- // 46 56 66 76 47 57 67 77
- x5 = _mm_unpackhi_epi32(x2, x3);
- // 06 16 26 36 46 56 66 76
- x6 = _mm_unpacklo_epi64(x4, x5);
- // 07 17 27 37 47 57 67 77
- x7 = _mm_unpackhi_epi64(x4, x5);
-
- _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
- // 06 16 26 36 46 56 66 76
- _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
- // 07 17 27 37 47 57 67 77
- } while (++idx8x8 < num_8x8_to_transpose);
+// Note: the input (x) and output (d) arrays must not alias; the input
+// values are not preserved.
+static INLINE void highbd_transpose8x16_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+ d5, d6, d7);
+ highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+ x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+ d4 + 1, d5 + 1, d6 + 1, d7 + 1);
}
-static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
- uint16_t *out, int out_p) {
- uint16_t *src0[1];
- uint16_t *src1[1];
- uint16_t *dest0[1];
- uint16_t *dest1[1];
- src0[0] = in0;
- src1[0] = in1;
- dest0[0] = out;
- dest1[0] = out + 8;
- highbd_transpose8x8(src0, in_p, dest0, out_p, 1);
- highbd_transpose8x8(src1, in_p, dest1, out_p, 1);
-}
#endif // _AOM_DSP_X86_LPF_COMMON_X86_H