lpf sse2 4,8 single and dual improvements
dual 4 and 8 horizontal and vertical fn 2x performance
dual 4 and 8 tests added
single and dual 4,8 code quality improved
Change-Id: If3e09d6a07585cc8cf336a946cca8152f3998abb
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6a6de5a..637e1b5 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -382,11 +382,13 @@
specialize qw/aom_lpf_vertical_8 sse2/;
add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_8_dual sse2/;
add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_vertical_4 sse2/;
add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_4_dual sse2/;
add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_horizontal_14 sse2/;
@@ -401,11 +403,13 @@
specialize qw/aom_lpf_horizontal_8 sse2/;
add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_8_dual sse2/;
add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_horizontal_4 sse2/;
add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_4_dual sse2/;
add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_vertical_14 sse2/;
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index 23eeab6..b2a8e26 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -21,41 +21,267 @@
return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}
-// filter_mask and hev_mask
-#define FILTER_HEV_MASK4 \
- do { \
- /* (abs(q1 - q0), abs(p1 - p0) */ \
- __m128i flat = abs_diff(q1p1, q0p0); \
- /* abs(p1 - q1), abs(p0 - q0) */ \
- const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
- __m128i abs_p0q0, abs_p1q1; \
- \
- /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
- hev = \
- _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
- hev = _mm_cmpgt_epi16(hev, thresh); \
- hev = _mm_packs_epi16(hev, hev); \
- \
- /* const int8_t mask = filter_mask2(*limit, *blimit, */ \
- /* p1, p0, q0, q1); */ \
- abs_p0q0 = \
- _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \
- abs_p1q1 = \
- _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
- abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
- /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
- mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
- mask = _mm_unpacklo_epi64(mask, flat); \
- mask = _mm_subs_epu8(mask, limit); \
- mask = _mm_cmpeq_epi8(mask, zero); \
- mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
- } while (0)
+static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev,
- __m128i *mask, __m128i *qs1qs0,
- __m128i *ps1ps0) {
+ __m128i w0, w1;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ *d0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+
+ *d1 = _mm_srli_si128(*d0,
+ 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(*d0,
+ 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(*d0,
+ 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4,
+ __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1, ww0, ww1;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+
+ *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(ww0,
+ 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(ww0,
+ 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(ww0,
+ 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d5 = _mm_srli_si128(ww1,
+ 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d6 = _mm_srli_si128(ww1,
+ 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d7 = _mm_srli_si128(ww1,
+ 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose6x6_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *d0d1, __m128i *d2d3,
+ __m128i *d4d5) {
+ __m128i w0, w1, w2, w4, w5;
+ // x0 00 01 02 03 04 05 xx xx
+ // x1 10 11 12 13 14 15 xx xx
+
+ w0 = _mm_unpacklo_epi8(*x0, *x1);
+ // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx
+
+ // x2 20 21 22 23 24 25
+ // x3 30 31 32 33 34 35
+
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx
+
+ // x4 40 41 42 43 44 45
+ // x5 50 51 52 53 54 55
+
+ w2 = _mm_unpacklo_epi8(*x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w0); // 40 50 xx xx 41 51 xx xx 42 52 xx xx 43 53 xx xx
+
+ *d0d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 xx xx 01 11 21 31 41 51 xx xx
+
+ *d2d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 xx xx 03 13 23 33 43 53 xx xx
+
+ w4 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 xx xx xx xx xx xx xx xx
+ w5 = _mm_unpackhi_epi16(
+ w2, *x3); // 44 54 xx xx 45 55 xx xx xx xx xx xx xx xx xx xx
+ *d4d5 = _mm_unpacklo_epi32(
+ w4, w5); // 04 14 24 34 44 54 xx xx 05 15 25 35 45 55 xx xx
+}
+
+static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7, __m128i *d0,
+ __m128i *d1, __m128i *d2,
+ __m128i *d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+ // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
+ // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1, w2, w3, w4, w5;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ *d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d1 = _mm_srli_si128(*d0, 8);
+ *d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ *d3 = _mm_srli_si128(*d2, 8);
+}
+
+static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7, __m128i *d0d1,
+ __m128i *d2d3, __m128i *d4d5,
+ __m128i *d6d7) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ *d0d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d2d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ w6 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ w7 = _mm_unpackhi_epi16(
+ w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+
+ *d4d5 = _mm_unpacklo_epi32(
+ w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ *d6d7 = _mm_unpackhi_epi32(
+ w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+}
+
+// filter_mask and hev_mask
+static AOM_FORCE_INLINE void filter_hev_mask4(__m128i *p1p0, __m128i *q1q0,
+ __m128i *q1p1, __m128i *q0p0,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh,
+ __m128i *hev, __m128i *mask) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ __m128i thresh =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+
+ /* (abs(q1 - q0), abs(p1 - p0) */
+ __m128i flat = abs_diff(*q1p1, *q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ const __m128i abs_p1q1p0q0 = abs_diff(*p1p0, *q1q0);
+ __m128i abs_p0q0, abs_p1q1;
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ *hev = _mm_unpacklo_epi8(flat, zero);
+
+ *hev = _mm_cmpgt_epi16(*hev, thresh);
+ *hev = _mm_packs_epi16(*hev, *hev);
+
+ /* const int8_t mask = filter_mask2(*limit, *blimit, */
+ /* p1, p0, q0, q1); */
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+ *mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ *mask = _mm_unpacklo_epi64(*mask, flat);
+ *mask = _mm_subs_epu8(*mask, limit);
+ *mask = _mm_cmpeq_epi8(*mask, zero);
+ *mask = _mm_and_si128(*mask, _mm_srli_si128(*mask, 8));
+}
+
+static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0, __m128i *ps1ps0) {
const __m128i t3t4 =
_mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
const __m128i t80 = _mm_set1_epi8(0x80);
@@ -103,25 +329,66 @@
*ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
}
+static AOM_FORCE_INLINE void lpf_internal_4_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
+ __m128i mask, hev;
+ const __m128i zero = _mm_setzero_si128();
+
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+ /* (abs(q1 - q0), abs(p1 - p0) */
+ __m128i flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ hev = _mm_unpacklo_epi8(flat, zero);
+
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);
+
+ /* const int8_t mask = filter_mask2(*limit, *blimit, */
+ /* p1, p0, q0, q1); */
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi64(mask, flat);
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
+
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
+
void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
const uint8_t *_blimit, const uint8_t *_limit,
const uint8_t *_thresh) {
const __m128i zero = _mm_setzero_si128();
- const __m128i limit =
- _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
- _mm_loadl_epi64((const __m128i *)_limit));
- const __m128i thresh =
+ __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ __m128i thresh =
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
- __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
- __m128i mask, hev;
- q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
- _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
- q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
- _mm_cvtsi32_si128(*(int *)(s + 0 * p)));
- p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
- q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
- FILTER_HEV_MASK4;
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+
+ __m128i qs1qs0, ps1ps0;
+ __m128i p1, p0, q0, q1;
+
+ p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+ p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+ q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p));
+ q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+
+ lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
xx_storel_32(s - 1 * p, ps1ps0);
xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8));
@@ -132,65 +399,36 @@
void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
const uint8_t *_blimit, const uint8_t *_limit,
const uint8_t *_thresh) {
+ __m128i p1p0, q1q0;
+ __m128i p1, p0, q0, q1;
+
const __m128i zero = _mm_setzero_si128();
- const __m128i limit =
- _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
- _mm_loadl_epi64((const __m128i *)_limit));
- const __m128i thresh =
+ __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ __m128i thresh =
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
__m128i x0, x1, x2, x3;
- __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
- __m128i mask, hev;
+ __m128i d0, d1, d2, d3;
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
- // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
+ transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
- // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
-
- x2 = _mm_setzero_si128();
- x3 = _mm_setzero_si128();
- // Transpose 8x8
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- p1p0 = _mm_unpacklo_epi16(q1q0, x1);
- // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
- x0 = _mm_unpacklo_epi16(x2, x3);
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- p1p0 = _mm_unpackhi_epi32(p1p0, x0);
- p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high
-
- // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- q1q0 = _mm_unpackhi_epi16(q1q0, x1);
- // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
- x2 = _mm_unpackhi_epi16(x2, x3);
- // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- q1q0 = _mm_unpacklo_epi32(q1q0, x2);
-
- q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
- q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
- p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
- FILTER_HEV_MASK4;
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
// Transpose 8x4 to 4x8
- // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37
- // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07
- // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
- ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
- // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
- x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
- // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
- ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
+ p1 = _mm_srli_si128(p1p0, 8);
+ q1 = _mm_srli_si128(q1q0, 8);
- xx_storel_32(s + 0 * p - 2, ps1ps0);
- xx_storel_32(s + 1 * p - 2, _mm_srli_si128(ps1ps0, 4));
- xx_storel_32(s + 2 * p - 2, _mm_srli_si128(ps1ps0, 8));
- xx_storel_32(s + 3 * p - 2, _mm_srli_si128(ps1ps0, 12));
+ transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
+
+ xx_storel_32(s + 0 * p - 2, d0);
+ xx_storel_32(s + 1 * p - 2, d1);
+ xx_storel_32(s + 2 * p - 2, d2);
+ xx_storel_32(s + 3 * p - 2, d3);
}
static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
@@ -720,25 +958,22 @@
}
static AOM_FORCE_INLINE void lpf_internal_8_sse2(
- __m128i *p3_8, __m128i *q3_8, __m128i *p2_8, __m128i *q2_8, __m128i *p1_8,
- __m128i *q1_8, __m128i *p0_8, __m128i *q0_8, __m128i *q1q0_out,
- __m128i *p1p0_out, __m128i *p2_out, __m128i *q2_out,
- const unsigned char *_blimit, const unsigned char *_limit,
- const unsigned char *_thresh) {
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
const __m128i zero = _mm_setzero_si128();
- const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- const __m128i limit = _mm_load_si128((const __m128i *)_limit);
- const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
__m128i mask, hev, flat;
- __m128i p2, q2, p1, p0, q0, q1, p3, q3, q3p3, flat_p1p0, flat_q0q1;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
__m128i q2p2, q1p1, q0p0, p1q1, p0q0;
__m128i q1q0, p1p0, ps1ps0, qs1qs0;
__m128i work_a, op2, oq2;
- q3p3 = _mm_unpacklo_epi64(*p3_8, *q3_8);
- q2p2 = _mm_unpacklo_epi64(*p2_8, *q2_8);
- q1p1 = _mm_unpacklo_epi64(*p1_8, *q1_8);
- q0p0 = _mm_unpacklo_epi64(*p0_8, *q0_8);
+ q3p3 = _mm_unpacklo_epi64(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
p1q1 = _mm_shuffle_epi32(q1p1, _MM_SHUFFLE(1, 0, 3, 2));
p0q0 = _mm_shuffle_epi32(q0p0, _MM_SHUFFLE(1, 0, 3, 2));
@@ -764,14 +999,14 @@
abs_p0q0 = abs_diff(q0p0, p0q0);
abs_p1q1 = abs_diff(q1p1, p1q1);
flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_subs_epu8(flat, *thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
// replicate for the further "merged variables" usage
hev = _mm_unpacklo_epi64(hev, hev);
abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask = _mm_max_epu8(abs_p1p0, mask);
@@ -782,7 +1017,7 @@
mask = _mm_max_epu8(work, mask);
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_subs_epu8(mask, *limit);
mask = _mm_cmpeq_epi8(mask, zero);
// replicate for the further "merged variables" usage
mask = _mm_unpacklo_epi64(mask, mask);
@@ -805,48 +1040,49 @@
const __m128i four = _mm_set1_epi16(4);
__m128i workp_a, workp_b, workp_shft0, workp_shft1;
- p2 = _mm_unpacklo_epi8(*p2_8, zero);
- p1 = _mm_unpacklo_epi8(*p1_8, zero);
- p0 = _mm_unpacklo_epi8(*p0_8, zero);
- q0 = _mm_unpacklo_epi8(*q0_8, zero);
- q1 = _mm_unpacklo_epi8(*q1_8, zero);
- q2 = _mm_unpacklo_epi8(*q2_8, zero);
- p3 = _mm_unpacklo_epi8(*p3_8, zero);
- q3 = _mm_unpacklo_epi8(*q3_8, zero);
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
// op2
- workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
op2 = _mm_packus_epi16(workp_shft0, workp_shft0);
// op1
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
// op0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
// oq0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
// oq1
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
// oq2
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
oq2 = _mm_packus_epi16(workp_shft1, workp_shft1);
}
@@ -865,39 +1101,43 @@
p1p0 = _mm_and_si128(flat, flat_p1p0);
*p1p0_out = _mm_or_si128(ps1ps0, p1p0);
- work_a = _mm_andnot_si128(flat, *q2_8);
- q2 = _mm_and_si128(flat, oq2);
- *q2_out = _mm_or_si128(work_a, q2);
+ work_a = _mm_andnot_si128(flat, *q2);
+ q2_16 = _mm_and_si128(flat, oq2);
+ *q2_out = _mm_or_si128(work_a, q2_16);
- work_a = _mm_andnot_si128(flat, *p2_8);
- p2 = _mm_and_si128(flat, op2);
- *p2_out = _mm_or_si128(work_a, p2);
+ work_a = _mm_andnot_si128(flat, *p2);
+ p2_16 = _mm_and_si128(flat, op2);
+ *p2_out = _mm_or_si128(work_a, p2_16);
}
void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
- __m128i p2_8, p1_8, p0_8, q0_8, q1_8, q2_8, p3_8, q3_8;
- __m128i q1q0, p1p0, p2, q2;
- p3_8 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
- p2_8 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
- p1_8 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
- p0_8 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
- q0_8 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
- q1_8 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
- q2_8 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
- q3_8 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+ __m128i q1q0, p1p0, p2_out, q2_out;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
- lpf_internal_8_sse2(&p3_8, &q3_8, &p2_8, &q2_8, &p1_8, &q1_8, &p0_8, &q0_8,
- &q1q0, &p1p0, &p2, &q2, _blimit, _limit, _thresh);
+ p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
+ p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+ p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+ p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+ q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+ q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+ q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+ q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
+
+ lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+ &p2_out, &q2_out, &blimit, &limit, &thresh);
xx_storel_32(s - 1 * p, p1p0);
xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
xx_storel_32(s + 0 * p, q1q0);
xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
- xx_storel_32(s - 3 * p, p2);
- xx_storel_32(s + 2 * p, q2);
+ xx_storel_32(s - 3 * p, p2_out);
+ xx_storel_32(s + 2 * p, q2_out);
}
void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
@@ -917,240 +1157,36 @@
const uint8_t *_blimit1,
const uint8_t *_limit1,
const uint8_t *_thresh1) {
- DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- const __m128i zero = _mm_setzero_si128();
- const __m128i blimit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+ __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
_mm_load_si128((const __m128i *)_blimit1));
- const __m128i limit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
- _mm_load_si128((const __m128i *)_limit1));
- const __m128i thresh =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ __m128i thresh =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
_mm_load_si128((const __m128i *)_thresh1));
- __m128i mask, hev, flat;
- __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+ __m128i q1q0, p1p0, p2_out, q2_out;
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
- {
- const __m128i abs_p1p0 =
- _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 =
- _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
- const __m128i one = _mm_set1_epi8(1);
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 =
- _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 =
- _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
- __m128i work;
+ p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+ q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
- // filter_mask and hev_mask
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+ &p2_out, &q2_out, &blimit, &limit, &thresh);
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
-
- // flat_mask4
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- }
- {
- const __m128i four = _mm_set1_epi16(4);
- unsigned char *src = s;
- int i = 0;
-
- do {
- __m128i workp_a, workp_b, workp_shft;
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-
- workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- src += 8;
- } while (++i < 2);
- }
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
-
- const __m128i ps1 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
- const __m128i ps0 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
- const __m128i qs0 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
- const __m128i qs1 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
-
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- // (aom_filter + 3 * (qs0 - ps0)) & mask
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- // Filter1 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
-
- // Filter2 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
-
- // filt >> 1
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
-
- filt = _mm_andnot_si128(hev, filt);
-
- work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
- q0 = _mm_load_si128((__m128i *)flat_oq0);
- work_a = _mm_andnot_si128(flat, work_a);
- q0 = _mm_and_si128(flat, q0);
- q0 = _mm_or_si128(work_a, q0);
-
- work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- q1 = _mm_load_si128((__m128i *)flat_oq1);
- work_a = _mm_andnot_si128(flat, work_a);
- q1 = _mm_and_si128(flat, q1);
- q1 = _mm_or_si128(work_a, q1);
-
- work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q2 = _mm_load_si128((__m128i *)flat_oq2);
- work_a = _mm_andnot_si128(flat, work_a);
- q2 = _mm_and_si128(flat, q2);
- q2 = _mm_or_si128(work_a, q2);
-
- work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- p0 = _mm_load_si128((__m128i *)flat_op0);
- work_a = _mm_andnot_si128(flat, work_a);
- p0 = _mm_and_si128(flat, p0);
- p0 = _mm_or_si128(work_a, p0);
-
- work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- p1 = _mm_load_si128((__m128i *)flat_op1);
- work_a = _mm_andnot_si128(flat, work_a);
- p1 = _mm_and_si128(flat, p1);
- p1 = _mm_or_si128(work_a, p1);
-
- work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p2 = _mm_load_si128((__m128i *)flat_op2);
- work_a = _mm_andnot_si128(flat, work_a);
- p2 = _mm_and_si128(flat, p2);
- p2 = _mm_or_si128(work_a, p2);
-
- _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
- }
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out);
}
void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
@@ -1160,115 +1196,38 @@
const unsigned char *_blimit1,
const unsigned char *_limit1,
const unsigned char *_thresh1) {
+ __m128i p1, p0, q0, q1;
+ __m128i qs1qs0, ps1ps0;
+
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+ const __m128i zero = _mm_setzero_si128();
const __m128i blimit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
_mm_load_si128((const __m128i *)_blimit1));
const __m128i limit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
_mm_load_si128((const __m128i *)_limit1));
- const __m128i thresh =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
- _mm_load_si128((const __m128i *)_thresh1));
- const __m128i zero = _mm_setzero_si128();
- __m128i p1, p0, q0, q1;
- __m128i mask, hev, flat;
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- // filter_mask and hev_mask
- {
- const __m128i abs_p1p0 =
- _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 =
- _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 =
- _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 =
- _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
+ __m128i l = _mm_unpacklo_epi64(blimit, limit);
- // filter4
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
+ __m128i thresh0 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
- const __m128i ps1 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
- const __m128i ps0 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
- const __m128i qs0 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
- const __m128i qs1 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
+ __m128i thresh1 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- // (aom_filter + 3 * (qs0 - ps0)) & mask
- filt = _mm_and_si128(filt, mask);
+ __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
+ lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
- // Filter1 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
-
- // Filter2 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
-
- // filt >> 1
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
-
- filt = _mm_andnot_si128(hev, filt);
-
- q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
- q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- }
+ _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
}
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
@@ -1341,224 +1300,63 @@
_mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
-#define movq(p) _mm_loadl_epi64((const __m128i *)(p))
-#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)
-#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)
-#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)
-#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)
-enum { ROTATE_DWORD_RIGHT = 0x39 };
-static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
- const uint8_t *pSrc,
- const ptrdiff_t srcStride) {
- for (uint32_t idx = 0; idx < 2; idx += 1) {
- __m128i r0, r1, r2, r3;
- // load data
- r0 = movq(pSrc);
- r1 = movq(pSrc + srcStride);
- r2 = movq(pSrc + srcStride * 2);
- r3 = movq(pSrc + srcStride * 3);
- // transpose
- r0 = punpcklbw(r0, r1);
- r2 = punpcklbw(r2, r3);
- r1 = punpckhwd(r0, r2);
- r0 = punpcklwd(r0, r2);
- // store data
- xx_storel_32(pDst, r0);
- r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
- xx_storel_32(pDst + dstStride, r0);
- r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
- xx_storel_32(pDst + dstStride * 2, r0);
- r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
- xx_storel_32(pDst + dstStride * 3, r0);
- xx_storel_32(pDst + dstStride * 4, r1);
- r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
- xx_storel_32(pDst + dstStride * 5, r1);
- r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
- xx_storel_32(pDst + dstStride * 6, r1);
- r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
- xx_storel_32(pDst + dstStride * 7, r1);
- // advance the pointers
- pDst += dstStride * 8;
- pSrc += 8;
- }
-}
-
-static INLINE void transpose6x6_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
- __m128i *x3, __m128i *x4, __m128i *x5,
- __m128i *d0d1, __m128i *d2d3,
- __m128i *d4d5) {
- __m128i w0, w1, w2, w4, w5;
- // x0 00 01 02 03 04 05 xx xx
- // x1 10 11 12 13 14 15 xx xx
-
- w0 = _mm_unpacklo_epi8(*x0, *x1);
- // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx
-
- // x2 20 21 22 23 24 25
- // x3 30 31 32 33 34 35
-
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx
-
- // x4 40 41 42 43 44 45
- // x5 50 51 52 53 54 55
-
- w2 = _mm_unpacklo_epi8(*x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55
-
- w4 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- w5 = _mm_unpacklo_epi16(
- w2, w0); // 40 50 xx xx 41 51 xx xx 42 52 xx xx 43 53 xx xx
-
- *d0d1 = _mm_unpacklo_epi32(
- w4, w5); // 00 10 20 30 40 50 xx xx 01 11 21 31 41 51 xx xx
-
- *d2d3 = _mm_unpackhi_epi32(
- w4, w5); // 02 12 22 32 42 52 xx xx 03 13 23 33 43 53 xx xx
-
- w4 = _mm_unpackhi_epi16(
- w0, w1); // 04 14 24 34 05 15 25 35 xx xx xx xx xx xx xx xx
- w5 = _mm_unpackhi_epi16(
- w2, *x3); // 44 54 xx xx 45 55 xx xx xx xx xx xx xx xx xx xx
- *d4d5 = _mm_unpacklo_epi32(
- w4, w5); // 04 14 24 34 44 54 xx xx 05 15 25 35 45 55 xx xx
-}
-
-static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
- __m128i *x3, __m128i *x4, __m128i *x5,
- __m128i *x6, __m128i *x7, __m128i *d0d1,
- __m128i *d2d3, __m128i *d4d5,
- __m128i *d6d7) {
- __m128i w0, w1, w2, w3, w4, w5, w6, w7;
- // x0 00 01 02 03 04 05 06 07
- // x1 10 11 12 13 14 15 16 17
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
- // x2 20 21 22 23 24 25 26 27
- // x3 30 31 32 33 34 35 36 37
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
- // x4 40 41 42 43 44 45 46 47
- // x5 50 51 52 53 54 55 56 57
- w2 = _mm_unpacklo_epi8(
- *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
- // x6 60 61 62 63 64 65 66 67
- // x7 70 71 72 73 74 75 76 77
- w3 = _mm_unpacklo_epi8(
- *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
- w4 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- w5 = _mm_unpacklo_epi16(
- w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
- *d0d1 = _mm_unpacklo_epi32(
- w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- *d2d3 = _mm_unpackhi_epi32(
- w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
- w6 = _mm_unpackhi_epi16(
- w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- w7 = _mm_unpackhi_epi16(
- w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-
- *d4d5 = _mm_unpacklo_epi32(
- w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- *d6d7 = _mm_unpackhi_epi32(
- w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-}
-
-static INLINE void transpose8x8(unsigned char *src[], int in_p,
- unsigned char *dst[], int out_p,
- int num_8x8_to_transpose) {
- int idx8x8 = 0;
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i p0, q0, q1, p1;
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
- do {
- unsigned char *in = src[idx8x8];
- unsigned char *out = dst[idx8x8];
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i qs1qs0, ps1ps0;
- x0 =
- _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
- x1 =
- _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
- x0 = _mm_unpacklo_epi8(
- x0, x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ __m128i l = _mm_unpacklo_epi64(blimit, limit);
- x2 =
- _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
- x3 =
- _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ __m128i thresh0 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
- x1 = _mm_unpacklo_epi8(
- x2, x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ __m128i thresh1 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
- x4 =
- _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
- x5 =
- _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
- // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- x2 = _mm_unpacklo_epi8(x4, x5);
+ __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
- x6 =
- _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
- x7 =
- _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
- // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
- x3 = _mm_unpacklo_epi8(x6, x7);
+ x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- x4 = _mm_unpacklo_epi16(x0, x1);
- // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
- x5 = _mm_unpacklo_epi16(x2, x3);
- // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- x6 = _mm_unpacklo_epi32(x4, x5);
- _mm_storel_pd((double *)(out + 0 * out_p),
- _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70
- _mm_storeh_pd((double *)(out + 1 * out_p),
- _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- x7 = _mm_unpackhi_epi32(x4, x5);
- _mm_storel_pd((double *)(out + 2 * out_p),
- _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72
- _mm_storeh_pd((double *)(out + 3 * out_p),
- _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73
+ transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
+ &q1);
- // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- x4 = _mm_unpackhi_epi16(x0, x1);
- // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
- x5 = _mm_unpackhi_epi16(x2, x3);
- // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- x6 = _mm_unpacklo_epi32(x4, x5);
- _mm_storel_pd((double *)(out + 4 * out_p),
- _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74
- _mm_storeh_pd((double *)(out + 5 * out_p),
- _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- x7 = _mm_unpackhi_epi32(x4, x5);
+ lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
- _mm_storel_pd((double *)(out + 6 * out_p),
- _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76
- _mm_storeh_pd((double *)(out + 7 * out_p),
- _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77
- } while (++idx8x8 < num_8x8_to_transpose);
-}
+ p1 = _mm_srli_si128(ps1ps0, 8);
+ q1 = _mm_srli_si128(qs1qs0, 8);
-void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
- // Transpose 8x16
- transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+ transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
+ &d5, &d6, &d7);
- // Loop filtering
- aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
- blimit1, limit1, thresh1);
- transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
+ xx_storel_32((s - 2 + 0 * p), d0);
+ xx_storel_32((s - 2 + 1 * p), d1);
+ xx_storel_32((s - 2 + 2 * p), d2);
+ xx_storel_32((s - 2 + 3 * p), d3);
+ xx_storel_32((s - 2 + 4 * p), d4);
+ xx_storel_32((s - 2 + 5 * p), d5);
+ xx_storel_32((s - 2 + 6 * p), d6);
+ xx_storel_32((s - 2 + 7 * p), d7);
}
void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
@@ -1603,65 +1401,96 @@
}
void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh) {
- __m128i d0d1, d2d3, d4d5, d6d7;
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+ __m128i p2, p0, q0, q2;
+ __m128i x2, x1, x0, x3;
+ __m128i q1q0, p1p0;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+ x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
+ x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
+
+ transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
+ // Loop filtering
+ lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2,
+ &q2, &blimit, &limit, &thresh);
+
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+
+ transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1,
+ &d2, &d3);
+
+ _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
+}
+
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ __m128i thresh =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
+
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
__m128i d1, d3, d5, d7;
- __m128i p2_8, p1_8, p0_8, q0_8, q1_8, q2_8, p3_8, q3_8;
- __m128i q1q0, p1p0, p0, q0, p2, q2;
+ __m128i q1q0, p1p0;
+ __m128i p2, p1, q1, q2;
+ __m128i d0d1, d2d3, d4d5, d6d7;
- p3_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
- p2_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
- p1_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
- p0_8 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
- q0_8 = _mm_setzero_si128();
- q1_8 = _mm_setzero_si128();
- q2_8 = _mm_setzero_si128();
- q3_8 = _mm_setzero_si128();
+ x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
- transpose8x8_sse2(&p3_8, &p2_8, &p1_8, &p0_8, &q0_8, &q1_8, &q2_8, &q3_8,
- &d0d1, &d2d3, &d4d5, &d6d7);
+ transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+ &d6d7);
d1 = _mm_srli_si128(d0d1, 8);
d3 = _mm_srli_si128(d2d3, 8);
d5 = _mm_srli_si128(d4d5, 8);
d7 = _mm_srli_si128(d6d7, 8);
- // Loop filtering
lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0,
- &p1p0, &p2, &q2, blimit, limit, thresh);
+ &p1p0, &p2, &q2, &blimit, &limit, &thresh);
- p0 = _mm_srli_si128(p1p0, 8);
- q0 = _mm_srli_si128(q1q0, 8);
+ p1 = _mm_srli_si128(p1p0, 8);
+ q1 = _mm_srli_si128(q1q0, 8);
- transpose8x8_sse2(&d0d1, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0d1, &d2d3,
+ transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3,
&d4d5, &d6d7);
_mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
_mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
_mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
_mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
-}
-
-void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
- unsigned char *src[2];
- unsigned char *dst[2];
- // Transpose 8x16
- transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
- // Loop filtering
- aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
- blimit1, limit1, thresh1);
- src[0] = t_dst;
- src[1] = t_dst + 8;
- dst[0] = s - 4;
- dst[1] = s - 4 + p * 8;
- // Transpose back
- transpose8x8(src, 16, dst, p, 2);
+ _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
+ _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
+ _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
}
void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
const unsigned char *blimit,
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 9938af3..6c4334a 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -504,6 +504,10 @@
::testing::ValuesIn(kLoop8Test6));
const dual_loop_param_t kLoop8Test9[] = {
+ make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_4_dual_sse2, &aom_lpf_vertical_4_dual_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_8_dual_sse2, &aom_lpf_vertical_8_dual_c, 8),
make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c,
8),
make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8)