Add sse2/ssse3 intra predictors for 8x32
SSE2: dc, dc_left, dc_top, dc_128, v, and h. SSSE3: smooth and paeth.
Change-Id: I53816087e3098b65c9c58a76b09e38f297568e40
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 9dd162f..9545838 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -86,6 +86,7 @@
specialize qw/aom_dc_top_predictor_8x4 sse2/;
specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
specialize qw/aom_dc_top_predictor_8x16 sse2/;
+specialize qw/aom_dc_top_predictor_8x32 sse2/;
specialize qw/aom_dc_top_predictor_16x8 sse2/;
specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
specialize qw/aom_dc_top_predictor_16x32 sse2/;
@@ -102,6 +103,7 @@
specialize qw/aom_dc_left_predictor_8x4 sse2/;
specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
specialize qw/aom_dc_left_predictor_8x16 sse2/;
+specialize qw/aom_dc_left_predictor_8x32 sse2/;
specialize qw/aom_dc_left_predictor_16x8 sse2/;
specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
specialize qw/aom_dc_left_predictor_16x32 sse2/;
@@ -118,6 +120,7 @@
specialize qw/aom_dc_128_predictor_8x4 sse2/;
specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
specialize qw/aom_dc_128_predictor_8x16 sse2/;
+specialize qw/aom_dc_128_predictor_8x32 sse2/;
specialize qw/aom_dc_128_predictor_16x8 sse2/;
specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
specialize qw/aom_dc_128_predictor_16x32 sse2/;
@@ -134,6 +137,7 @@
specialize qw/aom_v_predictor_8x4 sse2/;
specialize qw/aom_v_predictor_8x8 neon msa sse2/;
specialize qw/aom_v_predictor_8x16 sse2/;
+specialize qw/aom_v_predictor_8x32 sse2/;
specialize qw/aom_v_predictor_16x8 sse2/;
specialize qw/aom_v_predictor_16x16 neon msa sse2/;
specialize qw/aom_v_predictor_16x32 sse2/;
@@ -150,6 +154,7 @@
specialize qw/aom_h_predictor_8x4 sse2/;
specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
specialize qw/aom_h_predictor_8x16 sse2/;
+specialize qw/aom_h_predictor_8x32 sse2/;
specialize qw/aom_h_predictor_16x8 sse2/;
specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
specialize qw/aom_h_predictor_16x32 sse2/;
@@ -166,6 +171,7 @@
specialize qw/aom_paeth_predictor_8x4 ssse3/;
specialize qw/aom_paeth_predictor_8x8 ssse3/;
specialize qw/aom_paeth_predictor_8x16 ssse3/;
+specialize qw/aom_paeth_predictor_8x32 ssse3/;
specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
@@ -187,6 +193,7 @@
specialize qw/aom_smooth_predictor_8x4 ssse3/;
specialize qw/aom_smooth_predictor_8x8 ssse3/;
specialize qw/aom_smooth_predictor_8x16 ssse3/;
+specialize qw/aom_smooth_predictor_8x32 ssse3/;
specialize qw/aom_smooth_predictor_16x8 ssse3/;
specialize qw/aom_smooth_predictor_16x16 ssse3/;
specialize qw/aom_smooth_predictor_16x32 ssse3/;
@@ -246,6 +253,7 @@
specialize qw/aom_dc_predictor_8x4 sse2/;
specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
specialize qw/aom_dc_predictor_8x16 sse2/;
+specialize qw/aom_dc_predictor_8x32 sse2/;
specialize qw/aom_dc_predictor_16x8 sse2/;
specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
specialize qw/aom_dc_predictor_16x32 sse2/;
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 14b43da..9b71034 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -171,6 +171,19 @@
dc_store_8xh(&row, 16, dst, stride);
}
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_32(left);
+ __m128i sum_above = dc_sum_8(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);  // above + left total
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);  // reads 16-bit lanes 0-1; assumes lane 1 is zero — TODO confirm
+ sum += 20;  // 20 = 40 / 2, so the division below rounds to nearest
+ sum /= 40;  // 40 = 8 + 32, presumably the above + left sample count
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);  // DC value in every byte
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const __m128i sum_left = dc_sum_8(left);
@@ -330,6 +343,18 @@
dc_store_8xh(&row, 16, dst, stride);
}
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;  // only the above row is read
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i four = _mm_set1_epi16((uint16_t)4);  // rounding term, 8 / 2
+ sum_above = _mm_add_epi16(sum_above, four);
+ sum_above = _mm_srai_epi16(sum_above, 3);  // (sum + 4) >> 3, i.e. divide by 8
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // double each byte
+ const __m128i row = _mm_shufflelo_epi16(sum_above, 0);  // word 0 -> low 8 bytes
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)left;
@@ -498,6 +523,19 @@
dc_store_8xh(&row, 16, dst, stride);
}
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;  // only the left column is read
+ __m128i sum_left = dc_sum_32(left);
+ const __m128i sixteen = _mm_set1_epi16((uint16_t)16);  // rounding term, 32 / 2
+ sum_left = _mm_add_epi16(sum_left, sixteen);
+ sum_left = _mm_srai_epi16(sum_left, 5);  // (sum + 16) >> 5, i.e. divide by 32
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);  // double each byte
+ const __m128i row = _mm_shufflelo_epi16(sum_left, 0);  // word 0 -> low 8 bytes
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
@@ -645,6 +683,14 @@
dc_store_8xh(&row, 16, dst, stride);
}
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;  // neither border is read
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);  // constant 128 in every byte
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
(void)above;
@@ -747,6 +793,13 @@
dc_store_8xh(&row, 16, dst, stride);
}
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_loadl_epi64((__m128i const *)above);  // 8 above bytes
+ (void)left;  // the left column is not read
+ dc_store_8xh(&row, 32, dst, stride);  // presumably stores the row 32 times — confirm in helper
+}
+
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const __m128i row = _mm_load_si128((__m128i const *)above);
@@ -937,65 +990,80 @@
_mm_storel_epi64((__m128i *)dst, row3);
}
+static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int count) {  // count: number of 16-row groups to write
+ (void)above;
+ for (int i = 0; i < count; ++i) {
+ const __m128i left_col = _mm_load_si128((__m128i const *)left);  // aligned load; assumes left is 16-byte aligned
+ __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);  // left[0..7], each byte doubled
+ __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);  // left[8..15], each byte doubled
+
+ __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);  // left[0] in the low 8 bytes
+ __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);  // left[1]
+ __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);  // left[2]
+ __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);  // left[3]
+ _mm_storel_epi64((__m128i *)dst, row0);  // each store writes one 8-byte row
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);  // bring left[4..7] into the low half
+ row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);  // left[8]
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);  // left[9]
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);  // left[10]
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);  // left[11]
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);  // bring left[12..15] into the low half
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+ left += 16;  // next group of 16 left pixels
+ }
+}
+
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- (void)above;
- const __m128i left_col = _mm_load_si128((__m128i const *)left);
- __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
- __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
+ h_predictor_8x16xc(dst, stride, above, left, 1);  // one 16-row group
+}
- __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
- __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
- __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
- __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
- _mm_storel_epi64((__m128i *)dst, row0);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
- dst += stride;
-
- left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
- row0 = _mm_shufflelo_epi16(left_col_low, 0);
- row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
- row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
- row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
- _mm_storel_epi64((__m128i *)dst, row0);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
- dst += stride;
-
- row0 = _mm_shufflelo_epi16(left_col_high, 0);
- row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
- row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
- row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
- _mm_storel_epi64((__m128i *)dst, row0);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
- dst += stride;
-
- left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
- row0 = _mm_shufflelo_epi16(left_col_high, 0);
- row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
- row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
- row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
- _mm_storel_epi64((__m128i *)dst, row0);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ h_predictor_8x16xc(dst, stride, above, left, 2);  // two 16-row groups = 32 rows
}
static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 6602fcf..9d5da32 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -165,6 +165,28 @@
}
}
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);  // 8 above bytes
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);  // above widened to 16 bits
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);  // byte just before above[0], in every word
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int j = 0; j < 2; ++j) {  // two groups of 16 rows
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));  // aligned load; assumes left is 16-byte aligned
+ __m128i rep = _mm_set1_epi16(0x8000);  // shuffle mask per word: low byte = index 0, high byte = zero
+ for (int i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);  // byte i of l, zero-extended, in all 8 words
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));  // pack to bytes, store 8
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);  // mask now selects the next byte of l
+ }
+ }
+}
+
// Return 16 8-bit pixels in one row
static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
const __m128i *top1,
@@ -630,23 +652,44 @@
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
+// pixels[4]: copy of pixels[0] (set only when height is not 4, 8 or 16)
+// pixels[5]: copy of pixels[1] (set only when height is not 4, 8 or 16)
+// pixels[6]: left vector, entries 16..31 (set only in that same case)
+// pixels[7]: copy of pixels[3] (set only in that same case)
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
int height, __m128i *pixels) {
- __m128i d = _mm_loadl_epi64((const __m128i *)above);
- pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
- pixels[2] = _mm_load_si128((const __m128i *)left);
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
const __m128i zero = _mm_setzero_si128();
-
+ const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);  // last left entry in every word
+ __m128i d = _mm_loadl_epi64((const __m128i *)above);  // 8 above bytes
d = _mm_unpacklo_epi8(d, zero);
pixels[0] = _mm_unpacklo_epi16(d, bp);
pixels[1] = _mm_unpackhi_epi16(d, bp);
+
+ pixels[3] = _mm_set1_epi16((uint16_t)above[7]);  // above[7] in every word
+
+ if (height == 4) {
+ pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);  // 4 left bytes; NOTE(review): type-punned read
+ } else if (height == 8) {
+ pixels[2] = _mm_loadl_epi64((const __m128i *)left);  // 8 left bytes
+ } else if (height == 16) {
+ pixels[2] = _mm_load_si128((const __m128i *)left);  // 16 left bytes, aligned load
+ } else {  // any other height; the 8x32 caller passes 32
+ pixels[2] = _mm_load_si128((const __m128i *)left);  // left[0..15], aligned load
+ pixels[4] = pixels[0];  // duplicates so &pixels[4] is a complete second set
+ pixels[5] = pixels[1];
+ pixels[6] = _mm_load_si128((const __m128i *)(left + 16));  // left[16..31]
+ pixels[7] = pixels[3];
+ }
}
// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
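+// weight_h[2]: same as [0], for weights 8..15 (height 16 and 32)
+// weight_h[3]: same as [1], for weights 8..15 (height 16 and 32)
+// weight_h[4]: same as [0], for weights 16..23 (height 32 only)
+// weight_h[5]: same as [1], for weights 16..23 (height 32 only)
+// weight_h[6]: same as [0], for weights 24..31 (height 32 only)
+// weight_h[7]: same as [1], for weights 24..31 (height 32 only)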
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
@@ -655,7 +698,6 @@
const int we_offset = height < 8 ? 4 : 8;
__m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
weight_h[0] = _mm_unpacklo_epi8(we, zero);
-
const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
@@ -676,6 +718,19 @@
weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
weight_h[2] = _mm_unpackhi_epi8(we, zero);
weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ } else if (height == 32) {
+ const __m128i weight_lo =
+ _mm_loadu_si128((const __m128i *)&weight_array[32]);
+ weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ const __m128i weight_hi =
+ _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+ weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
+ weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+ weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
}
}
@@ -757,6 +812,24 @@
smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[8];  // two sets of 4 vectors, filled by load_pixel_w8
+ load_pixel_w8(above, left, 32, pixels);
+
+ __m128i wh[8], ww[2];  // wh: four (weight, scale - weight) pairs, one per 8 rows
+ load_weight_w8(sm_weight_arrays, 32, wh, ww);
+
+ smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);  // rows 0..7
+ dst += stride << 3;  // advance 8 rows
+ smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);  // rows 8..15
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);  // rows 16..23
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);  // rows 24..31
+}
+
static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left, uint32_t bw,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index e2b85c2..2a80b75 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -526,6 +526,10 @@
aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2,
aom_dc_128_predictor_8x16_sse2, aom_v_predictor_8x16_sse2,
aom_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_4, TX_8X32, aom_dc_predictor_8x32_sse2,
+ aom_dc_left_predictor_8x32_sse2, aom_dc_top_predictor_8x32_sse2,
+ aom_dc_128_predictor_8x32_sse2, aom_v_predictor_8x32_sse2,
+ aom_h_predictor_8x32_sse2, NULL, NULL, NULL, NULL)
#endif // HAVE_SSE2
#if HAVE_SSSE3
@@ -541,8 +545,9 @@
aom_paeth_predictor_8x16_ssse3, aom_smooth_predictor_8x16_ssse3,
aom_smooth_v_predictor_8x16_ssse3,
aom_smooth_h_predictor_8x16_ssse3)
-INTRA_PRED_TEST(SSSE3_4, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, aom_smooth_v_predictor_8x32_ssse3,
+INTRA_PRED_TEST(SSSE3_4, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_8x32_ssse3, aom_smooth_predictor_8x32_ssse3,
+ aom_smooth_v_predictor_8x32_ssse3,
aom_smooth_h_predictor_8x32_ssse3)
#endif // HAVE_SSSE3