Improved ssse3 version of aom_smooth_predictor_32x 16,32
Tests show ~12% improvement.
Change-Id: I368de03c1e74f8507e71503f23cd3e83cc429e64
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 9f23579..47abcb8 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -880,166 +880,6 @@
smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3);
}
-static INLINE void load_pixel_w32(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- __m128i ab0 = _mm_load_si128((const __m128i *)above);
- __m128i ab1 = _mm_load_si128((const __m128i *)(above + 16));
-
- pixels[10] = _mm_set1_epi16((uint16_t)above[31]);
- pixels[8] = _mm_load_si128((const __m128i *)left);
- pixels[9] = _mm_load_si128((const __m128i *)(left + 16));
-
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
- const __m128i zero = _mm_setzero_si128();
-
- __m128i x = _mm_unpacklo_epi8(ab0, zero);
- pixels[0] = _mm_unpacklo_epi16(x, bp);
- pixels[1] = _mm_unpackhi_epi16(x, bp);
-
- x = _mm_unpackhi_epi8(ab0, zero);
- pixels[2] = _mm_unpacklo_epi16(x, bp);
- pixels[3] = _mm_unpackhi_epi16(x, bp);
-
- x = _mm_unpacklo_epi8(ab1, zero);
- pixels[4] = _mm_unpacklo_epi16(x, bp);
- pixels[5] = _mm_unpackhi_epi16(x, bp);
-
- x = _mm_unpackhi_epi8(ab1, zero);
- pixels[6] = _mm_unpacklo_epi16(x, bp);
- pixels[7] = _mm_unpackhi_epi16(x, bp);
-}
-
-static INLINE void load_weight_w32(const uint8_t *weight_array, int height,
- __m128i *weight_h, __m128i *weight_w) {
- const __m128i zero = _mm_setzero_si128();
- __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
- __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
- __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
- if (height == 16) {
- weight_h[0] = _mm_unpacklo_epi8(w16, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(w16, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-
- __m128i x = _mm_unpacklo_epi8(w32_0, zero);
- __m128i y = _mm_sub_epi16(d, x);
- weight_w[0] = _mm_unpacklo_epi16(x, y);
- weight_w[1] = _mm_unpackhi_epi16(x, y);
-
- x = _mm_unpackhi_epi8(w32_0, zero);
- y = _mm_sub_epi16(d, x);
- weight_w[2] = _mm_unpacklo_epi16(x, y);
- weight_w[3] = _mm_unpackhi_epi16(x, y);
-
- x = _mm_unpacklo_epi8(w32_1, zero);
- y = _mm_sub_epi16(d, x);
- weight_w[4] = _mm_unpacklo_epi16(x, y);
- weight_w[5] = _mm_unpackhi_epi16(x, y);
-
- x = _mm_unpackhi_epi8(w32_1, zero);
- y = _mm_sub_epi16(d, x);
- weight_w[6] = _mm_unpacklo_epi16(x, y);
- weight_w[7] = _mm_unpackhi_epi16(x, y);
- }
-
- if (height == 32) {
- weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-
- weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
- weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
- weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
- weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
-
- weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
- weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
- weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
- weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
-
- weight_w[4] = _mm_unpacklo_epi16(weight_h[4], weight_h[5]);
- weight_w[5] = _mm_unpackhi_epi16(weight_h[4], weight_h[5]);
- weight_w[6] = _mm_unpacklo_epi16(weight_h[6], weight_h[7]);
- weight_w[7] = _mm_unpackhi_epi16(weight_h[6], weight_h[7]);
- }
-}
-
-static INLINE void smooth_pred_32x8(const __m128i *pixels, const __m128i *wh,
- const __m128i *ww, uint8_t *dst,
- ptrdiff_t stride, int quarter) {
- __m128i d = _mm_set1_epi16(0x100);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i inc = _mm_set1_epi16(0x202);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
- __m128i rep =
- (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
- const __m128i left = (quarter < 2) ? pixels[8] : pixels[9];
-
- int i;
- for (i = 0; i < 8; ++i) {
- const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
- const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
- const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-
- int j;
- __m128i s[8];
- __m128i b = _mm_shuffle_epi8(left, rep);
- b = _mm_unpacklo_epi16(b, pixels[10]);
-
- for (j = 0; j < 8; ++j) {
- s[j] = _mm_madd_epi16(pixels[j], wh_sc);
- s[j] = _mm_add_epi32(s[j], _mm_madd_epi16(b, ww[j]));
- s[j] = _mm_add_epi32(s[j], round);
- s[j] = _mm_srai_epi32(s[j], 1 + sm_weight_log2_scale);
- }
-
- for (j = 0; j < 8; j += 2) {
- __m128i sum = _mm_packus_epi16(s[j], s[j + 1]);
- sum = _mm_shuffle_epi8(sum, gat);
- _mm_storel_epi64((__m128i *)(dst + (j << 2)), sum);
- }
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- d = _mm_add_epi16(d, inc);
- }
-}
-
-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[11];
- load_pixel_w32(above, left, 16, pixels);
-
- __m128i wh[4], ww[8];
- load_weight_w32(sm_weight_arrays, 16, wh, ww);
-
- smooth_pred_32x8(pixels, wh, ww, dst, stride, 0);
- dst += stride << 3;
- smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
-}
-
-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[11];
- load_pixel_w32(above, left, 32, pixels);
-
- __m128i wh[8], ww[8];
- load_weight_w32(sm_weight_arrays, 32, wh, ww);
-
- smooth_pred_32x8(pixels, &wh[0], ww, dst, stride, 0);
- dst += stride << 3;
- smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
- dst += stride << 3;
- smooth_pred_32x8(pixels, &wh[4], ww, dst, stride, 2);
- dst += stride << 3;
- smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3);
-}
-
static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left, uint32_t bw,
@@ -1101,6 +941,18 @@
}
}
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {