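Improve aom_smooth_predictor_16x{32,16,8}_ssse3
Route the three 16-wide SSSE3 smooth predictors through the shared
smooth_predictor_wxh helper and remove the dedicated load_pixel_w16,
load_weight_w16 and smooth_pred_16x8 code paths.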
Tests showed the following improvements:
16x32: ~9.2%
16x16: ~6.9%
16x8: ~3.2%
Change-Id: Id9f23928cde28ba450ba941415f9fcdf04d620c3
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 361d9a8..f911ac0 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -710,199 +710,6 @@
smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}
-// pixels[0]: above and below_pred interleave vector, 1/4
-// pixels[1]: above and below_pred interleave vector, 2/4
-// pixels[2]: above and below_pred interleave vector, 3/4
-// pixels[3]: above and below_pred interleave vector, 3/4
-// pixels[4]: left vector
-// pixels[5]: left vector, h = 32 only
-// pixels[6]: right_pred vector
-static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- __m128i ab = _mm_load_si128((const __m128i *)above);
- pixels[6] = _mm_set1_epi16((uint16_t)above[15]);
- pixels[4] = _mm_load_si128((const __m128i *)left);
- pixels[5] = _mm_load_si128((const __m128i *)(left + 16));
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
- const __m128i zero = _mm_setzero_si128();
-
- __m128i x = _mm_unpacklo_epi8(ab, zero);
- pixels[0] = _mm_unpacklo_epi16(x, bp);
- pixels[1] = _mm_unpackhi_epi16(x, bp);
-
- x = _mm_unpackhi_epi8(ab, zero);
- pixels[2] = _mm_unpacklo_epi16(x, bp);
- pixels[3] = _mm_unpackhi_epi16(x, bp);
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// ... ...
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-// ... ...
-static INLINE void load_weight_w16(const uint8_t *weight_array, int height,
- __m128i *weight_h, __m128i *weight_w) {
- const __m128i zero = _mm_setzero_si128();
- __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]);
- __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
- __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
- __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
- if (height == 8) {
- weight_h[0] = _mm_unpacklo_epi8(w8, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]); // scale - weight_h
-
- __m128i x = _mm_unpacklo_epi8(w16, zero);
- __m128i y = _mm_sub_epi16(d, x);
- weight_w[0] = _mm_unpacklo_epi16(x, y);
- weight_w[1] = _mm_unpackhi_epi16(x, y);
- x = _mm_unpackhi_epi8(w16, zero);
- y = _mm_sub_epi16(d, x);
- weight_w[2] = _mm_unpacklo_epi16(x, y);
- weight_w[3] = _mm_unpackhi_epi16(x, y);
- }
-
- if (height == 16) {
- weight_h[0] = _mm_unpacklo_epi8(w16, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(w16, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-
- weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
- weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
- weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
- weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
- }
-
- if (height == 32) {
- weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-
- __m128i x = _mm_unpacklo_epi8(w16, zero);
- __m128i y = _mm_sub_epi16(d, x);
- weight_w[0] = _mm_unpacklo_epi16(x, y);
- weight_w[1] = _mm_unpackhi_epi16(x, y);
- x = _mm_unpackhi_epi8(w16, zero);
- y = _mm_sub_epi16(d, x);
- weight_w[2] = _mm_unpacklo_epi16(x, y);
- weight_w[3] = _mm_unpackhi_epi16(x, y);
-
- weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
- weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
- weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
- weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
- }
-}
-
-static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh,
- const __m128i *ww, uint8_t *dst,
- ptrdiff_t stride, int quarter) {
- __m128i d = _mm_set1_epi16(0x100);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i inc = _mm_set1_epi16(0x202);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
- __m128i rep =
- (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
- const __m128i left = (quarter < 2) ? pixels[4] : pixels[5];
-
- int i;
- for (i = 0; i < 8; ++i) {
- const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
- const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
- const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
- __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
- __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
- __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc);
- __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc);
-
- __m128i b = _mm_shuffle_epi8(left, rep);
- b = _mm_unpacklo_epi16(b, pixels[6]);
- __m128i sum0 = _mm_madd_epi16(b, ww[0]);
- __m128i sum1 = _mm_madd_epi16(b, ww[1]);
- __m128i sum2 = _mm_madd_epi16(b, ww[2]);
- __m128i sum3 = _mm_madd_epi16(b, ww[3]);
-
- s0 = _mm_add_epi32(s0, sum0);
- s0 = _mm_add_epi32(s0, round);
- s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
-
- s1 = _mm_add_epi32(s1, sum1);
- s1 = _mm_add_epi32(s1, round);
- s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
-
- s2 = _mm_add_epi32(s2, sum2);
- s2 = _mm_add_epi32(s2, round);
- s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale);
-
- s3 = _mm_add_epi32(s3, sum3);
- s3 = _mm_add_epi32(s3, round);
- s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale);
-
- sum0 = _mm_packus_epi16(s0, s1);
- sum0 = _mm_shuffle_epi8(sum0, gat);
- sum1 = _mm_packus_epi16(s2, s3);
- sum1 = _mm_shuffle_epi8(sum1, gat);
-
- _mm_storel_epi64((__m128i *)dst, sum0);
- _mm_storel_epi64((__m128i *)(dst + 8), sum1);
-
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- d = _mm_add_epi16(d, inc);
- }
-}
-
-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[7];
- load_pixel_w16(above, left, 8, pixels);
-
- __m128i wh[2], ww[4];
- load_weight_w16(sm_weight_arrays, 8, wh, ww);
-
- smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
-}
-
-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[7];
- load_pixel_w16(above, left, 16, pixels);
-
- __m128i wh[4], ww[4];
- load_weight_w16(sm_weight_arrays, 16, wh, ww);
-
- smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
- dst += stride << 3;
- smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
-}
-
-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[7];
- load_pixel_w16(above, left, 32, pixels);
-
- __m128i wh[8], ww[4];
- load_weight_w16(sm_weight_arrays, 32, wh, ww);
-
- smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
- dst += stride << 3;
- smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
- dst += stride << 3;
- smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2);
- dst += stride << 3;
- smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3);
-}
-
static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left, uint32_t bw,
@@ -963,6 +770,23 @@
dst += stride;
}
}
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,