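Add ssse3 aom_smooth_h_predictor_16,32,64xh

Also remove aom_smooth_v_v_predictor_32x64_ssse3, whose body duplicates
the smooth_v_predictor_wxh(..., 32, 64) wrapper directly above it.

Change-Id: If60e0f570ddc739cca7ff129af5fb35106ea23c6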
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index a059810..75679f5 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -218,6 +218,18 @@
specialize qw/aom_smooth_h_predictor_8x8 ssse3/;
specialize qw/aom_smooth_h_predictor_8x16 ssse3/;
specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x4 ssse3/;  # 16/32/64-wide smooth_h: SSSE3 versions are added to aom_dsp/x86/intrapred_ssse3.c by this change
+specialize qw/aom_smooth_h_predictor_16x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x16 ssse3/;
# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
# by multiply and shift.
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 64f42eb..069a3bf 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -1161,12 +1161,6 @@
smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}
-void aom_smooth_v_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
@@ -1403,3 +1397,116 @@
dst += stride << 3;
smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
}
+
+static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,  // Fills a bw x bh block at dst; each pixel blends left[y] with above[bw - 1] using a per-column weight.
+ const uint8_t *above,  // only above[bw - 1] is read
+ const uint8_t *left, uint32_t bw,  // left[0 .. bh - 1] is read; bw = block width in pixels
+ uint32_t bh) {  // bh = number of rows written
+ const uint8_t *const sm_weights_w = sm_weight_arrays + bw;  // column weights are read from sm_weight_arrays[bw .. 2 * bw - 1]
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value =
+ _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));  // weight scale in every 16-bit lane
+ const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);  // above[bw - 1] in the low 32-bit lane, zeros elsewhere
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);  // byte-shuffle mask: source bytes 0,2,...,14 -> low 8 bytes
+ const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));  // rounding term: half the weight scale
+
+ for (uint32_t y = 0; y < bh; ++y) {
+ const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+ const __m128i tr_ly =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);  // (above[bw - 1], left[y]) 16-bit pair broadcast to all four 32-bit lanes
+
+ for (uint32_t x = 0; x < bw; x += 8) {  // 8 pixels per iteration; NOTE(review): assumes bw is a multiple of 8 (callers in this file pass 16, 32, 64)
+ const __m128i weights_x =
+ _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));  // 8 byte-sized weights for columns x .. x + 7
+ const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);  // zero-extend weights to 16 bits
+ const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);  // scale - w
+ const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);  // (scale - w, w) pairs for columns x .. x + 3
+ const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);  // (scale - w, w) pairs for columns x + 4 .. x + 7
+ __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);  // (scale - w) * above[bw - 1] + w * left[y], one 32-bit sum per pixel
+ __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
+
+ pred_lo = _mm_add_epi32(pred_lo, pred_round);
+ pred_hi = _mm_add_epi32(pred_hi, pred_round);
+
+ pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);  // with the rounding add above: rounded divide by the weight scale
+ pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+ __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);  // saturating 16->8 pack of the 16-bit halves: each pixel lands in an even byte
+ pred = _mm_shuffle_epi8(pred, gat);  // gather the 8 even bytes into the low 64 bits
+ _mm_storel_epi64((__m128i *)(dst + x), pred);  // write 8 output pixels
+ }
+ dst += stride;  // advance to the next output row
+ }
+}
+
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);  // forwards to the shared helper with bw = 16, bh = 4
+}
+
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);  // forwards to the shared helper with bw = 16, bh = 8
+}
+
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);  // forwards to the shared helper with bw = 16, bh = 16
+}
+
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);  // forwards to the shared helper with bw = 16, bh = 32
+}
+
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);  // forwards to the shared helper with bw = 16, bh = 64
+}
+
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);  // forwards to the shared helper with bw = 32, bh = 8
+}
+
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);  // forwards to the shared helper with bw = 32, bh = 16
+}
+
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);  // forwards to the shared helper with bw = 32, bh = 32
+}
+
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);  // forwards to the shared helper with bw = 32, bh = 64
+}
+
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);  // forwards to the shared helper with bw = 64, bh = 64
+}
+
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);  // forwards to the shared helper with bw = 64, bh = 32
+}
+
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);  // forwards to the shared helper with bw = 64, bh = 16
+}
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index f017a86..329a7d0 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -624,20 +624,25 @@
INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_16x16_ssse3,
aom_smooth_predictor_16x16_ssse3,
- aom_smooth_v_predictor_16x16_ssse3, NULL)
+ aom_smooth_v_predictor_16x16_ssse3,
+ aom_smooth_h_predictor_16x16_ssse3)
INTRA_PRED_TEST(SSSE3_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_16x8_ssse3, aom_smooth_predictor_16x8_ssse3,
- aom_smooth_v_predictor_16x8_ssse3, NULL)
+ aom_smooth_v_predictor_16x8_ssse3,
+ aom_smooth_h_predictor_16x8_ssse3)
INTRA_PRED_TEST(SSSE3_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_16x32_ssse3,
aom_smooth_predictor_16x32_ssse3,
- aom_smooth_v_predictor_16x32_ssse3, NULL)
+ aom_smooth_v_predictor_16x32_ssse3,
+ aom_smooth_h_predictor_16x32_ssse3)
INTRA_PRED_TEST(SSSE3_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_16x64_ssse3,
aom_smooth_predictor_16x64_ssse3,
- aom_smooth_v_predictor_16x64_ssse3, NULL)
+ aom_smooth_v_predictor_16x64_ssse3,
+ aom_smooth_h_predictor_16x64_ssse3)
INTRA_PRED_TEST(SSSE3_5, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, aom_smooth_v_predictor_16x4_ssse3, NULL)
+ NULL, aom_smooth_v_predictor_16x4_ssse3,
+ aom_smooth_h_predictor_16x4_ssse3)
#endif // HAVE_SSSE3
#if HAVE_AVX2
@@ -724,17 +729,21 @@
INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_32x32_ssse3,
aom_smooth_predictor_32x32_ssse3,
- aom_smooth_v_predictor_32x32_ssse3, NULL)
+ aom_smooth_v_predictor_32x32_ssse3,
+ aom_smooth_h_predictor_32x32_ssse3)
INTRA_PRED_TEST(SSSE3_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_32x16_ssse3,
aom_smooth_predictor_32x16_ssse3,
- aom_smooth_v_predictor_32x16_ssse3, NULL)
+ aom_smooth_v_predictor_32x16_ssse3,
+ aom_smooth_h_predictor_32x16_ssse3)
INTRA_PRED_TEST(SSSE3_3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_32x64_ssse3,
aom_smooth_predictor_32x64_ssse3,
- aom_smooth_v_predictor_32x64_ssse3, NULL)
+ aom_smooth_v_predictor_32x64_ssse3,
+ aom_smooth_h_predictor_32x64_ssse3)
INTRA_PRED_TEST(SSSE3_4, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, aom_smooth_v_predictor_32x8_ssse3, NULL)
+ NULL, aom_smooth_v_predictor_32x8_ssse3,
+ aom_smooth_h_predictor_32x8_ssse3)
#endif // HAVE_SSSE3
#if HAVE_AVX2
@@ -817,15 +826,18 @@
INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_64x64_ssse3,
aom_smooth_predictor_64x64_ssse3,
- aom_smooth_v_predictor_64x64_ssse3, NULL)
+ aom_smooth_v_predictor_64x64_ssse3,
+ aom_smooth_h_predictor_64x64_ssse3)
INTRA_PRED_TEST(SSSE3_5, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_64x32_ssse3,
aom_smooth_predictor_64x32_ssse3,
- aom_smooth_v_predictor_64x32_ssse3, NULL)
+ aom_smooth_v_predictor_64x32_ssse3,
+ aom_smooth_h_predictor_64x32_ssse3)
INTRA_PRED_TEST(SSSE3_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_64x16_ssse3,
aom_smooth_predictor_64x16_ssse3,
- aom_smooth_v_predictor_64x16_ssse3, NULL)
+ aom_smooth_v_predictor_64x16_ssse3,
+ aom_smooth_h_predictor_64x16_ssse3)
#endif
#if HAVE_AVX2