Add ssse3 aom_smooth_h_predictor_4xh

Change-Id: I460b2e24a8d791b7f3119795d3f0e61d2db89362
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index b5046d8..0ad9e0d 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -211,6 +211,10 @@
 specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
 specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
 
+specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
+specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
+
 # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
 # by multiply and shift.
 specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 0ff6002..0becfb9 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -1190,3 +1190,96 @@
                                         const uint8_t *left) {
   smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
 }
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  if (height == 4)
+    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  else if (height == 8)
+    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
+  else
+    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
+  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
+}
+
+// weights[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
+                                    __m128i *weights) {
+  (void)height;
+  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
+  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
+}
+
+static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
+                                     const __m128i *weight, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i gat = _mm_set1_epi32(0xc080400);
+  __m128i rep = _mm_set1_epi16(0x8000);
+
+  for (int i = 0; i < h; ++i) {
+    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
+    b = _mm_unpacklo_epi16(b, pixel[1]);
+    __m128i sum = _mm_madd_epi16(b, weight[0]);
+
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
+
+    sum = _mm_shuffle_epi8(sum, gat);
+    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+    dst += stride;
+
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 4, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 4, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
+}
+
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 8, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 16, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+  dst += stride << 3;
+
+  pixels[0] = _mm_srli_si128(pixels[0], 8);
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index e84e192..5ecf51b 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -447,12 +447,15 @@
 #if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3_1, TX_4X4, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_4x4_ssse3, aom_smooth_predictor_4x4_ssse3,
-                aom_smooth_v_predictor_4x4_ssse3, NULL)
+                aom_smooth_v_predictor_4x4_ssse3,
+                aom_smooth_h_predictor_4x4_ssse3)
 INTRA_PRED_TEST(SSSE3_2, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_4x8_ssse3, aom_smooth_predictor_4x8_ssse3,
-                aom_smooth_v_predictor_4x8_ssse3, NULL)
+                aom_smooth_v_predictor_4x8_ssse3,
+                aom_smooth_h_predictor_4x8_ssse3)
 INTRA_PRED_TEST(SSSE3_3, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, aom_smooth_v_predictor_4x16_ssse3, NULL)
+                NULL, aom_smooth_v_predictor_4x16_ssse3,
+                aom_smooth_h_predictor_4x16_ssse3)
 #endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2