[x86]: Improve aom_smooth_h_predictor_4x{4,8,16}_ssse3().

Port libgav1's horizontal smooth predictor to libaom.

~1.23x faster.

Change-Id: I7a9ae2b8cc3cfa231e715e28a07d32d247f24ec1
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index bcd4a3e..0b28fea 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -914,18 +914,28 @@
   return _mm_loadu_si128((const __m128i *)(a));
 }
 
-static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
-  _mm_storeu_si128((__m128i *)(a), v);
+static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
+  const int val = _mm_cvtsi128_si32(x);  // low 32 bits of x
+  memcpy(dst, &val, sizeof(val));  // no alignment assumption on dst
 }
 
 static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
   _mm_storel_epi64((__m128i *)(a), v);
 }
 
+static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
+  _mm_storeu_si128((__m128i *)(a), v);  // 16-byte store, any alignment
+}
+
 static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
   return _mm_unpacklo_epi8((x), _mm_setzero_si128());
 }
 
+static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
+  const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
+  return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());  // low 4 u8 -> 4 u32
+}
+
 static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
   return _mm_unpacklo_epi16((x), _mm_setzero_si128());
 }
@@ -2035,95 +2045,171 @@
 
 // -----------------------------------------------------------------------------
 // SMOOTH_H_PRED
-
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  if (height == 4)
-    pixels[0] = _mm_cvtsi32_si128(((const int *)left)[0]);
-  else if (height == 8)
-    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
-  else
-    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
-  pixels[1] = _mm_set1_epi16((int16_t)above[3]);
+static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
+    uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
+    const __m128i *scaled_top_right, const __m128i *round) {
+  const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
+  const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
+  // RightShiftWithRounding(pred, 8); only byte 0 of each 32-bit lane is kept.
+  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
 }
 
-// weights[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_h_w4(int height, __m128i *weights) {
-  (void)height;
-  const __m128i t = _mm_loadu_si128((const __m128i *)&smooth_weights[0]);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
-  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
-  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
+void aom_smooth_h_predictor_4x4_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i top_right = _mm_set1_epi32(top_row[3]);
+  const __m128i left = cvtepu8_epi32(Load4(left_column));  // rows 0..3
+  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  __m128i left_y = _mm_shuffle_epi32(left, 0);  // broadcast lane 0
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0x55);  // broadcast lane 1
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xaa);  // broadcast lane 2
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xff);  // broadcast lane 3
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
 }
 
-static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
-                                     const __m128i *weight, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  const __m128i pred_round =
-      _mm_set1_epi32((1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
+void aom_smooth_h_predictor_4x8_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i top_right = _mm_set1_epi32(top_row[3]);
+  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  __m128i left = cvtepu8_epi32(Load4(left_column));  // rows 0..3
+  __m128i left_y = _mm_shuffle_epi32(left, 0);  // broadcast lane 0
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0x55);  // broadcast lane 1
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xaa);  // broadcast lane 2
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xff);  // broadcast lane 3
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;

-  for (int i = 0; i < h; ++i) {
-    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
-    b = _mm_unpacklo_epi16(b, pixel[1]);
-    __m128i sum = _mm_madd_epi16(b, weight[0]);
-
-    sum = _mm_add_epi32(sum, pred_round);
-    sum = _mm_srai_epi32(sum, SMOOTH_WEIGHT_LOG2_SCALE);
-
-    sum = _mm_shuffle_epi8(sum, gat);
-    *(int *)dst = _mm_cvtsi128_si32(sum);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-  }
+  left = cvtepu8_epi32(Load4(left_column + 4));  // rows 4..7
+  left_y = _mm_shuffle_epi32(left, 0);  // broadcast lane 0
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0x55);  // broadcast lane 1
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xaa);  // broadcast lane 2
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xff);  // broadcast lane 3
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
 }
 
-void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 4, pixels);
+void aom_smooth_h_predictor_4x16_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i top_right = _mm_set1_epi32(top_row[3]);
+  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  __m128i left = cvtepu8_epi32(Load4(left_column));  // rows 0..3
+  __m128i left_y = _mm_shuffle_epi32(left, 0);  // broadcast lane 0
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0x55);  // broadcast lane 1
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xaa);  // broadcast lane 2
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xff);  // broadcast lane 3
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;

-  __m128i weights;
-  load_weight_h_w4(4, &weights);
+  left = cvtepu8_epi32(Load4(left_column + 4));  // rows 4..7
+  left_y = _mm_shuffle_epi32(left, 0);  // broadcast lane 0
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0x55);  // broadcast lane 1
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xaa);  // broadcast lane 2
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xff);  // broadcast lane 3
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;

-  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
-}
+  left = cvtepu8_epi32(Load4(left_column + 8));  // rows 8..11
+  left_y = _mm_shuffle_epi32(left, 0);  // broadcast lane 0
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0x55);  // broadcast lane 1
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xaa);  // broadcast lane 2
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xff);  // broadcast lane 3
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;

-void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 8, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(8, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-}
-
-void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 16, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(8, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-  dst += stride << 3;
-
-  pixels[0] = _mm_srli_si128(pixels[0], 8);
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+  left = cvtepu8_epi32(Load4(left_column + 12));  // rows 12..15
+  left_y = _mm_shuffle_epi32(left, 0);  // broadcast lane 0
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0x55);  // broadcast lane 1
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xaa);  // broadcast lane 2
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
+  dst += stride;
+  left_y = _mm_shuffle_epi32(left, 0xff);  // broadcast lane 3
+  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                               &round);
 }
 
 // For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,