Improved aom_smooth_predictor_16x 32,16,8

Tests showed the following improvements:
16x32: ~9.2%
16x16: ~6.9%
16x8: ~3.2%

Change-Id: Id9f23928cde28ba450ba941415f9fcdf04d620c3
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 361d9a8..f911ac0 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -710,199 +710,6 @@
   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
 }
 
-// pixels[0]: above and below_pred interleave vector, 1/4
-// pixels[1]: above and below_pred interleave vector, 2/4
-// pixels[2]: above and below_pred interleave vector, 3/4
-// pixels[3]: above and below_pred interleave vector, 3/4
-// pixels[4]: left vector
-// pixels[5]: left vector, h = 32 only
-// pixels[6]: right_pred vector
-static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left,
-                                  int height, __m128i *pixels) {
-  __m128i ab = _mm_load_si128((const __m128i *)above);
-  pixels[6] = _mm_set1_epi16((uint16_t)above[15]);
-  pixels[4] = _mm_load_si128((const __m128i *)left);
-  pixels[5] = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i x = _mm_unpacklo_epi8(ab, zero);
-  pixels[0] = _mm_unpacklo_epi16(x, bp);
-  pixels[1] = _mm_unpackhi_epi16(x, bp);
-
-  x = _mm_unpackhi_epi8(ab, zero);
-  pixels[2] = _mm_unpacklo_epi16(x, bp);
-  pixels[3] = _mm_unpackhi_epi16(x, bp);
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// ... ...
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-// ... ...
-static INLINE void load_weight_w16(const uint8_t *weight_array, int height,
-                                   __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]);
-  __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-  __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
-  __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
-  if (height == 8) {
-    weight_h[0] = _mm_unpacklo_epi8(w8, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);  // scale - weight_h
-
-    __m128i x = _mm_unpacklo_epi8(w16, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
-    x = _mm_unpackhi_epi8(w16, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
-  }
-
-  if (height == 16) {
-    weight_h[0] = _mm_unpacklo_epi8(w16, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w16, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-    weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
-    weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
-  }
-
-  if (height == 32) {
-    weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-
-    __m128i x = _mm_unpacklo_epi8(w16, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
-    x = _mm_unpackhi_epi8(w16, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
-
-    weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
-  }
-}
-
-static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh,
-                                    const __m128i *ww, uint8_t *dst,
-                                    ptrdiff_t stride, int quarter) {
-  __m128i d = _mm_set1_epi16(0x100);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  __m128i rep =
-      (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
-  const __m128i left = (quarter < 2) ? pixels[4] : pixels[5];
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
-    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-    __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc);
-    __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc);
-
-    __m128i b = _mm_shuffle_epi8(left, rep);
-    b = _mm_unpacklo_epi16(b, pixels[6]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-    __m128i sum2 = _mm_madd_epi16(b, ww[2]);
-    __m128i sum3 = _mm_madd_epi16(b, ww[3]);
-
-    s0 = _mm_add_epi32(s0, sum0);
-    s0 = _mm_add_epi32(s0, round);
-    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, sum1);
-    s1 = _mm_add_epi32(s1, round);
-    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
-
-    s2 = _mm_add_epi32(s2, sum2);
-    s2 = _mm_add_epi32(s2, round);
-    s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale);
-
-    s3 = _mm_add_epi32(s3, sum3);
-    s3 = _mm_add_epi32(s3, round);
-    s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(s0, s1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    sum1 = _mm_packus_epi16(s2, s3);
-    sum1 = _mm_shuffle_epi8(sum1, gat);
-
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    _mm_storel_epi64((__m128i *)(dst + 8), sum1);
-
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 8, pixels);
-
-  __m128i wh[2], ww[4];
-  load_weight_w16(sm_weight_arrays, 8, wh, ww);
-
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
-}
-
-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 16, pixels);
-
-  __m128i wh[4], ww[4];
-  load_weight_w16(sm_weight_arrays, 16, wh, ww);
-
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
-}
-
-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 32, pixels);
-
-  __m128i wh[8], ww[4];
-  load_weight_w16(sm_weight_arrays, 32, wh, ww);
-
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
-  dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2);
-  dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3);
-}
-
 static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left, uint32_t bw,
@@ -963,6 +770,23 @@
     dst += stride;
   }
 }
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
 
 void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,