Remove unused functions post LOWPRECISION_BLEND
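
These helpers lost their last callers with the LOWPRECISION_BLEND
change: make_1012_mask, load_and_128i, convolve_32, convolve_32_8 and
mm_shuffle0_si128 in av1_convolve_scale_sse4.c, plus pixel_clamp_avx2,
pixel_clamp_sse2 and the cal_rounding_* / cal_highbd_rounding_*
routines in convolve_avx2.c.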

Change-Id: Id6bcd4b09bd00e6f07f484f2c6f82913a43fda8d
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index 33eb5ce..6747cae 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -19,22 +19,6 @@
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
 
-// Make a mask for coefficients of 10/12 tap filters. The coefficients are
-// packed "89ab89ab". If it's a 12-tap filter, we want all 1's; if it's a
-// 10-tap filter, we want "11001100" to just match the 8,9 terms.
-static __m128i make_1012_mask(int ntaps) {
-  uint32_t low = 0xffffffff;
-  uint32_t high = (ntaps == 12) ? low : 0;
-  return _mm_set_epi32(high, low, high, low);
-}
-
-// Load an SSE register from p and bitwise AND with a.
-static __m128i load_and_128i(const void *p, __m128i a) {
-  const __m128d ad = _mm_castsi128_pd(a);
-  const __m128d bd = _mm_load1_pd((const double *)p);
-  return _mm_castpd_si128(_mm_and_pd(ad, bd));
-}
-
 // A specialised version of hfilter, the horizontal filter for
 // av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
 static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
@@ -114,28 +98,6 @@
   }
 }
 
-// Do a 12-tap convolution with the given coefficients, loading data from src.
-static __m128i convolve_32(const int32_t *src, __m128i coeff03, __m128i coeff47,
-                           __m128i coeff8d) {
-  const __m128i data03 = _mm_loadu_si128((__m128i *)src);
-  const __m128i data47 = _mm_loadu_si128((__m128i *)(src + 4));
-  const __m128i data8d = _mm_loadu_si128((__m128i *)(src + 8));
-  const __m128i conv03 = _mm_mullo_epi32(data03, coeff03);
-  const __m128i conv47 = _mm_mullo_epi32(data47, coeff47);
-  const __m128i conv8d = _mm_mullo_epi32(data8d, coeff8d);
-  return _mm_add_epi32(_mm_add_epi32(conv03, conv47), conv8d);
-}
-
-// Do an 8-tap convolution with the given coefficients, loading data from src.
-static __m128i convolve_32_8(const int32_t *src, __m128i coeff03,
-                             __m128i coeff47) {
-  const __m128i data03 = _mm_loadu_si128((__m128i *)src);
-  const __m128i data47 = _mm_loadu_si128((__m128i *)(src + 4));
-  const __m128i conv03 = _mm_mullo_epi32(data03, coeff03);
-  const __m128i conv47 = _mm_mullo_epi32(data47, coeff47);
-  return _mm_add_epi32(conv03, conv47);
-}
-
 static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
   __m128i data = _mm_loadu_si128((__m128i *)src);
   return _mm_madd_epi16(data, coeff);
@@ -295,15 +257,6 @@
            filter_params_y, conv_params, 8);
 }
 
-// A wrapper to generate the SHUFPD instruction with __m128i types (just
-// writing _mm_shuffle_pd at the callsites gets a bit ugly because of the
-// casts)
-static __m128i mm_shuffle0_si128(__m128i a, __m128i b) {
-  __m128d ad = _mm_castsi128_pd(a);
-  __m128d bd = _mm_castsi128_pd(b);
-  return _mm_castpd_si128(_mm_shuffle_pd(ad, bd, 0));
-}
-
 // A specialised version of hfilter, the horizontal filter for
 // av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
 // filters.
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index 57d0bef..6fdfb09 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -17,210 +17,6 @@
 #include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_dsp/x86/synonyms.h"
 
-static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
-
-// 16 epi16 pixels
-static INLINE void pixel_clamp_avx2(__m256i *u, int bd) {
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
-  __m256i clamped, mask;
-
-  mask = _mm256_cmpgt_epi16(*u, max);
-  clamped = _mm256_andnot_si256(mask, *u);
-  mask = _mm256_and_si256(mask, max);
-  clamped = _mm256_or_si256(mask, clamped);
-
-  const __m256i zero = _mm256_setzero_si256();
-  mask = _mm256_cmpgt_epi16(clamped, zero);
-  *u = _mm256_and_si256(clamped, mask);
-}
-
-// 8 epi16 pixels
-static INLINE void pixel_clamp_sse2(__m128i *u, int bd) {
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  __m128i clamped, mask;
-
-  mask = _mm_cmpgt_epi16(*u, max);
-  clamped = _mm_andnot_si128(mask, *u);
-  mask = _mm_and_si128(mask, max);
-  clamped = _mm_or_si128(mask, clamped);
-
-  const __m128i zero = _mm_setzero_si128();
-  mask = _mm_cmpgt_epi16(clamped, zero);
-  *u = _mm_and_si128(clamped, mask);
-}
-
-// Works on a multiple of 32 pixels
-static INLINE void cal_rounding_32xn_avx2(const int32_t *src, uint8_t *dst,
-                                          const __m256i *rnd, int shift,
-                                          int num) {
-  do {
-    __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
-    __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
-    __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2);
-    __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3);
-
-    x0 = _mm256_add_epi32(x0, *rnd);
-    x1 = _mm256_add_epi32(x1, *rnd);
-    x2 = _mm256_add_epi32(x2, *rnd);
-    x3 = _mm256_add_epi32(x3, *rnd);
-
-    x0 = _mm256_srai_epi32(x0, shift);
-    x1 = _mm256_srai_epi32(x1, shift);
-    x2 = _mm256_srai_epi32(x2, shift);
-    x3 = _mm256_srai_epi32(x3, shift);
-
-    x0 = _mm256_packs_epi32(x0, x1);
-    x2 = _mm256_packs_epi32(x2, x3);
-
-    pixel_clamp_avx2(&x0, 8);
-    pixel_clamp_avx2(&x2, 8);
-
-    x0 = _mm256_packus_epi16(x0, x2);
-    x1 = _mm256_loadu_si256((const __m256i *)sindex);
-    x2 = _mm256_permutevar8x32_epi32(x0, x1);
-
-    _mm256_storeu_si256((__m256i *)dst, x2);
-    src += 32;
-    dst += 32;
-    num--;
-  } while (num > 0);
-}
-
-static INLINE void cal_rounding_16_avx2(const int32_t *src, uint8_t *dst,
-                                        const __m256i *rnd, int shift) {
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
-
-  x0 = _mm256_add_epi32(x0, *rnd);
-  x1 = _mm256_add_epi32(x1, *rnd);
-
-  x0 = _mm256_srai_epi32(x0, shift);
-  x1 = _mm256_srai_epi32(x1, shift);
-
-  x0 = _mm256_packs_epi32(x0, x1);
-  pixel_clamp_avx2(&x0, 8);
-
-  const __m256i x2 = _mm256_packus_epi16(x0, x0);
-  x1 = _mm256_loadu_si256((const __m256i *)sindex);
-  x0 = _mm256_permutevar8x32_epi32(x2, x1);
-
-  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x0));
-}
-
-static INLINE void cal_rounding_8_avx2(const int32_t *src, uint8_t *dst,
-                                       const __m256i *rnd, int shift) {
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
-  x0 = _mm256_add_epi32(x0, *rnd);
-  x0 = _mm256_srai_epi32(x0, shift);
-
-  x0 = _mm256_packs_epi32(x0, x0);
-  pixel_clamp_avx2(&x0, 8);
-
-  x0 = _mm256_packus_epi16(x0, x0);
-  const __m256i x1 = _mm256_loadu_si256((const __m256i *)sindex);
-  x0 = _mm256_permutevar8x32_epi32(x0, x1);
-
-  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(x0));
-}
-
-static INLINE void cal_rounding_4_sse2(const int32_t *src, uint8_t *dst,
-                                       const __m128i *rnd, int shift) {
-  __m128i x = _mm_loadu_si128((const __m128i *)src);
-  x = _mm_add_epi32(x, *rnd);
-  x = _mm_srai_epi32(x, shift);
-
-  x = _mm_packs_epi32(x, x);
-  pixel_clamp_sse2(&x, 8);
-
-  x = _mm_packus_epi16(x, x);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(x);
-}
-
-static INLINE void cal_highbd_rounding_32xn_avx2(const int32_t *src,
-                                                 uint16_t *dst,
-                                                 const __m256i *rnd, int shift,
-                                                 int num, int bd) {
-  do {
-    __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
-    __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
-    __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2);
-    __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3);
-
-    x0 = _mm256_add_epi32(x0, *rnd);
-    x1 = _mm256_add_epi32(x1, *rnd);
-    x2 = _mm256_add_epi32(x2, *rnd);
-    x3 = _mm256_add_epi32(x3, *rnd);
-
-    x0 = _mm256_srai_epi32(x0, shift);
-    x1 = _mm256_srai_epi32(x1, shift);
-    x2 = _mm256_srai_epi32(x2, shift);
-    x3 = _mm256_srai_epi32(x3, shift);
-
-    x0 = _mm256_packs_epi32(x0, x1);
-    x2 = _mm256_packs_epi32(x2, x3);
-
-    pixel_clamp_avx2(&x0, bd);
-    pixel_clamp_avx2(&x2, bd);
-
-    x0 = _mm256_permute4x64_epi64(x0, 0xD8);
-    x2 = _mm256_permute4x64_epi64(x2, 0xD8);
-
-    _mm256_storeu_si256((__m256i *)dst, x0);
-    _mm256_storeu_si256((__m256i *)(dst + 16), x2);
-    src += 32;
-    dst += 32;
-    num--;
-  } while (num > 0);
-}
-
-static INLINE void cal_highbd_rounding_16_avx2(const int32_t *src,
-                                               uint16_t *dst,
-                                               const __m256i *rnd, int shift,
-                                               int bd) {
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
-
-  x0 = _mm256_add_epi32(x0, *rnd);
-  x1 = _mm256_add_epi32(x1, *rnd);
-
-  x0 = _mm256_srai_epi32(x0, shift);
-  x1 = _mm256_srai_epi32(x1, shift);
-
-  x0 = _mm256_packs_epi32(x0, x1);
-  pixel_clamp_avx2(&x0, bd);
-
-  x0 = _mm256_permute4x64_epi64(x0, 0xD8);
-  _mm256_storeu_si256((__m256i *)dst, x0);
-}
-
-static INLINE void cal_highbd_rounding_8_avx2(const int32_t *src, uint16_t *dst,
-                                              const __m256i *rnd, int shift,
-                                              int bd) {
-  __m256i x = _mm256_loadu_si256((const __m256i *)src);
-  x = _mm256_add_epi32(x, *rnd);
-  x = _mm256_srai_epi32(x, shift);
-
-  x = _mm256_packs_epi32(x, x);
-  pixel_clamp_avx2(&x, bd);
-
-  x = _mm256_permute4x64_epi64(x, 0xD8);
-  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x));
-}
-
-static INLINE void cal_highbd_rounding_4_sse2(const int32_t *src, uint16_t *dst,
-                                              const __m128i *rnd, int shift,
-                                              int bd) {
-  __m128i x = _mm_loadu_si128((const __m128i *)src);
-  x = _mm_add_epi32(x, *rnd);
-  x = _mm_srai_epi32(x, shift);
-
-  x = _mm_packs_epi32(x, x);
-  pixel_clamp_sse2(&x, bd);
-  _mm_storel_epi64((__m128i *)dst, x);
-}
-
 void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             InterpFilterParams *filter_params_x,