Remove some unused functions.

Drop unused static helpers and an unused test constant:
- aom_dsp/aom_convolve.c: horz_scalar_product, vert_scalar_product
- aom_dsp/x86/intrapred_avx2.c: dc_sum_64, dc_sum_32, row_store_32xh,
  row_store_32x2xh, row_store_64xh
- av1/common/convolve.c: horz_scalar_product
- av1/common/reconinter.c: diffwtd_mask, compute_pred_using_interp_grad
- av1/common/x86/cfl_ssse3.c: _mm_loadh_epi32
- av1/common/x86/optflow_refine_sse4.c: LoadLo8
- av1/encoder/x86/pickrst_avx2.c: acc_stat_avx2,
  acc_stat_win7_one_line_avx2, compute_stats_win7_opt_avx2,
  calc_proj_params_r0_r1_avx2
- av1/encoder/x86/pickrst_sse4.c: acc_stat_sse41,
  acc_stat_win7_one_line_sse4_1, compute_stats_win7_opt_sse4_1
- test/variance_test.cc: kMaskMax

Also update the clang job in .gitlab-ci.yml to pass the compiler
selection through EXTRA_CMAKE_FLAGS instead of CMAKE_FLAGS, no longer
setting -DENABLE_CCACHE=1 there.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 14c5a9a..59cde14 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -418,10 +418,8 @@
           - debug
           - release
   variables:
-    CMAKE_FLAGS: >-
-      -DENABLE_CCACHE=1
-      -DCMAKE_C_COMPILER=clang
-      -DCMAKE_CXX_COMPILER=clang++
+    EXTRA_CMAKE_FLAGS: >-
+      -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
 
 Example Build (x86_64-gcc):
   stage: build
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index ef68c63..9463048 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -21,19 +21,6 @@
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/mem.h"
 
-static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
-  int sum = 0;
-  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
-  return sum;
-}
-
-static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
-                                      const int16_t *b) {
-  int sum = 0;
-  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
-  return sum;
-}
-
 static const InterpKernel *get_filter_base(const int16_t *filter) {
   // NOTE: This assumes that the filter table is 256-byte aligned.
   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index adaf428..cc6f67d 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -16,56 +16,6 @@
 #include "aom_dsp/x86/intrapred_x86.h"
 #include "aom_dsp/x86/lpf_common_sse2.h"
 
-static INLINE __m256i dc_sum_64(const uint8_t *ref) {
-  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
-  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i y0 = _mm256_sad_epu8(x0, zero);
-  __m256i y1 = _mm256_sad_epu8(x1, zero);
-  y0 = _mm256_add_epi64(y0, y1);
-  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
-  y0 = _mm256_add_epi64(u0, y0);
-  u0 = _mm256_unpackhi_epi64(y0, y0);
-  return _mm256_add_epi16(y0, u0);
-}
-
-static INLINE __m256i dc_sum_32(const uint8_t *ref) {
-  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i y = _mm256_sad_epu8(x, zero);
-  __m256i u = _mm256_permute2x128_si256(y, y, 1);
-  y = _mm256_add_epi64(u, y);
-  u = _mm256_unpackhi_epi64(y, y);
-  return _mm256_add_epi16(y, u);
-}
-
-static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
-                                  ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, *r);
-    dst += stride;
-  }
-}
-
-static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
-                                    int height, uint8_t *dst,
-                                    ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, *r0);
-    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
-    dst += stride;
-  }
-}
-
-static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
-                                  ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, *r);
-    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
-    dst += stride;
-  }
-}
-
 static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 06cf0eb..e1a3b3e 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -607,12 +607,6 @@
 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
 #define WIENER_MAX_EXT_SIZE 263
 
-static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
-  int sum = 0;
-  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
-  return sum;
-}
-
 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
                                              const int16_t *b) {
   int sum = 0;
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index ee7152c..5a8f14e 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -352,21 +352,6 @@
   }
 }
 
-static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse,
-                                    int mask_base, const uint8_t *src0,
-                                    int src0_stride, const uint8_t *src1,
-                                    int src1_stride, int h, int w) {
-  int i, j, m, diff;
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      diff =
-          abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
-      m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
-      mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
-    }
-  }
-}
-
 static AOM_FORCE_INLINE void diffwtd_mask_highbd(
     uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0,
     int src0_stride, const uint16_t *src1, int src1_stride, int h, int w,
@@ -1066,25 +1051,6 @@
 }
 
 #if OPFL_COMBINE_INTERP_GRAD_LS
-static AOM_FORCE_INLINE void compute_pred_using_interp_grad(
-    const uint8_t *src1, const uint8_t *src2, int16_t *dst1, int16_t *dst2,
-    int bw, int bh, int d0, int d1) {
-  for (int i = 0; i < bh; ++i) {
-    for (int j = 0; j < bw; ++j) {
-      // To avoid overflow, we clamp d0*P0-d1*P1 and P0-P1. Since d0 and d1 are
-      // at most 5 bits, this clamping is only required in highbd, but it is
-      // also added here for consistency.
-      int32_t tmp_dst =
-          d0 * (int32_t)src1[i * bw + j] - d1 * (int32_t)src2[i * bw + j];
-      dst1[i * bw + j] = clamp(tmp_dst, INT16_MIN, INT16_MAX);
-      tmp_dst = d0 * ((int32_t)src1[i * bw + j] - (int32_t)src2[i * bw + j]);
-      dst2[i * bw + j] = clamp(tmp_dst, INT16_MIN, INT16_MAX);
-    }
-  }
-}
-#endif  // OPFL_COMBINE_INTERP_GRAD_LS
-
-#if OPFL_COMBINE_INTERP_GRAD_LS
 static AOM_FORCE_INLINE void compute_pred_using_interp_grad_highbd(
     const uint16_t *src1, const uint16_t *src2, int16_t *dst1, int16_t *dst2,
     int bw, int bh, int d0, int d1) {
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index ec7152d..64a8390 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -18,11 +18,6 @@
 
 #include "av1/common/x86/cfl_simd.h"
 
-// Load 32-bit integer from memory into the first element of dst.
-static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
-  return _mm_cvtsi32_si128(*((int *)mem_addr));
-}
-
 // Store 32-bit integer from the first element of a into memory.
 static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
   *((int *)mem_addr) = _mm_cvtsi128_si32(a);
diff --git a/av1/common/x86/optflow_refine_sse4.c b/av1/common/x86/optflow_refine_sse4.c
index 6a31b8f..5ebae7d 100644
--- a/av1/common/x86/optflow_refine_sse4.c
+++ b/av1/common/x86/optflow_refine_sse4.c
@@ -308,10 +308,6 @@
 #endif  // OPFL_BICUBIC_GRAD
 }
 
-static INLINE __m128i LoadLo8(const void *a) {
-  return _mm_loadl_epi64((const __m128i *)a);
-}
-
 static INLINE __m128i LoadAligned16(const void *a) {
   return _mm_load_si128((const __m128i *)a);
 }
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index 24caca9..161a5ff 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -20,148 +20,6 @@
 #include "av1/common/restoration.h"
 #include "av1/encoder/pickrst.h"
 
-static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src,
-                                 const __m128i *shuffle, const __m256i *kl) {
-  const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
-  const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
-  const __m256i dst0 = yy_load_256(dst);
-  const __m256i r0 = _mm256_add_epi32(dst0, d0);
-  yy_store_256(dst, r0);
-}
-
-static INLINE void acc_stat_win7_one_line_avx2(
-    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
-    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
-    int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
-    int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
-  int j, k, l;
-  const int wiener_win = WIENER_WIN;
-  // Main loop handles two pixels at a time
-  // We can assume that h_start is even, since it will always be aligned to
-  // a tile edge + some number of restoration units, and both of those will
-  // be 64-pixel aligned.
-  // However, at the edge of the image, h_end may be odd, so we need to handle
-  // that case correctly.
-  assert(h_start % 2 == 0);
-  const int h_end_even = h_end & ~1;
-  const int has_odd_pixel = h_end & 1;
-  for (j = h_start; j < h_end_even; j += 2) {
-    const uint8_t X1 = src[j];
-    const uint8_t X2 = src[j + 1];
-    *sumX += X1 + X2;
-    const uint8_t *dgd_ij = dgd + j;
-    for (k = 0; k < wiener_win; k++) {
-      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
-      for (l = 0; l < wiener_win; l++) {
-        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
-        const uint8_t D1 = dgd_ijk[l];
-        const uint8_t D2 = dgd_ijk[l + 1];
-        sumY[k][l] += D1 + D2;
-        M_int[k][l] += D1 * X1 + D2 * X2;
-
-        const __m256i kl =
-            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
-        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
-      }
-    }
-  }
-  // If the width is odd, add in the final pixel
-  if (has_odd_pixel) {
-    const uint8_t X1 = src[j];
-    *sumX += X1;
-    const uint8_t *dgd_ij = dgd + j;
-    for (k = 0; k < wiener_win; k++) {
-      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
-      for (l = 0; l < wiener_win; l++) {
-        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
-        const uint8_t D1 = dgd_ijk[l];
-        sumY[k][l] += D1;
-        M_int[k][l] += D1 * X1;
-
-        // The `acc_stat_avx2` function wants its input to have interleaved
-        // copies of two pixels, but we only have one. However, the pixels
-        // are (effectively) used as inputs to a multiply-accumulate.
-        // So if we set the extra pixel slot to 0, then it is effectively
-        // ignored.
-        const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
-        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
-      }
-    }
-  }
-}
-
-static INLINE void compute_stats_win7_opt_avx2(
-    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
-    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
-  int i, j, k, l, m, n;
-  const int wiener_win = WIENER_WIN;
-  const int pixel_count = (h_end - h_start) * (v_end - v_start);
-  const int wiener_win2 = wiener_win * wiener_win;
-  const int wiener_halfwin = (wiener_win >> 1);
-  uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
-
-  int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
-  int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
-
-  DECLARE_ALIGNED(32, int32_t,
-                  H_int32[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
-  int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
-  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
-  int32_t sumX = 0;
-  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
-
-  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
-  for (j = v_start; j < v_end; j += 64) {
-    const int vert_end = AOMMIN(64, v_end - j) + j;
-    for (i = j; i < vert_end; i++) {
-      acc_stat_win7_one_line_avx2(
-          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
-          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
-    }
-    for (k = 0; k < wiener_win; ++k) {
-      for (l = 0; l < wiener_win; ++l) {
-        M_int64[k][l] += M_int32[k][l];
-        M_int32[k][l] = 0;
-      }
-    }
-    for (k = 0; k < WIENER_WIN2; ++k) {
-      for (l = 0; l < WIENER_WIN * 8; ++l) {
-        H_int64[k][l] += H_int32[k][l];
-        H_int32[k][l] = 0;
-      }
-    }
-  }
-
-  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
-  for (k = 0; k < wiener_win; k++) {
-    for (l = 0; l < wiener_win; l++) {
-      const int32_t idx0 = l * wiener_win + k;
-      M[idx0] =
-          M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
-      int64_t *H_ = H + idx0 * wiener_win2;
-      int64_t *H_int_ = &H_int64[idx0][0];
-      for (m = 0; m < wiener_win; m++) {
-        for (n = 0; n < wiener_win; n++) {
-          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
-                                   (int64_t)avg * (sumY[k][l] + sumY[n][m]);
-        }
-      }
-    }
-  }
-}
-
 static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd,
                                         const __m256i *shuffle,
                                         const __m256i *dgd_ijkl) {
@@ -505,99 +363,6 @@
   }
 }
 
-// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of
-// C and H need to be computed.
-static AOM_INLINE void calc_proj_params_r0_r1_avx2(
-    const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
-    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
-  const int size = width * height;
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-  __m256i h00, h01, h11, c0, c1;
-  const __m256i zero = _mm256_setzero_si256();
-  h01 = h11 = c0 = c1 = h00 = zero;
-
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; j += 8) {
-      const __m256i u_load = _mm256_cvtepu8_epi32(
-          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
-      const __m256i s_load = _mm256_cvtepu8_epi32(
-          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
-      __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
-      __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
-      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
-      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
-      s = _mm256_sub_epi32(s, d);
-      f1 = _mm256_sub_epi32(f1, d);
-      f2 = _mm256_sub_epi32(f2, d);
-
-      const __m256i h00_even = _mm256_mul_epi32(f1, f1);
-      const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
-                                               _mm256_srli_epi64(f1, 32));
-      h00 = _mm256_add_epi64(h00, h00_even);
-      h00 = _mm256_add_epi64(h00, h00_odd);
-
-      const __m256i h01_even = _mm256_mul_epi32(f1, f2);
-      const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
-                                               _mm256_srli_epi64(f2, 32));
-      h01 = _mm256_add_epi64(h01, h01_even);
-      h01 = _mm256_add_epi64(h01, h01_odd);
-
-      const __m256i h11_even = _mm256_mul_epi32(f2, f2);
-      const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
-                                               _mm256_srli_epi64(f2, 32));
-      h11 = _mm256_add_epi64(h11, h11_even);
-      h11 = _mm256_add_epi64(h11, h11_odd);
-
-      const __m256i c0_even = _mm256_mul_epi32(f1, s);
-      const __m256i c0_odd =
-          _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
-      c0 = _mm256_add_epi64(c0, c0_even);
-      c0 = _mm256_add_epi64(c0, c0_odd);
-
-      const __m256i c1_even = _mm256_mul_epi32(f2, s);
-      const __m256i c1_odd =
-          _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
-      c1 = _mm256_add_epi64(c1, c1_even);
-      c1 = _mm256_add_epi64(c1, c1_odd);
-    }
-  }
-
-  __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
-  const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
-  c_low = _mm256_add_epi64(c_low, c_high);
-  const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
-                                         _mm256_castsi256_si128(c_low));
-
-  __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
-  const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
-  h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
-  const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
-                                           _mm256_castsi256_si128(h0x_low));
-
-  // Using the symmetric properties of H,  calculations of H[1][0] are not
-  // needed.
-  __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
-  const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
-  h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
-  const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
-                                           _mm256_castsi256_si128(h1x_low));
-
-  xx_storeu_128(C, c_128bit);
-  xx_storeu_128(H[0], h0x_128bit);
-  xx_storeu_128(H[1], h1x_128bit);
-
-  H[0][0] /= size;
-  H[0][1] /= size;
-  H[1][1] /= size;
-
-  // Since H is a symmetric matrix
-  H[1][0] = H[0][1];
-  C[0] /= size;
-  C[1] /= size;
-}
-
 int64_t av1_highbd_pixel_proj_error_avx2(
     const uint8_t *src8, int width, int height, int src_stride,
     const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c
index f97ad56..5b454b7 100644
--- a/av1/encoder/x86/pickrst_sse4.c
+++ b/av1/encoder/x86/pickrst_sse4.c
@@ -18,152 +18,6 @@
 #include "av1/common/restoration.h"
 #include "av1/encoder/pickrst.h"
 
-static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
-                                  const __m128i *shuffle, const __m128i *kl) {
-  const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
-  const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
-  const __m128i d1 =
-      _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
-  const __m128i dst0 = xx_loadu_128(dst);
-  const __m128i dst1 = xx_loadu_128(dst + 4);
-  const __m128i r0 = _mm_add_epi32(dst0, d0);
-  const __m128i r1 = _mm_add_epi32(dst1, d1);
-  xx_storeu_128(dst, r0);
-  xx_storeu_128(dst + 4, r1);
-}
-
-static INLINE void acc_stat_win7_one_line_sse4_1(
-    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
-    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
-    int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
-    int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
-  const int wiener_win = 7;
-  int j, k, l;
-  // Main loop handles two pixels at a time
-  // We can assume that h_start is even, since it will always be aligned to
-  // a tile edge + some number of restoration units, and both of those will
-  // be 64-pixel aligned.
-  // However, at the edge of the image, h_end may be odd, so we need to handle
-  // that case correctly.
-  assert(h_start % 2 == 0);
-  const int h_end_even = h_end & ~1;
-  const int has_odd_pixel = h_end & 1;
-  for (j = h_start; j < h_end_even; j += 2) {
-    const uint8_t *dgd_ij = dgd + j;
-    const uint8_t X1 = src[j];
-    const uint8_t X2 = src[j + 1];
-    *sumX += X1 + X2;
-    for (k = 0; k < wiener_win; k++) {
-      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
-      for (l = 0; l < wiener_win; l++) {
-        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
-        const uint8_t D1 = dgd_ijk[l];
-        const uint8_t D2 = dgd_ijk[l + 1];
-        sumY[k][l] += D1 + D2;
-        M_int[k][l] += D1 * X1 + D2 * X2;
-
-        const __m128i kl =
-            _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
-        acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
-      }
-    }
-  }
-  // If the width is odd, add in the final pixel
-  if (has_odd_pixel) {
-    const uint8_t *dgd_ij = dgd + j;
-    const uint8_t X1 = src[j];
-    *sumX += X1;
-    for (k = 0; k < wiener_win; k++) {
-      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
-      for (l = 0; l < wiener_win; l++) {
-        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
-        const uint8_t D1 = dgd_ijk[l];
-        sumY[k][l] += D1;
-        M_int[k][l] += D1 * X1;
-
-        // The `acc_stat_sse41` function wants its input to have interleaved
-        // copies of two pixels, but we only have one. However, the pixels
-        // are (effectively) used as inputs to a multiply-accumulate.
-        // So if we set the extra pixel slot to 0, then it is effectively
-        // ignored.
-        const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
-        acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
-      }
-    }
-  }
-}
-
-static INLINE void compute_stats_win7_opt_sse4_1(
-    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
-    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
-  int i, j, k, l, m, n;
-  const int wiener_win = WIENER_WIN;
-  const int pixel_count = (h_end - h_start) * (v_end - v_start);
-  const int wiener_win2 = wiener_win * wiener_win;
-  const int wiener_halfwin = (wiener_win >> 1);
-  const uint8_t avg =
-      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
-
-  int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
-  int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
-  int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
-  int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
-  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
-  int32_t sumX = 0;
-  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
-
-  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
-  for (j = v_start; j < v_end; j += 64) {
-    const int vert_end = AOMMIN(64, v_end - j) + j;
-    for (i = j; i < vert_end; i++) {
-      acc_stat_win7_one_line_sse4_1(
-          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
-          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
-    }
-    for (k = 0; k < wiener_win; ++k) {
-      for (l = 0; l < wiener_win; ++l) {
-        M_int64[k][l] += M_int32[k][l];
-        M_int32[k][l] = 0;
-      }
-    }
-    for (k = 0; k < WIENER_WIN2; ++k) {
-      for (l = 0; l < WIENER_WIN * 8; ++l) {
-        H_int64[k][l] += H_int32[k][l];
-        H_int32[k][l] = 0;
-      }
-    }
-  }
-
-  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
-  for (k = 0; k < wiener_win; k++) {
-    for (l = 0; l < wiener_win; l++) {
-      const int32_t idx0 = l * wiener_win + k;
-      M[idx0] =
-          M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
-      int64_t *H_ = H + idx0 * wiener_win2;
-      int64_t *H_int_ = &H_int64[idx0][0];
-      for (m = 0; m < wiener_win; m++) {
-        for (n = 0; n < wiener_win; n++) {
-          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
-                                   (int64_t)avg * (sumY[k][l] + sumY[n][m]);
-        }
-      }
-    }
-  }
-}
-
 static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd,
                                          const __m128i *shuffle,
                                          const __m128i *dgd_ijkl) {
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 785371a..1fa219d 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -769,8 +769,6 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 
-static const int kMaskMax = 64;
-
 typedef TestParams<ObmcSubpelVarFunc> ObmcSubpelVarianceParams;
 
 template <typename FunctionType>